File Coverage

lib/Parse/File/Taxonomy/Path.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             package Parse::File::Taxonomy::Path;
2 5     5   8787 use strict;
  5         8  
  5         177  
3 5     5   1755 use parent qw( Parse::File::Taxonomy );
  5         1022  
  5         20  
4 5     5   232 use Carp;
  5         6  
  5         243  
5 5     5   19 use Text::CSV;
  5         6  
  5         22  
6 5     5   87 use Scalar::Util qw( reftype );
  5         32  
  5         242  
7             our $VERSION = '0.03';
8 5         264 use Parse::File::Taxonomy::Auxiliary qw(
9             path_check_fields
10             components_check_fields
11 5     5   1147 );
  5         14  
12 5     5   4560 use Data::Dump;
  0            
  0            
13              
14             =head1 NAME
15              
16             Parse::File::Taxonomy::Path - Validate a file for use as a path-based taxonomy
17              
18             =head1 SYNOPSIS
19              
20             use Parse::File::Taxonomy::Path;
21              
22             # 'file' interface: reads a CSV file for you
23              
24             $source = "./t/data/alpha.csv";
25             $obj = Parse::File::Taxonomy::Path->new( {
26             file => $source,
27             } );
28              
29             # 'components' interface: as if you've already read a
30             # CSV file and now have Perl array references to header and data rows
31              
32             $obj = Parse::File::Taxonomy::Path->new( {
33             components => {
34             fields => $fields,
35             data_records => $data_records,
36             }
37             } );
38              
39             =head1 METHODS
40              
41             =head2 C
42              
43             =over 4
44              
45             =item * Purpose
46              
47             Parse::File::Taxonomy::Path constructor.
48              
49             =item * Arguments
50              
51             Single hash reference. There are two possible interfaces: C and C.
52              
53             =over 4
54              
55             =item 1 C interface
56              
57             $source = "./t/data/alpha.csv";
58             $obj = Parse::File::Taxonomy::Path->new( {
59             file => $source,
60             path_col_idx => 0,
61             path_col_sep => '|',
62             %TextCSVoptions,
63             } );
64              
65             Elements in the hash reference are keyed on:
66              
67             =over 4
68              
69             =item * C
70              
71             Absolute or relative path to the incoming taxonomy file.
72             B for this interface.
73              
74             =item * C
75              
76             If the column to be used as the "path" column in the incoming taxonomy file is
77             B the first column, this option must be set to the integer representing
78             the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
79              
80             =item * C
81              
82             If the string used to distinguish components of the path in the path column in
83             the incoming taxonomy file is not a pipe (C<|>), this option must be set.
84             Optional; defaults to C<|>.
85              
86             =item * Text::CSV options
87              
88             Any other options which could normally be passed to Cnew()> will
89             be passed through to that module's constructor. On the recommendation of the
90             Text::CSV documentation, C is always set to a true value.
91              
92             =back
93              
94             =item 2 C interface
95              
96             $obj = Parse::File::Taxonomy::Path->new( {
97             components => {
98             fields => $fields,
99             data_records => $data_records,
100             }
101             } );
102              
103             Elements in this hash are keyed on:
104              
105             =over 4
106              
107             =item * C
108              
109             This element is B for the
110             C interface. The value of this element is a hash reference with two keys, C and
111             C. C is a reference to an array holding the field or
112             column names for the data set. C is a reference to an array of
113             array references, each of the latter arrayrefs holding one record or row from
114             the data set.
115              
116             =item * C
117              
118             Same as in C interface above.
119              
120             =item * C
121              
122             Same as in C interface above.
123              
124             =back
125              
126             =back
127              
128             =item * Return Value
129              
130             Parse::File::Taxonomy::Path object.
131              
132             =item * Comment
133              
134             C will throw an exception under any of the following conditions:
135              
136             =over 4
137              
138             =item * Argument to C is not a reference.
139              
140             =item * Argument to C is not a hash reference.
141              
142             =item * In the C interface, unable to locate the file which is the value of the C element.
143              
144             =item * Argument to C element is not an integer.
145              
146             =item * Argument to C is greater than the index number of the
147             last element in the header row of the incoming taxonomy file, I the
148             C is wrong.
149              
150             =item * The same field is found more than once in the header row of the
151             incoming taxonomy file.
152              
153             =item * Unable to open or close the incoming taxonomy file for reading.
154              
155             =item * In the column designated as the "path" column, the same value is
156             observed more than once.
157              
158             =item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
159              
160             =item * A data row has a number of fields different from the number of fields
161             in the header row.
162              
163             =back
164              
165             =back
166              
167             =cut
168              
169             sub new {
170             my ($class, $args) = @_;
171             my $data;
172              
173             croak "Argument to 'new()' must be hashref"
174             unless (ref($args) and reftype($args) eq 'HASH');
175             croak "Argument to 'new()' must have either 'file' or 'components' element"
176             unless ($args->{file} or $args->{components});
177             croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
178             if ($args->{file} and $args->{components});
179              
180             if (exists $args->{path_col_idx}) {
181             croak "Argument to 'path_col_idx' must be integer"
182             unless $args->{path_col_idx} =~ m/^\d+$/;
183             }
184             $data->{path_col_idx} = delete $args->{path_col_idx} || 0;
185             $data->{path_col_sep} = exists $args->{path_col_sep}
186             ? $args->{path_col_sep}
187             : '|';
188             if (exists $args->{path_col_sep}) {
189             $data->{path_col_sep} = $args->{path_col_sep};
190             delete $args->{path_col_sep};
191             }
192             else {
193             $data->{path_col_sep} = '|';
194             }
195              
196             if ($args->{components}) {
197             croak "Value of 'components' element must be hashref"
198             unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
199             for my $k ( qw| fields data_records | ) {
200             croak "Value of 'components' element must have '$k' key-value pair"
201             unless exists $args->{components}->{$k};
202             croak "Value of '$k' element must be arrayref"
203             unless (ref($args->{components}->{$k}) and
204             reftype($args->{components}->{$k}) eq 'ARRAY');
205             }
206             for my $row (@{$args->{components}->{data_records}}) {
207             croak "Each element in 'data_records' array must be arrayref"
208             unless (ref($row) and reftype($row) eq 'ARRAY');
209             }
210             # We don't want to stick $args->{components} into the object as is.
211             # Rather, we want to insert 'fields' and 'data_records' for
212             # consistency with the 'file' interface. But to do that we first need
213             # to impose the same validations that we do for the 'file' interface.
214             # We also need to populate 'path_col'.
215             _prepare_fields($data, $args->{components}->{fields}, 1);
216             my $these_data_records = $args->{components}->{data_records};
217             delete $args->{components};
218             _prepare_data_records($data, $these_data_records, $args);
219             }
220             else {
221             croak "Cannot locate file '$args->{file}'"
222             unless (-f $args->{file});
223             $data->{file} = delete $args->{file};
224              
225             # We've now handled all the Parse::File::Taxonomy::Path-specific options.
226             # Any remaining options are assumed to be intended for Text::CSV::new().
227              
228             $args->{binary} = 1;
229             my $csv = Text::CSV->new ( $args )
230             or croak "Cannot use CSV: ".Text::CSV->error_diag ();
231             open my $IN, "<", $data->{file}
232             or croak "Unable to open '$data->{file}' for reading";
233             my $header_ref = $csv->getline($IN);
234              
235             _prepare_fields($data, $header_ref);
236             my $data_records = $csv->getline_all($IN);
237             close $IN or croak "Unable to close after reading";
238             _prepare_data_records($data, $data_records, $args);
239             }
240              
241             while (my ($k,$v) = each %{$args}) {
242             $data->{$k} = $v;
243             }
244             return bless $data, $class;
245             }
246              
247             sub _prepare_fields {
248             my ($data, $fields_ref, $components) = @_;
249             if (! $components) {
250             _check_path_col_idx($data, $fields_ref, 0);
251             path_check_fields($data, $fields_ref);
252             }
253             else {
254             _check_path_col_idx($data, $fields_ref, 1);
255             components_check_fields($data, $fields_ref);
256             }
257             $data->{fields} = $fields_ref;
258             $data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
259             return $data;
260             }
261              
262             sub _check_path_col_idx {
263             my ($data, $fields_ref, $components) = @_;
264             my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
265             $error_msg .= $components
266             ? "'fields' array ref"
267             : "header row in '$data->{file}'";
268              
269             croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
270             }
271              
272             sub _prepare_data_records {
273             # Confirm no duplicate entries in column holding path:
274             # Confirm all rows have same number of columns as header:
275             my ($data, $data_records, $args) = @_;
276             my @bad_count_records = ();
277             my %paths_seen = ();
278             my $field_count = scalar(@{$data->{fields}});
279             for my $rec (@{$data_records}) {
280             $paths_seen{$rec->[$data->{path_col_idx}]}++;
281             my $this_row_count = scalar(@{$rec});
282             if ($this_row_count != $field_count) {
283             push @bad_count_records,
284             [ $rec->[$data->{path_col_idx}], $this_row_count ];
285             }
286             }
287             my @dupe_paths = ();
288             for my $path (sort keys %paths_seen) {
289             push @dupe_paths, $path if $paths_seen{$path} > 1;
290             }
291             my $error_msg = <
292             No duplicate entries are permitted in column designated as path.
293             The following entries appear the number of times shown:
294             ERROR_MSG_DUPE
295             for my $path (@dupe_paths) {
296             $error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
297             }
298             croak $error_msg if @dupe_paths;
299              
300             $error_msg = <
301             Header row has $field_count records. The following records had different counts:
302             ERROR_MSG_WRONG_COUNT
303             for my $rec (@bad_count_records) {
304             $error_msg .= " $rec->[0]: $rec->[1]\n";
305             }
306             croak $error_msg if @bad_count_records;
307              
308             # Confirm each node appears in taxonomy:
309             my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
310             $path_args->{sep_char} = $data->{path_col_sep};
311             my $path_csv = Text::CSV->new ( $path_args )
312             or croak "Cannot use CSV: ".Text::CSV->error_diag ();
313             my %missing_parents = ();
314             for my $path (sort keys %paths_seen) {
315             my $status = $path_csv->parse($path);
316             my @columns = $path_csv->fields();
317             if (@columns > 2) {
318             my $parent =
319             join($path_args->{sep_char} => @columns[0 .. ($#columns - 1)]);
320             unless (exists $paths_seen{$parent}) {
321             $missing_parents{$path} = $parent;
322             }
323             }
324             }
325             $error_msg = <
326             Each node in the taxonomy must have a parent.
327             The following nodes lack the expected parent:
328             ERROR_MSG_ORPHAN
329             for my $path (sort keys %missing_parents) {
330             $error_msg .= " $path: $missing_parents{$path}\n";
331             }
332             croak $error_msg if scalar(keys %missing_parents);
333             # BBB end of validations
334             $data->{data_records} = $data_records;
335              
336             return $data;
337             }
338              
339             =head2 C
340              
341             =over 4
342              
343             =item * Purpose
344              
345             Identify the names of the columns in the taxonomy.
346              
347             =item * Arguments
348              
349             my $fields = $self->fields();
350              
351             No arguments; the information is already inside the object.
352              
353             =item * Return Value
354              
355             Reference to an array holding a list of the columns as they appear in the
356             header row of the incoming taxonomy file.
357              
358             =item * Comment
359              
360             Read-only.
361              
362             =back
363              
364             # Implemented in lib/Parse/File/Taxonomy.pm
365              
366             =head2 C
367              
368             =over 4
369              
370             =item * Purpose
371              
372             Identify the index position (count starts at 0) of the column in the incoming
373             taxonomy file which serves as the path column.
374              
375             =item * Arguments
376              
377             my $path_col_idx = $self->path_col_idx;
378              
379             No arguments; the information is already inside the object.
380              
381             =item * Return Value
382              
383             Integer in the range from 0 to 1 less than the number of columns in the header
384             row.
385              
386             =item * Comment
387              
388             Read-only.
389              
390             =back
391              
392             =cut
393              
394             sub path_col_idx {
395             my $self = shift;
396             return $self->{path_col_idx};
397             }
398              
399             =head2 C
400              
401             =over 4
402              
403             =item * Purpose
404              
405             Identify the name of the column in the incoming taxonomy which serves as the
406             path column.
407              
408             =item * Arguments
409              
410             my $path_col = $self->path_col;
411              
412             No arguments; the information is already inside the object.
413              
414             =item * Return Value
415              
416             String.
417              
418             =item * Comment
419              
420             Read-only.
421              
422             =back
423              
424             =cut
425              
426             sub path_col {
427             my $self = shift;
428             return $self->{path_col};
429             }
430              
431             =head2 C
432              
433             =over 4
434              
435             =item * Purpose
436              
437             Identify the string used to separate path components once the taxonomy has
438             been created. This is just a "getter" and is logically distinct from the
439             option to C which is, in effect, a "setter."
440              
441             =item * Arguments
442              
443             my $path_col_sep = $self->path_col_sep;
444              
445             No arguments; the information is already inside the object.
446              
447             =item * Return Value
448              
449             String.
450              
451             =item * Comment
452              
453             Read-only.
454              
455             =back
456              
457             =cut
458              
459             sub path_col_sep {
460             my $self = shift;
461             return $self->{path_col_sep};
462             }
463              
464             =head2 C
465              
466             =over 4
467              
468             =item * Purpose
469              
470             Once the taxonomy has been validated, get a list of its data rows as a Perl
471             data structure.
472              
473             =item * Arguments
474              
475             $data_records = $self->data_records;
476              
477             None.
478              
479             =item * Return Value
480              
481             Reference to array of array references. The array will hold the data records
482             found in the incoming taxonomy file in their order in that file.
483              
484             =item * Comment
485              
486             Does not contain any information about the fields in the taxonomy, so you
487             should probably either (a) use in conjunction with C method above;
488             or (b) use C.
489              
490             =back
491              
492             # Implemented in lib/Parse/File/Taxonomy.pm
493              
494             =head2 C
495              
496             =over 4
497              
498             =item * Purpose
499              
500             Once the taxonomy has been validated, get a list of its header and data rows as a Perl
501             data structure.
502              
503             =item * Arguments
504              
505             $data_records = $self->fields_and_data_records;
506              
507             None.
508              
509             =item * Return Value
510              
511             Reference to array of array references. The first element in the array will
512             hold the header row (same as output of C). The remaining elements
513             will hold the data records found in the incoming taxonomy file in their order
514             in that file.
515              
516             =back
517              
518             =cut
519              
520             # Implemented in lib/Parse/File/Taxonomy.pm
521              
522             =head2 C
523              
524             =over 4
525              
526             =item * Purpose
527              
528             Once the taxonomy has been validated, get a list of its data rows as a Perl
529             data structure. In each element of this list, the path is now represented as
530             an array reference rather than a string.
531              
532             =item * Arguments
533              
534             $data_records_path_components = $self->data_records_path_components;
535              
536             None.
537              
538             =item * Return Value
539              
540             Reference to array of array references. The array will hold the data records
541             found in the incoming taxonomy file in their order in that file.
542              
543             =item * Comment
544              
545             Does not contain any information about the fields in the taxonomy, so you may
546             wish to use this method either (a) use in conjunction with C method
547             above; or (b) use C.
548              
549             =back
550              
551             =cut
552              
553             sub data_records_path_components {
554             my $self = shift;
555             my @all_rows = ();
556             for my $row (@{$self->{data_records}}) {
557             my $path_col = $row->[$self->{path_col_idx}];
558             my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
559             my @rewritten = ();
560             for (my $i = 0; $i <= $#{$row}; $i++) {
561             if ($i != $self->{path_col_idx}) {
562             push @rewritten, $row->[$i];
563             }
564             else {
565             push @rewritten, \@path_components;
566             }
567             }
568             push @all_rows, \@rewritten;
569             }
570             return \@all_rows;
571             }
572              
573             =head2 C
574              
575             =over 4
576              
577             =item * Purpose
578              
579             Once the taxonomy has been validated, get a list of its data rows as a Perl
580             data structure. The first element in this list is an array reference holding
581             the header row. In each data element of this list, the path is now represented as
582             an array reference rather than a string.
583              
584             =item * Arguments
585              
586             $fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
587              
588             None.
589              
590             =item * Return Value
591              
592             Reference to array of array references. The array will hold the data records
593             found in the incoming taxonomy file in their order in that file.
594              
595             =back
596              
597             =cut
598              
599             sub fields_and_data_records_path_components {
600             my $self = shift;
601             my @all_rows = $self->fields;
602             for my $row (@{$self->{data_records}}) {
603             my $path_col = $row->[$self->{path_col_idx}];
604             my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
605             my @rewritten = ();
606             for (my $i = 0; $i <= $#{$row}; $i++) {
607             if ($i != $self->{path_col_idx}) {
608             push @rewritten, $row->[$i];
609             }
610             else {
611             push @rewritten, \@path_components;
612             }
613             }
614             push @all_rows, \@rewritten;
615             }
616             return \@all_rows;
617             }
618              
619             =head2 C
620              
621             =over 4
622              
623             =item * Purpose
624              
625             Identify the index position of a given field within the header row.
626              
627             =item * Arguments
628              
629             $index = $obj->get_field_position('income');
630              
631             Takes a single string holding the name of one of the fields (column names).
632              
633             =item * Return Value
634              
635             Integer representing the index position (counting from C<0>) of the field
636             provided as argument. Throws exception if the argument is not actually a
637             field.
638              
639             =back
640              
641             =cut
642              
643             # Implemented in lib/Parse/File/Taxonomy.pm
644              
645             =head2 C
646              
647             =over 4
648              
649             =item * Purpose
650              
651             Display the number of descendant (multi-generational) nodes each node in the
652             taxonomy has.
653              
654             =item * Arguments
655              
656             $child_counts = $self->child_counts();
657              
658             None.
659              
660             =item * Return Value
661              
662             Reference to hash in which each element is keyed on the value of the path
663             column in the incoming taxonomy file.
664              
665             =back
666              
667             =cut
668              
669             sub child_counts {
670             my $self = shift;
671             my %child_counts = map { $_->[$self->{path_col_idx}] => 0 } @{$self->{data_records}};
672             for my $node (keys %child_counts) {
673             for my $other_node ( grep { ! m/^\Q$node\E$/ } keys %child_counts) {
674             $child_counts{$node}++
675             if $other_node =~ m/^\Q$node$self->{path_col_sep}\E/;
676             }
677             }
678             return \%child_counts;
679             }
680              
681             =head2 C
682              
683             =over 4
684              
685             =item * Purpose
686              
687             Get the total number of descendant nodes for one specific node in a validated
688             taxonomy.
689              
690             =item * Arguments
691              
692             $child_count = $self->get_child_count('|Path|To|Node');
693              
694             String containing node's path as spelled in the taxonomy.
695              
696             =item * Return Value
697              
698             Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
699             leaf node.
700              
701             =item * Comment
702              
703             Will throw an exception if the node does not exist or is misspelled.
704              
705             =back
706              
707             =cut
708              
709             sub get_child_count {
710             my ($self, $node) = @_;
711             my $child_counts = $self->child_counts();
712             croak "Node '$node' not found" unless exists $child_counts->{$node};
713             return $child_counts->{$node};
714             }
715              
716             =head2 C
717              
718             =over 4
719              
720             =item * Purpose
721              
722             Turn a validated taxonomy into a Perl hash keyed on the column designated as
723             the path column.
724              
725             =item * Arguments
726              
727             $hashref = $self->hashify_taxonomy();
728              
729             Takes an optional hashref holding a list of any of the following elements:
730              
731             =over 4
732              
733             =item * C
734              
735             Boolean, defaulting to C<0>. By default, C will spell the
736             key of the hash exactly as the value of the path column is spelled in the
737             taxonomy -- which in turn is the way it was spelled in the incoming file.
738             That is, a path in the taxonomy spelled C<|Alpha|Beta|Gamma> will be spelled
739             as a key in exactly the same way.
740              
741             However, since in many cases (including the example above) the root node of
742             the taxonomy will be empty, the user may wish to remove the first instance of
743             C. The user would do so by setting
744             C to a true value.
745              
746             $hashref = $self->hashify_taxonomy( {
747             remove_leading_path_col_sep => 1,
748             } );
749              
750             In that case they key would now be spelled: C.
751              
752             Note further that if the C switch is set to a true value, any
753             setting to C will be ignored.
754              
755             =item * C
756              
757             A string which will be used in composing the key of the hashref returned by
758             this method. The user may select this key if she does not want to use the
759             value found in the incoming CSV file (which by default will be the pipe
760             character (C<|>) and which may be overridden with the C argument
761             to C.
762              
763             $hashref = $self->hashify_taxonomy( {
764             key_delim => q{ - },
765             } );
766              
767             In the above variant, a path that in the incoming taxonomy file was
768             represented by C<|Alpha|Beta|Gamma> will in C<$hashref> be represented by
769             C< - Alpha - Beta - Gamma>.
770              
771             =item * C
772              
773             A string which will be used in composing the key of the hashref returned by
774             this method. The user will set this switch if she wishes to have the root
775             note explicitly represented. Using this switch will automatically cause
776             C to be ignored.
777              
778             Suppose the user wished to have C be the text for the root
779             node. Suppose further that the user wanted to use the string C< - > as the
780             delimiter within the key.
781              
782             $hashref = $self->hashify_taxonomy( {
783             root_str => q{All Suppliers},
784             key_delim => q{ - },
785             } );
786              
787             Then incoming path C<|Alpha|Beta|Gamma> would be keyed as:
788              
789             All Suppliers - Alpha - Beta - Gamma
790              
791             =back
792              
793             =item * Return Value
794              
795             Hash reference. The number of elements in this hash should be equal to the
796             number of non-header records in the taxonomy.
797              
798             =back
799              
800             =cut
801              
802             sub hashify_taxonomy {
803             my ($self, $args) = @_;
804             if (defined $args) {
805             croak "Argument to 'new()' must be hashref"
806             unless (ref($args) and reftype($args) eq 'HASH');
807             }
808             my %hashified = ();
809             my $fields = $self->{fields};
810             my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
811             for my $rec (@{$self->{data_records}}) {
812             my $rowkey;
813             if ($args->{root_str}) {
814             $rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
815             }
816             else {
817             if ($args->{remove_leading_path_col_sep}) {
818             ($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
819             }
820             else {
821             $rowkey = $rec->[$self->{path_col_idx}];
822             }
823             }
824             if ($args->{key_delim}) {
825             $rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
826             }
827             my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
828             $hashified{$rowkey} = $rowdata;
829             }
830             return \%hashified;
831             }
832              
833             1;
834              
835             # vim: formatoptions=crqot