File Coverage

lib/Parse/Taxonomy/MaterializedPath.pm
Criterion Covered Total %
statement 326 326 100.0
branch 134 140 95.7
condition 39 40 97.5
subroutine 24 24 100.0
pod 11 11 100.0
total 534 541 98.7


line stmt bran cond sub pod time code
1             package Parse::Taxonomy::MaterializedPath;
2 6     6   13007 use strict;
  6         8  
  6         157  
3 6     6   1845 use parent qw( Parse::Taxonomy );
  6         1127  
  6         22  
4 6     6   185 use Carp;
  6         5  
  6         249  
5 6     6   3550 use Text::CSV_XS;
  6         36044  
  6         263  
6 6     6   29 use Scalar::Util qw( reftype );
  6         8  
  6         239  
7 6     6   19 use List::Util qw( max );
  6         38  
  6         454  
8 6     6   23 use Cwd;
  6         7  
  6         379  
9             our $VERSION = '0.23';
10 6         14563 use Parse::Taxonomy::Auxiliary qw(
11             path_check_fields
12             components_check_fields
13 6     6   1481 );
  6         8  
14              
15             =head1 NAME
16              
17             Parse::Taxonomy::MaterializedPath - Validate a file for use as a path-based taxonomy
18              
19             =head1 SYNOPSIS
20              
21             use Parse::Taxonomy::MaterializedPath;
22              
23             # 'file' interface: reads a CSV file for you
24              
25             $source = "./t/data/alpha.csv";
26             $self = Parse::Taxonomy::MaterializedPath->new( {
27             file => $source,
28             } );
29              
30             # 'components' interface: as if you've already read a
31             # CSV file and now have Perl array references to header and data rows
32              
33             $self = Parse::Taxonomy::MaterializedPath->new( {
34             components => {
35             fields => $fields,
36             data_records => $data_records,
37             }
38             } );
39              
40             =head1 METHODS
41              
42             =head2 C
43              
44             =over 4
45              
46             =item * Purpose
47              
48             Parse::Taxonomy::MaterializedPath constructor.
49              
50             =item * Arguments
51              
52             Single hash reference. There are two possible interfaces: C and C.
53              
54             =over 4
55              
56             =item 1 C interface
57              
58             $source = "./t/data/alpha.csv";
59             $self = Parse::Taxonomy::MaterializedPath->new( {
60             file => $source,
61             path_col_idx => 0,
62             path_col_sep => '|',
63             %TextCSVoptions,
64             } );
65              
66             Elements in the hash reference are keyed on:
67              
68             =over 4
69              
70             =item * C
71              
72             Absolute or relative path to the incoming taxonomy file.
73             B for this interface.
74              
75             =item * C
76              
77             If the column to be used as the "path" column in the incoming taxonomy file is
78             B the first column, this option must be set to the integer representing
79             the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
80              
81             =item * C
82              
83             If the string used to distinguish components of the path in the path column in
84             the incoming taxonomy file is not a pipe (C<|>), this option must be set.
85             Optional; defaults to C<|>.
86              
87             =item * Text::CSV_XS options
88              
89             Any other options which could normally be passed to Cnew()> will
90             be passed through to that module's constructor. On the recommendation of the
91             Text::CSV documentation, C is always set to a true value.
92              
93             =back
94              
95             =item 2 C interface
96              
97             $self = Parse::Taxonomy::MaterializedPath->new( {
98             components => {
99             fields => $fields,
100             data_records => $data_records,
101             }
102             } );
103              
104             Elements in this hash are keyed on:
105              
106             =over 4
107              
108             =item * C
109              
110             This element is B for the
111             C interface. The value of this element is a hash reference with two keys, C and
112             C. C is a reference to an array holding the field or
113             column names for the data set. C is a reference to an array of
114             array references, each of the latter arrayrefs holding one record or row from
115             the data set.
116              
117             =item * C
118              
119             Same as in C interface above.
120              
121             =item * C
122              
123             Same as in C interface above.
124              
125             =back
126              
127             =back
128              
129             =item * Return Value
130              
131             Parse::Taxonomy::MaterializedPath object.
132              
133             =item * Comment
134              
135             C will throw an exception under any of the following conditions:
136              
137             =over 4
138              
139             =item * Argument to C is not a reference.
140              
141             =item * Argument to C is not a hash reference.
142              
143             =item * In the C interface, unable to locate the file which is the value of the C element.
144              
145             =item * Argument to C element is not an integer.
146              
147             =item * Argument to C is greater than the index number of the
148             last element in the header row of the incoming taxonomy file, I the
149             C is wrong.
150              
151             =item * The same field is found more than once in the header row of the
152             incoming taxonomy file.
153              
154             =item * Unable to open or close the incoming taxonomy file for reading.
155              
156             =item * In the column designated as the "path" column, the same value is
157             observed more than once.
158              
159             =item * C, C, C, C and C are reserved terms.
160             One or more columns is named with a reserved term.
161              
162             =item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
163              
164             =item * A data row has a number of fields different from the number of fields
165             in the header row.
166              
167             =back
168              
169             =back
170              
171             =cut
172              
173             sub new {
174 57     57 1 32597 my ($class, $args) = @_;
175 57         58 my $data;
176              
177 57 100 100     629 croak "Argument to 'new()' must be hashref"
178             unless (ref($args) and reftype($args) eq 'HASH');
179 55         50 my $argscount = 0;
180 55 100       110 $argscount++ if $args->{file};
181 55 100       102 $argscount++ if $args->{components};
182 55 100       178 croak "Argument to 'new()' must have either 'file' or 'components' element"
183             if ($argscount == 0);
184 54 100       226 croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
185             if ($argscount == 2);
186              
187 53 100       84 if (exists $args->{path_col_idx}) {
188             croak "Argument to 'path_col_idx' must be integer"
189 4 100       107 unless $args->{path_col_idx} =~ m/^\d+$/;
190             }
191 52   100     206 $data->{path_col_idx} = delete $args->{path_col_idx} || 0;
192             $data->{path_col_sep} = exists $args->{path_col_sep}
193             ? $args->{path_col_sep}
194 52 100       142 : '|';
195 52 100       63 if (exists $args->{path_col_sep}) {
196 5         8 $data->{path_col_sep} = $args->{path_col_sep};
197 5         6 delete $args->{path_col_sep};
198             }
199             else {
200 47         58 $data->{path_col_sep} = '|';
201             }
202              
203 52 100       67 if ($args->{components}) {
204             croak "Value of 'components' element must be hashref"
205 25 100 100     296 unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
206 23         69 for my $k ( qw| fields data_records | ) {
207             croak "Value of 'components' element must have '$k' key-value pair"
208 43 100       253 unless exists $args->{components}->{$k};
209             croak "Value of '$k' element must be arrayref"
210             unless (ref($args->{components}->{$k}) and
211 41 100 100     430 reftype($args->{components}->{$k}) eq 'ARRAY');
212             }
213 18         19 for my $row (@{$args->{components}->{data_records}}) {
  18         69  
214 231 100 100     818 croak "Each element in 'data_records' array must be arrayref"
215             unless (ref($row) and reftype($row) eq 'ARRAY');
216             }
217             # We don't want to stick $args->{components} into the object as is.
218             # Rather, we want to insert 'fields' and 'data_records' for
219             # consistency with the 'file' interface. But to do that we first need
220             # to impose the same validations that we do for the 'file' interface.
221             # We also need to populate 'path_col'.
222 16         33 _prepare_fields($data, $args->{components}->{fields}, 1);
223 13         15 my $these_data_records = $args->{components}->{data_records};
224 13         15 delete $args->{components};
225 13         26 _prepare_data_records($data, $these_data_records, $args);
226             }
227             else {
228             croak "Cannot locate file '$args->{file}'"
229 27 100       500 unless (-f $args->{file});
230 26         41 $data->{file} = delete $args->{file};
231              
232             # We've now handled all the Parse::Taxonomy::MaterializedPath-specific options.
233             # Any remaining options are assumed to be intended for Text::CSV_XS::new().
234              
235 26         32 $args->{binary} = 1;
236 26 50       107 my $csv = Text::CSV_XS->new ( $args )
237             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
238             open my $IN, "<", $data->{file}
239 26 50       2359 or croak "Unable to open '$data->{file}' for reading";
240 4     4   1478 my $header_ref = $csv->getline($IN);
  4         13251  
  4         118  
  26         893  
241              
242 26         874 _prepare_fields($data, $header_ref);
243 23         454 my $data_records = $csv->getline_all($IN);
244 23 50       5459 close $IN or croak "Unable to close after reading";
245 23         52 _prepare_data_records($data, $data_records, $args);
246             }
247              
248 27         28 while (my ($k,$v) = each %{$args}) {
  45         102  
249 18         31 $data->{$k} = $v;
250             }
251 27         31 my %row_analysis = ();
252 27         24 for my $el (@{$data->{data_records}}) {
  27         47  
253 364         268 my $rowkey = $el->[$data->{path_col_idx}];
254 364         633 $row_analysis{$rowkey} = split(/\Q$data->{path_col_sep}\E/, $rowkey);
255             }
256 27         52 $data->{row_analysis} = \%row_analysis;
257 27         98 return bless $data, $class;
258             }
259              
260             sub _prepare_fields {
261 42     42   53 my ($data, $fields_ref, $components) = @_;
262 42 100       71 if (! $components) {
263 26         65 _check_path_col_idx($data, $fields_ref, 0);
264 25         59 path_check_fields($data, $fields_ref);
265             }
266             else {
267 16         26 _check_path_col_idx($data, $fields_ref, 1);
268 15         35 components_check_fields($data, $fields_ref);
269             }
270              
271 38         37 my %fields_seen = map { $_ => 1 } @{$fields_ref};
  203         239  
  38         51  
272 38         69 my @bad_fields = ();
273 38         50 for my $reserved ( qw| id parent_id name lft rgh | ) {
274 190 100       260 push @bad_fields, $reserved if $fields_seen{$reserved};
275             }
276 38         88 my $msg = "Bad column names: <@bad_fields>. These are reserved for ";
277 38         29 $msg .= "Parse::Taxonomy's internal use; please rename";
278 38 100       252 croak $msg if @bad_fields;
279              
280 36         47 $data->{fields} = $fields_ref;
281 36         59 $data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
282 36         52 return $data;
283             }
284              
285             sub _check_path_col_idx {
286 42     42   37 my ($data, $fields_ref, $components) = @_;
287 42         44 my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
288 42 100       90 $error_msg .= $components
289             ? "'fields' array ref"
290             : "header row in '$data->{file}'";
291              
292 42 100       35 croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
  42         313  
293             }
294              
295             sub _prepare_data_records {
296             # Confirm each row's path starts with path_col_sep:
297             # Confirm no duplicate entries in column holding path:
298             # Confirm all rows have same number of columns as header:
299 36     36   40 my ($data, $data_records, $args) = @_;
300 36         33 my $error_msg;
301 36         40 my @bad_path_cols = ();
302 36         26 my @bad_count_records = ();
303 36         43 my %paths_seen = ();
304 36         31 my $field_count = scalar(@{$data->{fields}});
  36         58  
305 36         27 for my $rec (@{$data_records}) {
  36         63  
306 476 100       1276 unless ($rec->[$data->{path_col_idx}] =~ m/^\Q$data->{path_col_sep}\E/) {
307 6         9 push @bad_path_cols, $rec->[$data->{path_col_idx}];
308             }
309 476         550 $paths_seen{$rec->[$data->{path_col_idx}]}++;
310 476         277 my $this_row_count = scalar(@{$rec});
  476         342  
311 476 100       655 if ($this_row_count != $field_count) {
312             push @bad_count_records,
313 5         11 [ $rec->[$data->{path_col_idx}], $this_row_count ];
314             }
315             }
316 36         46 $error_msg = <
317             The value of the column designated as path must start with the path column separator.
318             Rows with the following paths fail to do so:
319             IMPROPER_PATH
320 36         41 for my $path (@bad_path_cols) {
321 6         10 $error_msg .= " $path\n";
322             }
323 36 100       276 croak $error_msg if @bad_path_cols;
324              
325 34         33 my @dupe_paths = ();
326 34         210 for my $path (sort keys %paths_seen) {
327 443 100       509 push @dupe_paths, $path if $paths_seen{$path} > 1;
328             }
329 34         52 $error_msg = <
330             No duplicate entries are permitted in column designated as path.
331             The following entries appear the number of times shown:
332             ERROR_MSG_DUPE
333 34         36 for my $path (@dupe_paths) {
334 5         19 $error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
335             }
336 34 100       346 croak $error_msg if @dupe_paths;
337              
338 31         49 $error_msg = <
339             Header row has $field_count records. The following records had different counts:
340             ERROR_MSG_WRONG_COUNT
341 31         38 for my $rec (@bad_count_records) {
342 4         8 $error_msg .= " $rec->[0]: $rec->[1]\n";
343             }
344 31 100       227 croak $error_msg if @bad_count_records;
345              
346             # Confirm each node appears in taxonomy:
347 29         28 my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
  19         49  
  29         67  
348 29         48 $path_args->{sep} = $data->{path_col_sep};
349 29 50       117 my $path_csv = Text::CSV_XS->new ( $path_args )
350             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
351 29         2173 my %missing_parents = ();
352 29         151 for my $path (sort keys %paths_seen) {
353 386         548 my $status = $path_csv->parse($path);
354 386         3321 my @columns = $path_csv->fields();
355 386 100       1655 if (@columns > 2) {
356             my $parent =
357 281         450 join($path_args->{sep} => @columns[0 .. ($#columns - 1)]);
358 281 100       538 unless (exists $paths_seen{$parent}) {
359 4         7 $missing_parents{$path} = $parent;
360             }
361             }
362             }
363 29         41 $error_msg = <
364             Each node in the taxonomy must have a parent.
365             The following nodes lack the expected parent:
366             ERROR_MSG_ORPHAN
367 29         56 for my $path (sort keys %missing_parents) {
368 4         7 $error_msg .= " $path: $missing_parents{$path}\n";
369             }
370 29 100       238 croak $error_msg if scalar(keys %missing_parents);
371             # BBB end of validations
372 27         37 $data->{data_records} = $data_records;
373              
374 27         244 return $data;
375             }
376              
377             =head2 C
378              
379             =over 4
380              
381             =item * Purpose
382              
383             Identify the names of the columns in the taxonomy.
384              
385             =item * Arguments
386              
387             my $fields = $self->fields();
388              
389             No arguments; the information is already inside the object.
390              
391             =item * Return Value
392              
393             Reference to an array holding a list of the columns as they appear in the
394             header row of the incoming taxonomy file.
395              
396             =item * Comment
397              
398             Read-only.
399              
400             =back
401              
402             =head2 C
403              
404             =over 4
405              
406             =item * Purpose
407              
408             Identify the index position (count starts at 0) of the column in the incoming
409             taxonomy file which serves as the path column.
410              
411             =item * Arguments
412              
413             my $path_col_idx = $self->path_col_idx;
414              
415             No arguments; the information is already inside the object.
416              
417             =item * Return Value
418              
419             Integer in the range from 0 to 1 less than the number of columns in the header
420             row.
421              
422             =item * Comment
423              
424             Read-only.
425              
426             =back
427              
428             =cut
429              
430             sub path_col_idx {
431 15     15 1 2479 my $self = shift;
432 15         20 return $self->{path_col_idx};
433             }
434              
435             =head2 C
436              
437             =over 4
438              
439             =item * Purpose
440              
441             Identify the name of the column in the incoming taxonomy which serves as the
442             path column.
443              
444             =item * Arguments
445              
446             my $path_col = $self->path_col;
447              
448             No arguments; the information is already inside the object.
449              
450             =item * Return Value
451              
452             String.
453              
454             =item * Comment
455              
456             Read-only.
457              
458             =back
459              
460             =cut
461              
462             sub path_col {
463 4     4 1 1355 my $self = shift;
464 4         8 return $self->{path_col};
465             }
466              
467             =head2 C
468              
469             =over 4
470              
471             =item * Purpose
472              
473             Identify the string used to separate path components once the taxonomy has
474             been created. This is just a "getter" and is logically distinct from the
475             option to C which is, in effect, a "setter."
476              
477             =item * Arguments
478              
479             my $path_col_sep = $self->path_col_sep;
480              
481             No arguments; the information is already inside the object.
482              
483             =item * Return Value
484              
485             String.
486              
487             =item * Comment
488              
489             Read-only.
490              
491             =back
492              
493             =cut
494              
495             sub path_col_sep {
496 4     4 1 972 my $self = shift;
497 4         6 return $self->{path_col_sep};
498             }
499              
500             =head2 C
501              
502             =over 4
503              
504             =item * Purpose
505              
506             Once the taxonomy has been validated, get a list of its data rows as a Perl
507             data structure.
508              
509             =item * Arguments
510              
511             $data_records = $self->data_records;
512              
513             None.
514              
515             =item * Return Value
516              
517             Reference to array of array references. The array will hold the data records
518             found in the incoming taxonomy file in their order in that file.
519              
520             =item * Comment
521              
522             Does not contain any information about the fields in the taxonomy, so you
523             should probably either (a) use in conjunction with C method above;
524             or (b) use C.
525              
526             =back
527              
528             =head2 C
529              
530             =over 4
531              
532             =item * Purpose
533              
534             Once the taxonomy has been validated, get a list of its header and data rows as a Perl
535             data structure.
536              
537             =item * Arguments
538              
539             $data_records = $self->fields_and_data_records;
540              
541             None.
542              
543             =item * Return Value
544              
545             Reference to array of array references. The first element in the array will
546             hold the header row (same as output of C). The remaining elements
547             will hold the data records found in the incoming taxonomy file in their order
548             in that file.
549              
550             =back
551              
552             =cut
553              
554             =head2 C
555              
556             =over 4
557              
558             =item * Purpose
559              
560             Once the taxonomy has been validated, get a list of its data rows as a Perl
561             data structure. In each element of this list, the path is now represented as
562             an array reference rather than a string.
563              
564             =item * Arguments
565              
566             $data_records_path_components = $self->data_records_path_components;
567              
568             None.
569              
570             =item * Return Value
571              
572             Reference to array of array references. The array will hold the data records
573             found in the incoming taxonomy file in their order in that file.
574              
575             =item * Comment
576              
577             Does not contain any information about the fields in the taxonomy, so you may
578             wish to use this method either (a) use in conjunction with C method
579             above; or (b) use C.
580              
581             =back
582              
583             =cut
584              
585             sub data_records_path_components {
586 13     13 1 1340 my $self = shift;
587 13         19 my @all_rows = ();
588 13         8 for my $row (@{$self->{data_records}}) {
  13         28  
589 182         144 my $path_col = $row->[$self->{path_col_idx}];
590 182         388 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
591 182         125 my @rewritten = ();
592 182         119 for (my $i = 0; $i <= $#{$row}; $i++) {
  1062         1256  
593 880 100       777 if ($i != $self->{path_col_idx}) {
594 698         652 push @rewritten, $row->[$i];
595             }
596             else {
597 182         221 push @rewritten, \@path_components;
598             }
599             }
600 182         214 push @all_rows, \@rewritten;
601             }
602 13         40 return \@all_rows;
603             }
604              
605             =head2 C
606              
607             =over 4
608              
609             =item * Purpose
610              
611             Once the taxonomy has been validated, get a list of its data rows as a Perl
612             data structure. The first element in this list is an array reference holding
613             the header row. In each data element of this list, the path is now represented as
614             an array reference rather than a string.
615              
616             =item * Arguments
617              
618             $fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
619              
620             None.
621              
622             =item * Return Value
623              
624             Reference to array of array references. The array will hold the data records
625             found in the incoming taxonomy file in their order in that file.
626              
627             =back
628              
629             =cut
630              
631             sub fields_and_data_records_path_components {
632 2     2 1 1220 my $self = shift;
633 2         4 my @all_rows = $self->fields;
634 2         3 for my $row (@{$self->{data_records}}) {
  2         4  
635 26         22 my $path_col = $row->[$self->{path_col_idx}];
636 26         57 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
637 26         20 my @rewritten = ();
638 26         18 for (my $i = 0; $i <= $#{$row}; $i++) {
  182         207  
639 156 100       137 if ($i != $self->{path_col_idx}) {
640 130         137 push @rewritten, $row->[$i];
641             }
642             else {
643 26         31 push @rewritten, \@path_components;
644             }
645             }
646 26         31 push @all_rows, \@rewritten;
647             }
648 2         4 return \@all_rows;
649             }
650              
651             =head2 C
652              
653             =over 4
654              
655             =item * Purpose
656              
657             Identify the index position of a given field within the header row.
658              
659             =item * Arguments
660              
661             $index = $self->get_field_position('income');
662              
663             Takes a single string holding the name of one of the fields (column names).
664              
665             =item * Return Value
666              
667             Integer representing the index position (counting from C<0>) of the field
668             provided as argument. Throws exception if the argument is not actually a
669             field.
670              
671             =back
672              
673             =cut
674              
675             =head2 C
676              
677             =over 4
678              
679             =item * Purpose
680              
681             Display the number of descendant (multi-generational) nodes each node in the
682             taxonomy has.
683              
684             =item * Arguments
685              
686             $descendant_counts = $self->descendant_counts();
687              
688             $descendant_counts = $self->descendant_counts( { generations => 1 } );
689              
690             None required; one optional hash reference. Currently, the only element
691             honored in that hashref is C, whose value must be a non-negative
692             integer. If, instead of getting the count of all descendants of a node, you
693             only want the count of its first generation, i.e., its immediate children, you
694             provide a value of C<1>. Want the count of only the first and second
695             generations? Provide a value of C<2> -- and so on.
696              
697             =item * Return Value
698              
699             Reference to hash in which each element is keyed on the value of the path
700             column in the incoming taxonomy file.
701              
702             =back
703              
704             =cut
705              
706             sub descendant_counts {
707 9     9 1 3395 my ($self, $args) = @_;
708 9 100       19 if (defined $args) {
709 5 100 100     238 croak "Argument to 'descendant_counts()' must be hashref"
710             unless (ref($args) and reftype($args) eq 'HASH');
711             croak "Value for 'generations' element passed to descendant_counts() must be integer > 0"
712 3 100 100     159 unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
713             }
714 5         8 my %descendant_counts = ();
715 5         8 my $hashified = $self->hashify();
716 5         7 for my $p (keys %{$hashified}) {
  5         14  
717 59         46 $descendant_counts{$p} = 0;
718 59         44 for my $q (
719 707         661 grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
720 59         96 keys %{$hashified}
721             ) {
722 230 100       700 if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
723 61 100       64 if (! $args->{generations}) {
724 48         50 $descendant_counts{$p}++;
725             }
726             else {
727 13         41 my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
728 13         28 my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
729             $descendant_counts{$p}++
730 13 100       34 if (scalar(@d) - scalar(@c) <= $args->{generations});
731             }
732             }
733             }
734             }
735 5         9 $self->{descendant_counts} = \%descendant_counts;
736 5         46 return $self->{descendant_counts};
737             }
738              
739             =head2 C
740              
741             =over 4
742              
743             =item * Purpose
744              
745             Get the total number of descendant nodes for one specific node in a validated
746             taxonomy.
747              
748             =item * Arguments
749              
750             $descendant_count = $self->get_descendant_count('|Path|To|Node');
751              
752             $descendant_counts = $self->get_descendant_count('|Path|To|Node', { generations => 1 } );
753              
754             One required: string containing node's path as spelled in the taxonomy.
755              
756             One optional hash reference. Currently, the only element honored in that
757             hashref is C, whose value must be a non-negative integer. If,
758             instead of getting the count of all descendants of a node, you only want the
759             count of its first generation, i.e., its immediate children, you provide a
760             value of C<1>. Want the count of only first and second generations? Provide
761             a value of C<2> -- and so on.
762              
763             =item * Return Value
764              
765             Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
766             leaf node.
767              
768             =item * Comment
769              
770             Will throw an exception if the node does not exist or is misspelled.
771              
772             If C is called with no second (hashref) argument
773             following an invocation of C, it will return a value from
774             an internal cache created during that earlier method call. Otherwise, it will
775             re-create the cache from scratch. (This, of course, assumes that you have not
776             manipulated the object's internal data subsequent to its creation.)
777              
778             =back
779              
780             =cut
781              
782             sub get_descendant_count {
783 26     26 1 7609 my ($self, $node, $args) = @_;
784 26 100       53 if (defined $args) {
785 15 100 100     218 croak "Second argument to 'get_descendant_count()' must be hashref"
786             unless (ref($args) and reftype($args) eq 'HASH');
787             croak "Value for 'generations' element passed to second argument to get_descendant_count() must be integer > 0"
788 13 100 100     203 unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
789             }
790 22 100       33 if (exists $self->{descendant_counts}) {
791 6         6 my $descendant_counts = $self->{descendant_counts};
792 6 100       178 croak "Node '$node' not found" unless exists $descendant_counts->{$node};
793 4         9 return $descendant_counts->{$node};
794             }
795             else {
796 16         22 my %descendant_counts = ();
797 16         22 my $hashified = $self->hashify();
798 16 100       104 croak "Node '$node' not found" unless exists $hashified->{$node};
799 15         16 for my $p ($node) {
800 15         16 $descendant_counts{$p} = 0;
801 15         9 for my $q (
802 231         248 grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
803 15         43 keys %{$hashified}
804             ) {
805 110 100       337 if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
806 36 100       41 if (! $args->{generations}) {
807 10         13 $descendant_counts{$p}++;
808             }
809             else {
810 26         56 my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
811 26         57 my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
812             $descendant_counts{$p}++
813 26 100       67 if (scalar(@d) - scalar(@c) <= $args->{generations});
814             }
815             }
816             }
817             }
818 15         151 return $descendant_counts{$node};
819             }
820             }
821              
822             =head2 C
823              
824             =over 4
825              
826             =item * Purpose
827              
828             Turn a validated taxonomy into a Perl hash keyed on the column designated as
829             the path column.
830              
831             =item * Arguments
832              
833             $hashref = $self->hashify();
834              
835             Takes an optional hashref holding a list of any of the following elements:
836              
837             =over 4
838              
839             =item * C
840              
841             Boolean, defaulting to C<0>. By default, C will spell the
842             key of the hash exactly as the value of the path column is spelled in the
843             taxonomy -- which in turn is the way it was spelled in the incoming file.
844             That is, a path in the taxonomy spelled C<|Alpha|Beta|Gamma> will be spelled
845             as a key in exactly the same way.
846              
847             However, since in many cases (including the example above) the root node of
848             the taxonomy will be empty, the user may wish to remove the first instance of
849             C. The user would do so by setting
850             C to a true value.
851              
852             $hashref = $self->hashify( {
853             remove_leading_path_col_sep => 1,
854             } );
855              
856             In that case they key would now be spelled: C.
857              
858             Note further that if the C switch is set to a true value, any
859             setting to C will be ignored.
860              
861             =item * C
862              
863             A string which will be used in composing the key of the hashref returned by
864             this method. The user may select this key if she does not want to use the
865             value found in the incoming CSV file (which by default will be the pipe
866             character (C<|>) and which may be overridden with the C argument
867             to C.
868              
869             $hashref = $self->hashify( {
870             key_delim => q{ - },
871             } );
872              
873             In the above variant, a path that in the incoming taxonomy file was
874             represented by C<|Alpha|Beta|Gamma> will in C<$hashref> be represented by
875             C< - Alpha - Beta - Gamma>.
876              
877             =item * C
878              
879             A string which will be used in composing the key of the hashref returned by
880             this method. The user will set this switch if she wishes to have the root
881             note explicitly represented. Using this switch will automatically cause
882             C to be ignored.
883              
884             Suppose the user wished to have C be the text for the root
885             node. Suppose further that the user wanted to use the string C< - > as the
886             delimiter within the key.
887              
888             $hashref = $self->hashify( {
889             root_str => q{All Suppliers},
890             key_delim => q{ - },
891             } );
892              
893             Then incoming path C<|Alpha|Beta|Gamma> would be keyed as:
894              
895             All Suppliers - Alpha - Beta - Gamma
896              
897             =back
898              
899             =item * Return Value
900              
901             Hash reference. The number of elements in this hash should be equal to the
902             number of non-header records in the taxonomy.
903              
904             =back
905              
906             =cut
907              
908             sub hashify {
909 33     33 1 12134 my ($self, $args) = @_;
910 33 100       58 if (defined $args) {
911 8 100 100     272 croak "Argument to 'hashify()' must be hashref"
912             unless (ref($args) and reftype($args) eq 'HASH');
913             }
914 31         30 my %hashified = ();
915 31         35 my $fields = $self->{fields};
916 31         31 my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
  170         246  
  31         52  
917 31         44 for my $rec (@{$self->{data_records}}) {
  31         54  
918 427         215 my $rowkey;
919 427 100       424 if ($args->{root_str}) {
920 39         48 $rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
921             }
922             else {
923 388 100       343 if ($args->{remove_leading_path_col_sep}) {
924 26         127 ($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
925             }
926             else {
927 362         301 $rowkey = $rec->[$self->{path_col_idx}];
928             }
929             }
930 427 100       487 if ($args->{key_delim}) {
931 52         149 $rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
932             }
933 427         250 my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
  2402         2946  
  427         378  
934 427         755 $hashified{$rowkey} = $rowdata;
935             }
936 31         76 return \%hashified;
937             }
938              
939             =head2 C
940              
941             =over 4
942              
943             =item * Purpose
944              
945             Transform a taxonomy-by-materialized-path into a taxonomy-by-adjacent-list.
946              
947             =item * Arguments
948              
949             $adjacentified = $self->adjacentify();
950              
951             $adjacentified = $self->adjacentify( { serial => 500 } );
952             $adjacentified = $self->adjacentify( { floor => 500 } ); # same as serial
953              
954             Optional single hash reference.
955              
956             For that hashref, C supports the key C, which defaults
957             to C<0>. C must be a non-negative integer and sets the "floor" above
958             which new unique IDs will be assigned to the C column. Hence, if
959             C is set to C<500>, the value assigned to the C column of the
960             first record to be processed will be C<501>.
961              
962             Starting with version .19, C will serve as an alternative way of
963             providing the same information to C. If, however, by mistake
964             you provide B C and C elements in the hash, C
965             will take precedence.
966              
967             =item * Return Value
968              
969             Reference to an array of hash references. Each element represents one node in
970             the taxonomy. Each element will have key-value pairs for C, C
971             and C which will hold the adjacentification of the materialized path in the
972             original taxonomy-by-materialized-path. Each element will, as well, have KVPs for the
973             non-materialized-path fields in the records in the original taxonomy-by-materialized-path.
974              
975             =item * Comment
976              
977             See documentation for C for example.
978              
979             Note that the order in which C will assign C and
980             C values to records in the taxonomy-by-adjacent-list will almost
981             certainly B match the order in which elements appear in a CSV file or in
982             the data structure returned by a method such as C.
983              
984             =back
985              
986             =cut
987              
988             sub adjacentify {
989 15     15 1 5649 my ($self, $args) = @_;
990 15         17 my $serial = 0;
991 15 100       38 if (defined $args) {
992 9 100 100     292 croak "Argument to 'adjacentify()' must be hashref"
993             unless (ref($args) and reftype($args) eq 'HASH');
994 7         10 for my $w ('serial', 'floor') {
995 13 100       19 if (exists $args->{$w}) {
996             croak "Element '$w' in argument to 'adjacentify()' must be integer"
997 8 100       189 unless ($args->{$w} =~ m/^\d+$/);
998             }
999             }
1000 5   50     18 $serial = $args->{serial} || $args->{floor} || 0;
1001             }
1002              
1003              
1004 11         41 my $fields = $self->fields();
1005 11         22 my $drpc = $self->data_records_path_components();
1006              
1007 11         22 my $path_col_idx = $self->path_col_idx();
1008 41         60 my %non_path_col2idx = map { $fields->[$_] => $_ }
1009 52         52 grep { $_ != $path_col_idx }
1010 11         16 (0..$#{$fields});
  11         13  
1011              
1012             my @components_by_row =
1013 11         14 map { my $f = $_->[$path_col_idx]; my $c = $#{$f}; [ @{$f}[1..$c] ] } @{$drpc};
  156         97  
  156         80  
  156         98  
  156         110  
  156         232  
  11         15  
1014 11         17 my $max_components = max( map { scalar(@{$_}) } @components_by_row);
  156         76  
  156         142  
1015 11         12 my @adjacentified = ();
1016 11         10 my %paths_to_id;
1017 11         30 for my $depth (1..$max_components) {
1018 34         66 for (my $r = 0; $r <= $#components_by_row; $r++) {
1019 498 100       255 if (scalar(@{$components_by_row[$r]}) == $depth) {
  498         788  
1020 156         162 my %rowdata = map { $_ => $drpc->[$r]->[$non_path_col2idx{$_}] }
  568         628  
1021             keys %non_path_col2idx;
1022 156         127 my @path_components = @{$drpc->[$r]->[$path_col_idx]};
  156         221  
1023 156         104 my $name = $path_components[-1];
1024 156         208 my $parent_of_name = join('|' =>
1025             @path_components[1 .. ($#path_components -1)]);
1026              
1027 156 100       200 my $candidate_for_path = (length($parent_of_name))
1028             ? join('|' => $parent_of_name, $name)
1029             : $name;
1030              
1031             my %rowhash = (
1032             id => ++$serial,
1033             parent_id => $paths_to_id{$parent_of_name}{id},
1034 156         476 name => $name,
1035             %rowdata,
1036             );
1037 156         262 $paths_to_id{$candidate_for_path}{id} = $rowhash{id};
1038 156 100       232 $paths_to_id{$candidate_for_path}{parent_path} = $parent_of_name
1039             if (length($parent_of_name));
1040 156         412 push @adjacentified, \%rowhash;
1041             }
1042             }
1043             }
1044 11         147 return \@adjacentified;
1045             }
1046              
1047             =head2 C
1048              
1049             =over 4
1050              
1051             =item * Purpose
1052              
1053             Create a CSV-formatted file holding the data returned by C.
1054              
1055             =item * Arguments
1056              
1057             $csv_file = $self->write_adjacentified_to_csv( {
1058             adjacentified => $adjacentified, # output of adjacentify()
1059             csvfile => './t/data/taxonomy_out3.csv',
1060             } );
1061              
1062             Single hash reference. That hash is keyed on:
1063              
1064             =over 4
1065              
1066             =item * C
1067              
1068             B Its value must be the arrayref of hash references returned by
1069             the C method.
1070              
1071             =item * C
1072              
1073             Optional. Path to location where a CSV-formatted text file holding the
1074             taxonomy-by-adjacent-list will be written. Defaults to a file called
1075             F in the current working directory.
1076              
1077             =item * Text::CSV_XS options
1078              
1079             You can also pass through any key-value pairs normally accepted by
1080             F.
1081              
1082             =back
1083              
1084             =item * Return Value
1085              
1086             Returns path to CSV-formatted text file just created.
1087              
1088             =item * Example
1089              
1090             Suppose we have a CSV-formatted file holding the following taxonomy-by-materialized-path:
1091              
1092             "path","is_actionable"
1093             "|Alpha","0"
1094             "|Beta","0"
1095             "|Alpha|Epsilon","0"
1096             "|Alpha|Epsilon|Kappa","1"
1097             "|Alpha|Zeta","0"
1098             "|Alpha|Zeta|Lambda","1"
1099             "|Alpha|Zeta|Mu","0"
1100             "|Beta|Eta","1"
1101             "|Beta|Theta","1"
1102              
1103             After running this file through C, C and
1104             C we will have a new CSV-formatted file holding
1105             this taxonomy-by-adjacent-list:
1106              
1107             id,parent_id,name,is_actionable
1108             1,,Alpha,0
1109             2,,Beta,0
1110             3,1,Epsilon,0
1111             4,1,Zeta,0
1112             5,2,Eta,1
1113             6,2,Theta,1
1114             7,3,Kappa,1
1115             8,4,Lambda,1
1116             9,4,Mu,0
1117              
1118             Note that the C column has been replaced by the C, C and
1119             C columns.
1120              
1121             =back
1122              
1123             =cut
1124              
1125             sub write_adjacentified_to_csv {
1126 14     14 1 6988 my ($self, $args) = @_;
1127 14 100       25 if (defined $args) {
1128 13 100 100     214 croak "Argument to 'adjacentify()' must be hashref"
1129             unless (ref($args) and reftype($args) eq 'HASH');
1130             croak "Argument to 'adjacentify()' must have 'adjacentified' element"
1131 11 100       88 unless exists $args->{adjacentified};
1132             croak "Argument 'adjacentified' must be array reference"
1133             unless (ref($args->{adjacentified}) and
1134 10 100 100     194 reftype($args->{adjacentified}) eq 'ARRAY');
1135             }
1136             else {
1137 1         98 croak "write_adjacentified_to_csv() must be supplied with hashref"
1138             }
1139 8         9 my $adjacentified = $args->{adjacentified};
1140 8         10 delete $args->{adjacentified};
1141              
1142 8         21 my $columns_in = $self->fields;
1143             my @non_path_columns_in =
1144 26         35 map { $columns_in->[$_] }
1145 34         41 grep { $_ != $self->{path_col_idx} }
1146 8         12 (0..$#{$columns_in});
  8         14  
1147 8         13 my @columns_out = (qw| id parent_id name |);
1148 8         14 push @columns_out, @non_path_columns_in;
1149              
1150 8         16547 my $cwd = cwd();
1151             my $csvfile = defined($args->{csvfile})
1152             ? $args->{csvfile}
1153 8 100       58 : "$cwd/taxonomy_out.csv";
1154 8         33 delete $args->{csvfile};
1155              
1156             # By this point, we should have processed all args other than those
1157             # intended for Text::CSV_XS and assigned their contents to variables as
1158             # needed.
1159              
1160 8         38 my $csv_args = { binary => 1 };
1161 8         12 while (my ($k,$v) = each %{$args}) {
  11         55  
1162 3         9 $csv_args->{$k} = $v;
1163             }
1164 8         84 my $csv = Text::CSV_XS->new($csv_args);
1165 8 50   1   1672 open my $OUT, ">:encoding(utf8)", $csvfile
  1         10  
  1         1  
  1         9  
1166             or croak "Unable to open $csvfile for writing";
1167 8 100       8750 $csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
1168 8         282 $csv->print($OUT, [@columns_out]);
1169 8         65 for my $rec (@{$adjacentified}) {
  8         18  
1170             $csv->print(
1171             $OUT,
1172 117         442 [ map { $rec->{$columns_out[$_]} } (0..$#columns_out) ]
  724         963  
1173             );
1174             }
1175 8 50       370 close $OUT or croak "Unable to close $csvfile after writing";
1176              
1177 8         112 return $csvfile;
1178             }
1179              
1180             1;
1181              
1182             # vim: formatoptions=crqot