File Coverage

lib/Parse/Taxonomy/MaterializedPath.pm
Criterion Covered Total %
statement 326 326 100.0
branch 134 140 95.7
condition 39 40 97.5
subroutine 24 24 100.0
pod 11 11 100.0
total 534 541 98.7


line stmt bran cond sub pod time code
1             package Parse::Taxonomy::MaterializedPath;
2 6     6   10254 use strict;
  6         6  
  6         160  
3 6     6   2005 use parent qw( Parse::Taxonomy );
  6         1262  
  6         23  
4 6     6   186 use Carp;
  6         6  
  6         274  
5 6     6   3481 use Text::CSV_XS;
  6         36382  
  6         371  
6 6     6   36 use Scalar::Util qw( reftype );
  6         6  
  6         260  
7 6     6   21 use List::Util qw( max );
  6         39  
  6         450  
8 6     6   19 use Cwd;
  6         8  
  6         337  
9             our $VERSION = '0.24';
10 6         15186 use Parse::Taxonomy::Auxiliary qw(
11             path_check_fields
12             components_check_fields
13 6     6   1400 );
  6         8  
14              
15             =head1 NAME
16              
17             Parse::Taxonomy::MaterializedPath - Validate a file for use as a path-based taxonomy
18              
19             =head1 SYNOPSIS
20              
21             use Parse::Taxonomy::MaterializedPath;
22              
23             # 'file' interface: reads a CSV file for you
24              
25             $source = "./t/data/alpha.csv";
26             $self = Parse::Taxonomy::MaterializedPath->new( {
27             file => $source,
28             } );
29              
30             # 'components' interface: as if you've already read a
31             # CSV file and now have Perl array references to header and data rows
32              
33             $self = Parse::Taxonomy::MaterializedPath->new( {
34             components => {
35             fields => $fields,
36             data_records => $data_records,
37             }
38             } );
39              
40             =head1 METHODS
41              
42             =head2 C
43              
44             =over 4
45              
46             =item * Purpose
47              
48             Parse::Taxonomy::MaterializedPath constructor.
49              
50             =item * Arguments
51              
52             Single hash reference. There are two possible interfaces: C and C.
53              
54             =over 4
55              
56             =item 1 C interface
57              
58             $source = "./t/data/alpha.csv";
59             $self = Parse::Taxonomy::MaterializedPath->new( {
60             file => $source,
61             path_col_idx => 0,
62             path_col_sep => '|',
63             %TextCSVoptions,
64             } );
65              
66             Elements in the hash reference are keyed on:
67              
68             =over 4
69              
70             =item * C
71              
72             Absolute or relative path to the incoming taxonomy file.
73             B for this interface.
74              
75             =item * C
76              
77             If the column to be used as the "path" column in the incoming taxonomy file is
78             B the first column, this option must be set to the integer representing
79             the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
80              
81             =item * C
82              
83             If the string used to distinguish components of the path in the path column in
84             the incoming taxonomy file is not a pipe (C<|>), this option must be set.
85             Optional; defaults to C<|>.
86              
87             =item * Text::CSV_XS options
88              
89             Any other options which could normally be passed to Cnew()> will
90             be passed through to that module's constructor. On the recommendation of the
91             Text::CSV documentation, C is always set to a true value.
92              
93             =back
94              
95             =item 2 C interface
96              
97             $self = Parse::Taxonomy::MaterializedPath->new( {
98             components => {
99             fields => $fields,
100             data_records => $data_records,
101             }
102             } );
103              
104             Elements in this hash are keyed on:
105              
106             =over 4
107              
108             =item * C
109              
110             This element is B for the
111             C interface. The value of this element is a hash reference with two keys, C and
112             C. C is a reference to an array holding the field or
113             column names for the data set. C is a reference to an array of
114             array references, each of the latter arrayrefs holding one record or row from
115             the data set.
116              
117             =item * C
118              
119             Same as in C interface above.
120              
121             =item * C
122              
123             Same as in C interface above.
124              
125             =back
126              
127             =back
128              
129             =item * Return Value
130              
131             Parse::Taxonomy::MaterializedPath object.
132              
133             =item * Comment
134              
135             C will throw an exception under any of the following conditions:
136              
137             =over 4
138              
139             =item * Argument to C is not a reference.
140              
141             =item * Argument to C is not a hash reference.
142              
143             =item * In the C interface, unable to locate the file which is the value of the C element.
144              
145             =item * Argument to C element is not an integer.
146              
147             =item * Argument to C is greater than the index number of the
148             last element in the header row of the incoming taxonomy file, I the
149             C is wrong.
150              
151             =item * The same field is found more than once in the header row of the
152             incoming taxonomy file.
153              
154             =item * Unable to open or close the incoming taxonomy file for reading.
155              
156             =item * In the column designated as the "path" column, the same value is
157             observed more than once.
158              
159             =item * C, C, C, C and C are reserved terms.
160             One or more columns is named with a reserved term.
161              
162             =item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
163              
164             =item * A data row has a number of fields different from the number of fields
165             in the header row.
166              
167             =back
168              
169             =back
170              
171             =cut
172              
173             sub new {
174 62     62 1 36876 my ($class, $args) = @_;
175 62         63 my $data;
176              
177 62 100 100     713 croak "Argument to 'new()' must be hashref"
178             unless (ref($args) and reftype($args) eq 'HASH');
179 60         57 my $argscount = 0;
180 60 100       124 $argscount++ if $args->{file};
181 60 100       102 $argscount++ if $args->{components};
182 60 100       187 croak "Argument to 'new()' must have either 'file' or 'components' element"
183             if ($argscount == 0);
184 59 100       245 croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
185             if ($argscount == 2);
186              
187 58 100       101 if (exists $args->{path_col_idx}) {
188             croak "Argument to 'path_col_idx' must be integer"
189 4 100       104 unless $args->{path_col_idx} =~ m/^\d+$/;
190             }
191 57   100     220 $data->{path_col_idx} = delete $args->{path_col_idx} || 0;
192             $data->{path_col_sep} = exists $args->{path_col_sep}
193             ? $args->{path_col_sep}
194 57 100       154 : '|';
195 57 100       75 if (exists $args->{path_col_sep}) {
196 5         6 $data->{path_col_sep} = $args->{path_col_sep};
197 5         8 delete $args->{path_col_sep};
198             }
199             else {
200 52         60 $data->{path_col_sep} = '|';
201             }
202              
203 57 100       79 if ($args->{components}) {
204             croak "Value of 'components' element must be hashref"
205 30 100 100     321 unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
206 28         42 for my $k ( qw| fields data_records | ) {
207             croak "Value of 'components' element must have '$k' key-value pair"
208 53 100       260 unless exists $args->{components}->{$k};
209             croak "Value of '$k' element must be arrayref"
210             unless (ref($args->{components}->{$k}) and
211 51 100 100     484 reftype($args->{components}->{$k}) eq 'ARRAY');
212             }
213 23         29 for my $row (@{$args->{components}->{data_records}}) {
  23         53  
214 292 100 100     1020 croak "Each element in 'data_records' array must be arrayref"
215             unless (ref($row) and reftype($row) eq 'ARRAY');
216             }
217             # We don't want to stick $args->{components} into the object as is.
218             # Rather, we want to insert 'fields' and 'data_records' for
219             # consistency with the 'file' interface. But to do that we first need
220             # to impose the same validations that we do for the 'file' interface.
221             # We also need to populate 'path_col'.
222 21         43 _prepare_fields($data, $args->{components}->{fields}, 1);
223 18         20 my $these_data_records = $args->{components}->{data_records};
224 18         22 delete $args->{components};
225 18         34 _prepare_data_records($data, $these_data_records, $args);
226             }
227             else {
228             croak "Cannot locate file '$args->{file}'"
229 27 100       599 unless (-f $args->{file});
230 26         43 $data->{file} = delete $args->{file};
231              
232             # We've now handled all the Parse::Taxonomy::MaterializedPath-specific options.
233             # Any remaining options are assumed to be intended for Text::CSV_XS::new().
234              
235 26         34 $args->{binary} = 1;
236 26 50       107 my $csv = Text::CSV_XS->new ( $args )
237             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
238             open my $IN, "<", $data->{file}
239 26 50       2519 or croak "Unable to open '$data->{file}' for reading";
240 4     4   1711 my $header_ref = $csv->getline($IN);
  4         13989  
  4         105  
  26         895  
241              
242 26         847 _prepare_fields($data, $header_ref);
243 23         423 my $data_records = $csv->getline_all($IN);
244 23 50       5678 close $IN or croak "Unable to close after reading";
245 23         46 _prepare_data_records($data, $data_records, $args);
246             }
247              
248 32         32 while (my ($k,$v) = each %{$args}) {
  50         122  
249 18         29 $data->{$k} = $v;
250             }
251 32         41 my %row_analysis = ();
252 32         77 for my $el (@{$data->{data_records}}) {
  32         64  
253 425         322 my $rowkey = $el->[$data->{path_col_idx}];
254 425         714 $row_analysis{$rowkey} = split(/\Q$data->{path_col_sep}\E/, $rowkey);
255             }
256 32         63 $data->{row_analysis} = \%row_analysis;
257 32         116 return bless $data, $class;
258             }
259              
260             sub _prepare_fields {
261 47     47   54 my ($data, $fields_ref, $components) = @_;
262 47 100       80 if (! $components) {
263 26         41 _check_path_col_idx($data, $fields_ref, 0);
264 25         57 path_check_fields($data, $fields_ref);
265             }
266             else {
267 21         36 _check_path_col_idx($data, $fields_ref, 1);
268 20         51 components_check_fields($data, $fields_ref);
269             }
270              
271 43         41 my %fields_seen = map { $_ => 1 } @{$fields_ref};
  229         280  
  43         52  
272 43         74 my @bad_fields = ();
273 43         58 for my $reserved ( qw| id parent_id name lft rgh | ) {
274 215 100       296 push @bad_fields, $reserved if $fields_seen{$reserved};
275             }
276 43         87 my $msg = "Bad column names: <@bad_fields>. These are reserved for ";
277 43         40 $msg .= "Parse::Taxonomy's internal use; please rename";
278 43 100       257 croak $msg if @bad_fields;
279              
280 41         57 $data->{fields} = $fields_ref;
281 41         65 $data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
282 41         68 return $data;
283             }
284              
285             sub _check_path_col_idx {
286 47     47   45 my ($data, $fields_ref, $components) = @_;
287 47         49 my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
288 47 100       101 $error_msg .= $components
289             ? "'fields' array ref"
290             : "header row in '$data->{file}'";
291              
292 47 100       37 croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
  47         343  
293             }
294              
295             sub _prepare_data_records {
296             # Confirm each row's path starts with path_col_sep:
297             # Confirm no duplicate entries in column holding path:
298             # Confirm all rows have same number of columns as header:
299 41     41   47 my ($data, $data_records, $args) = @_;
300 41         24 my $error_msg;
301 41         44 my @bad_path_cols = ();
302 41         35 my @bad_count_records = ();
303 41         54 my %paths_seen = ();
304 41         24 my $field_count = scalar(@{$data->{fields}});
  41         62  
305 41         34 for my $rec (@{$data_records}) {
  41         56  
306 537 100       1499 unless ($rec->[$data->{path_col_idx}] =~ m/^\Q$data->{path_col_sep}\E/) {
307 6         9 push @bad_path_cols, $rec->[$data->{path_col_idx}];
308             }
309 537         684 $paths_seen{$rec->[$data->{path_col_idx}]}++;
310 537         291 my $this_row_count = scalar(@{$rec});
  537         407  
311 537 100       791 if ($this_row_count != $field_count) {
312             push @bad_count_records,
313 5         10 [ $rec->[$data->{path_col_idx}], $this_row_count ];
314             }
315             }
316 41         51 $error_msg = <
317             The value of the column designated as path must start with the path column separator.
318             Rows with the following paths fail to do so:
319             IMPROPER_PATH
320 41         48 for my $path (@bad_path_cols) {
321 6         9 $error_msg .= " $path\n";
322             }
323 41 100       287 croak $error_msg if @bad_path_cols;
324              
325 39         40 my @dupe_paths = ();
326 39         238 for my $path (sort keys %paths_seen) {
327 504 100       589 push @dupe_paths, $path if $paths_seen{$path} > 1;
328             }
329 39         60 $error_msg = <
330             No duplicate entries are permitted in column designated as path.
331             The following entries appear the number of times shown:
332             ERROR_MSG_DUPE
333 39         43 for my $path (@dupe_paths) {
334 5         17 $error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
335             }
336 39 100       343 croak $error_msg if @dupe_paths;
337              
338 36         60 $error_msg = <
339             Header row has $field_count records. The following records had different counts:
340             ERROR_MSG_WRONG_COUNT
341 36         44 for my $rec (@bad_count_records) {
342 4         8 $error_msg .= " $rec->[0]: $rec->[1]\n";
343             }
344 36 100       237 croak $error_msg if @bad_count_records;
345              
346             # Confirm each node appears in taxonomy:
347 34         34 my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
  19         51  
  34         71  
348 34         52 $path_args->{sep} = $data->{path_col_sep};
349 34 50       137 my $path_csv = Text::CSV_XS->new ( $path_args )
350             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
351 34         2644 my %missing_parents = ();
352 34         187 for my $path (sort keys %paths_seen) {
353 447         623 my $status = $path_csv->parse($path);
354 447         3869 my @columns = $path_csv->fields();
355 447 100       1891 if (@columns > 2) {
356             my $parent =
357 324         507 join($path_args->{sep} => @columns[0 .. ($#columns - 1)]);
358 324 100       625 unless (exists $paths_seen{$parent}) {
359 4         11 $missing_parents{$path} = $parent;
360             }
361             }
362             }
363 34         51 $error_msg = <
364             Each node in the taxonomy must have a parent.
365             The following nodes lack the expected parent:
366             ERROR_MSG_ORPHAN
367 34         65 for my $path (sort keys %missing_parents) {
368 4         10 $error_msg .= " $path: $missing_parents{$path}\n";
369             }
370 34 100       289 croak $error_msg if scalar(keys %missing_parents);
371             # BBB end of validations
372 32         39 $data->{data_records} = $data_records;
373              
374 32         280 return $data;
375             }
376              
377             =head2 C
378              
379             =over 4
380              
381             =item * Purpose
382              
383             Identify the names of the columns in the taxonomy.
384              
385             =item * Arguments
386              
387             my $fields = $self->fields();
388              
389             No arguments; the information is already inside the object.
390              
391             =item * Return Value
392              
393             Reference to an array holding a list of the columns as they appear in the
394             header row of the incoming taxonomy file.
395              
396             =item * Comment
397              
398             Read-only.
399              
400             =back
401              
402             =head2 C
403              
404             =over 4
405              
406             =item * Purpose
407              
408             Identify the index position (count starts at 0) of the column in the incoming
409             taxonomy file which serves as the path column.
410              
411             =item * Arguments
412              
413             my $path_col_idx = $self->path_col_idx;
414              
415             No arguments; the information is already inside the object.
416              
417             =item * Return Value
418              
419             Integer in the range from 0 to 1 less than the number of columns in the header
420             row.
421              
422             =item * Comment
423              
424             Read-only.
425              
426             =back
427              
428             =cut
429              
430             sub path_col_idx {
431 15     15 1 2812 my $self = shift;
432 15         20 return $self->{path_col_idx};
433             }
434              
435             =head2 C
436              
437             =over 4
438              
439             =item * Purpose
440              
441             Identify the name of the column in the incoming taxonomy which serves as the
442             path column.
443              
444             =item * Arguments
445              
446             my $path_col = $self->path_col;
447              
448             No arguments; the information is already inside the object.
449              
450             =item * Return Value
451              
452             String.
453              
454             =item * Comment
455              
456             Read-only.
457              
458             =back
459              
460             =cut
461              
462             sub path_col {
463 4     4 1 1112 my $self = shift;
464 4         8 return $self->{path_col};
465             }
466              
467             =head2 C
468              
469             =over 4
470              
471             =item * Purpose
472              
473             Identify the string used to separate path components once the taxonomy has
474             been created. This is just a "getter" and is logically distinct from the
475             option to C which is, in effect, a "setter."
476              
477             =item * Arguments
478              
479             my $path_col_sep = $self->path_col_sep;
480              
481             No arguments; the information is already inside the object.
482              
483             =item * Return Value
484              
485             String.
486              
487             =item * Comment
488              
489             Read-only.
490              
491             =back
492              
493             =cut
494              
495             sub path_col_sep {
496 4     4 1 962 my $self = shift;
497 4         6 return $self->{path_col_sep};
498             }
499              
500             =head2 C
501              
502             =over 4
503              
504             =item * Purpose
505              
506             Once the taxonomy has been validated, get a list of its data rows as a Perl
507             data structure.
508              
509             =item * Arguments
510              
511             $data_records = $self->data_records;
512              
513             None.
514              
515             =item * Return Value
516              
517             Reference to array of array references. The array will hold the data records
518             found in the incoming taxonomy file in their order in that file.
519              
520             =item * Comment
521              
522             Does not contain any information about the fields in the taxonomy, so you
523             should probably either (a) use in conjunction with C method above;
524             or (b) use C.
525              
526             =back
527              
528             =head2 C
529              
530             =over 4
531              
532             =item * Purpose
533              
534             Once the taxonomy has been validated, get a list of its header and data rows as a Perl
535             data structure.
536              
537             =item * Arguments
538              
539             $data_records = $self->fields_and_data_records;
540              
541             None.
542              
543             =item * Return Value
544              
545             Reference to array of array references. The first element in the array will
546             hold the header row (same as output of C). The remaining elements
547             will hold the data records found in the incoming taxonomy file in their order
548             in that file.
549              
550             =back
551              
552             =cut
553              
554             =head2 C
555              
556             =over 4
557              
558             =item * Purpose
559              
560             Once the taxonomy has been validated, get a list of its data rows as a Perl
561             data structure. In each element of this list, the path is now represented as
562             an array reference rather than a string.
563              
564             =item * Arguments
565              
566             $data_records_path_components = $self->data_records_path_components;
567              
568             None.
569              
570             =item * Return Value
571              
572             Reference to array of array references. The array will hold the data records
573             found in the incoming taxonomy file in their order in that file.
574              
575             =item * Comment
576              
577             Does not contain any information about the fields in the taxonomy, so you may
578             wish to use this method either (a) use in conjunction with C method
579             above; or (b) use C.
580              
581             =back
582              
583             =cut
584              
585             sub data_records_path_components {
586 13     13 1 1316 my $self = shift;
587 13         17 my @all_rows = ();
588 13         10 for my $row (@{$self->{data_records}}) {
  13         28  
589 182         160 my $path_col = $row->[$self->{path_col_idx}];
590 182         682 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
591 182         136 my @rewritten = ();
592 182         124 for (my $i = 0; $i <= $#{$row}; $i++) {
  1062         1300  
593 880 100       795 if ($i != $self->{path_col_idx}) {
594 698         702 push @rewritten, $row->[$i];
595             }
596             else {
597 182         206 push @rewritten, \@path_components;
598             }
599             }
600 182         233 push @all_rows, \@rewritten;
601             }
602 13         43 return \@all_rows;
603             }
604              
605             =head2 C
606              
607             =over 4
608              
609             =item * Purpose
610              
611             Once the taxonomy has been validated, get a list of its data rows as a Perl
612             data structure. The first element in this list is an array reference holding
613             the header row. In each data element of this list, the path is now represented as
614             an array reference rather than a string.
615              
616             =item * Arguments
617              
618             $fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
619              
620             None.
621              
622             =item * Return Value
623              
624             Reference to array of array references. The array will hold the data records
625             found in the incoming taxonomy file in their order in that file.
626              
627             =back
628              
629             =cut
630              
631             sub fields_and_data_records_path_components {
632 6     6 1 3508 my $self = shift;
633 6         41 my @all_rows = $self->fields;
634 6         6 for my $row (@{$self->{data_records}}) {
  6         15  
635 78         68 my $path_col = $row->[$self->{path_col_idx}];
636 78         191 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
637 78         55 my @rewritten = ();
638 78         59 for (my $i = 0; $i <= $#{$row}; $i++) {
  546         641  
639 468 100       413 if ($i != $self->{path_col_idx}) {
640 390         367 push @rewritten, $row->[$i];
641             }
642             else {
643 78         86 push @rewritten, \@path_components;
644             }
645             }
646 78         89 push @all_rows, \@rewritten;
647             }
648 6         14 return \@all_rows;
649             }
650              
651             =head2 C
652              
653             =over 4
654              
655             =item * Purpose
656              
657             Identify the index position of a given field within the header row.
658              
659             =item * Arguments
660              
661             $index = $self->get_field_position('income');
662              
663             Takes a single string holding the name of one of the fields (column names).
664              
665             =item * Return Value
666              
667             Integer representing the index position (counting from C<0>) of the field
668             provided as argument. Throws exception if the argument is not actually a
669             field.
670              
671             =back
672              
673             =cut
674              
675             =head2 C
676              
677             =over 4
678              
679             =item * Purpose
680              
681             Display the number of descendant (multi-generational) nodes each node in the
682             taxonomy has.
683              
684             =item * Arguments
685              
686             $descendant_counts = $self->descendant_counts();
687              
688             $descendant_counts = $self->descendant_counts( { generations => 1 } );
689              
690             None required; one optional hash reference. Currently, the only element
691             honored in that hashref is C, whose value must be a non-negative
692             integer. If, instead of getting the count of all descendants of a node, you
693             only want the count of its first generation, i.e., its immediate children, you
694             provide a value of C<1>. Want the count of only the first and second
695             generations? Provide a value of C<2> -- and so on.
696              
697             =item * Return Value
698              
699             Reference to hash in which each element is keyed on the value of the path
700             column in the incoming taxonomy file.
701              
702             =back
703              
704             =cut
705              
706             sub descendant_counts {
707 9     9 1 3449 my ($self, $args) = @_;
708 9 100       21 if (defined $args) {
709 5 100 100     228 croak "Argument to 'descendant_counts()' must be hashref"
710             unless (ref($args) and reftype($args) eq 'HASH');
711             croak "Value for 'generations' element passed to descendant_counts() must be integer > 0"
712 3 100 100     166 unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
713             }
714 5         8 my %descendant_counts = ();
715 5         11 my $hashified = $self->hashify();
716 5         6 for my $p (keys %{$hashified}) {
  5         15  
717 59         53 $descendant_counts{$p} = 0;
718 59         36 for my $q (
719 707         661 grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
720 59         97 keys %{$hashified}
721             ) {
722 230 100       733 if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
723 61 100       61 if (! $args->{generations}) {
724 48         52 $descendant_counts{$p}++;
725             }
726             else {
727 13         29 my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
728 13         24 my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
729             $descendant_counts{$p}++
730 13 100       34 if (scalar(@d) - scalar(@c) <= $args->{generations});
731             }
732             }
733             }
734             }
735 5         10 $self->{descendant_counts} = \%descendant_counts;
736 5         35 return $self->{descendant_counts};
737             }
738              
739             =head2 C
740              
741             =over 4
742              
743             =item * Purpose
744              
745             Get the total number of descendant nodes for one specific node in a validated
746             taxonomy.
747              
748             =item * Arguments
749              
750             $descendant_count = $self->get_descendant_count('|Path|To|Node');
751              
752             $descendant_counts = $self->get_descendant_count('|Path|To|Node', { generations => 1 } );
753              
754             One required: string containing node's path as spelled in the taxonomy.
755              
756             One optional hash reference. Currently, the only element honored in that
757             hashref is C, whose value must be a non-negative integer. If,
758             instead of getting the count of all descendants of a node, you only want the
759             count of its first generation, i.e., its immediate children, you provide a
760             value of C<1>. Want the count of only first and second generations? Provide
761             a value of C<2> -- and so on.
762              
763             =item * Return Value
764              
765             Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
766             leaf node.
767              
768             =item * Comment
769              
770             Will throw an exception if the node does not exist or is misspelled.
771              
772             If C is called with no second (hashref) argument
773             following an invocation of C, it will return a value from
774             an internal cache created during that earlier method call. Otherwise, it will
775             re-create the cache from scratch. (This, of course, assumes that you have not
776             manipulated the object's internal data subsequent to its creation.)
777              
778             =back
779              
780             =cut
781              
782             sub get_descendant_count {
783 26     26 1 8015 my ($self, $node, $args) = @_;
784 26 100       54 if (defined $args) {
785 15 100 100     220 croak "Second argument to 'get_descendant_count()' must be hashref"
786             unless (ref($args) and reftype($args) eq 'HASH');
787             croak "Value for 'generations' element passed to second argument to get_descendant_count() must be integer > 0"
788 13 100 100     237 unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
789             }
790 22 100       35 if (exists $self->{descendant_counts}) {
791 6         7 my $descendant_counts = $self->{descendant_counts};
792 6 100       193 croak "Node '$node' not found" unless exists $descendant_counts->{$node};
793 4         7 return $descendant_counts->{$node};
794             }
795             else {
796 16         21 my %descendant_counts = ();
797 16         24 my $hashified = $self->hashify();
798 16 100       106 croak "Node '$node' not found" unless exists $hashified->{$node};
799 15         15 for my $p ($node) {
800 15         16 $descendant_counts{$p} = 0;
801 15         9 for my $q (
802 231         235 grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
803 15         44 keys %{$hashified}
804             ) {
805 110 100       395 if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
806 36 100       44 if (! $args->{generations}) {
807 10         14 $descendant_counts{$p}++;
808             }
809             else {
810 26         63 my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
811 26         67 my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
812             $descendant_counts{$p}++
813 26 100       112 if (scalar(@d) - scalar(@c) <= $args->{generations});
814             }
815             }
816             }
817             }
818 15         153 return $descendant_counts{$node};
819             }
820             }
821              
822             =head2 C
823              
824             =over 4
825              
826             =item * Purpose
827              
828             Turn a validated taxonomy into a Perl hash keyed on the column designated as
829             the path column.
830              
831             =item * Arguments
832              
833             $hashref = $self->hashify();
834              
835             Takes an optional hashref holding a list of any of the following elements:
836              
837             =over 4
838              
839             =item * C
840              
841             Boolean, defaulting to C<0>. By default, C will spell the
842             key of the hash exactly as the value of the path column is spelled in the
843             taxonomy -- which in turn is the way it was spelled in the incoming file.
844             That is, a path in the taxonomy spelled C<|Alpha|Beta|Gamma> will be spelled
845             as a key in exactly the same way.
846              
847             However, since in many cases (including the example above) the root node of
848             the taxonomy will be empty, the user may wish to remove the first instance of
849             C. The user would do so by setting
850             C to a true value.
851              
852             $hashref = $self->hashify( {
853             remove_leading_path_col_sep => 1,
854             } );
855              
856             In that case they key would now be spelled: C.
857              
858             Note further that if the C switch is set to a true value, any
859             setting to C will be ignored.
860              
861             =item * C
862              
863             A string which will be used in composing the key of the hashref returned by
864             this method. The user may select this key if she does not want to use the
865             value found in the incoming CSV file (which by default will be the pipe
866             character (C<|>) and which may be overridden with the C argument
867             to C.
868              
869             $hashref = $self->hashify( {
870             key_delim => q{ - },
871             } );
872              
873             In the above variant, a path that in the incoming taxonomy file was
874             represented by C<|Alpha|Beta|Gamma> will in C<$hashref> be represented by
875             C< - Alpha - Beta - Gamma>.
876              
877             =item * C
878              
879             A string which will be used in composing the key of the hashref returned by
880             this method. The user will set this switch if she wishes to have the root
881             note explicitly represented. Using this switch will automatically cause
882             C to be ignored.
883              
884             Suppose the user wished to have C be the text for the root
885             node. Suppose further that the user wanted to use the string C< - > as the
886             delimiter within the key.
887              
888             $hashref = $self->hashify( {
889             root_str => q{All Suppliers},
890             key_delim => q{ - },
891             } );
892              
893             Then incoming path C<|Alpha|Beta|Gamma> would be keyed as:
894              
895             All Suppliers - Alpha - Beta - Gamma
896              
897             =back
898              
899             =item * Return Value
900              
901             Hash reference. The number of elements in this hash should be equal to the
902             number of non-header records in the taxonomy.
903              
904             =back
905              
906             =cut
907              
908             sub hashify {
909 33     33 1 12152 my ($self, $args) = @_;
910 33 100       58 if (defined $args) {
911 8 100 100     242 croak "Argument to 'hashify()' must be hashref"
912             unless (ref($args) and reftype($args) eq 'HASH');
913             }
914 31         33 my %hashified = ();
915 31         39 my $fields = $self->{fields};
916 31         29 my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
  170         263  
  31         48  
917 31         46 for my $rec (@{$self->{data_records}}) {
  31         66  
918 427         225 my $rowkey;
919 427 100       431 if ($args->{root_str}) {
920 39         48 $rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
921             }
922             else {
923 388 100       346 if ($args->{remove_leading_path_col_sep}) {
924 26         121 ($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
925             }
926             else {
927 362         296 $rowkey = $rec->[$self->{path_col_idx}];
928             }
929             }
930 427 100       506 if ($args->{key_delim}) {
931 52         147 $rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
932             }
933 427         257 my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
  2402         2997  
  427         399  
934 427         776 $hashified{$rowkey} = $rowdata;
935             }
936 31         80 return \%hashified;
937             }
938              
939             =head2 C
940              
941             =over 4
942              
943             =item * Purpose
944              
945             Transform a taxonomy-by-materialized-path into a taxonomy-by-adjacent-list.
946              
947             =item * Arguments
948              
949             $adjacentified = $self->adjacentify();
950              
951             $adjacentified = $self->adjacentify( { serial => 500 } );
952             $adjacentified = $self->adjacentify( { floor => 500 } ); # same as serial
953              
954             Optional single hash reference.
955              
956             For that hashref, C supports the key C, which defaults
957             to C<0>. C must be a non-negative integer and sets the "floor" above
958             which new unique IDs will be assigned to the C column. Hence, if
959             C is set to C<500>, the value assigned to the C column of the
960             first record to be processed will be C<501>.
961              
962             Starting with version .19, C will serve as an alternative way of
963             providing the same information to C. If, however, by mistake
964             you provide B C and C elements in the hash, C
965             will take precedence.
966              
967             =item * Return Value
968              
969             Reference to an array of hash references. Each element represents one node in
970             the taxonomy. Each element will have key-value pairs for C, C
971             and C which will hold the adjacentification of the materialized path in the
972             original taxonomy-by-materialized-path. Each element will, as well, have KVPs for the
973             non-materialized-path fields in the records in the original taxonomy-by-materialized-path.
974              
975             =item * Comment
976              
977             See documentation for C for example.
978              
979             Note that the order in which C will assign C and
980             C values to records in the taxonomy-by-adjacent-list will almost
981             certainly B match the order in which elements appear in a CSV file or in
982             the data structure returned by a method such as C.
983              
984             =back
985              
986             =cut
987              
988             sub adjacentify {
989 15     15 1 5750 my ($self, $args) = @_;
990 15         17 my $serial = 0;
991 15 100       28 if (defined $args) {
992 9 100 100     247 croak "Argument to 'adjacentify()' must be hashref"
993             unless (ref($args) and reftype($args) eq 'HASH');
994 7         11 for my $w ('serial', 'floor') {
995 13 100       25 if (exists $args->{$w}) {
996             croak "Element '$w' in argument to 'adjacentify()' must be integer"
997 8 100       186 unless ($args->{$w} =~ m/^\d+$/);
998             }
999             }
1000 5   50     22 $serial = $args->{serial} || $args->{floor} || 0;
1001             }
1002              
1003              
1004 11         38 my $fields = $self->fields();
1005 11         21 my $drpc = $self->data_records_path_components();
1006              
1007 11         19 my $path_col_idx = $self->path_col_idx();
1008 41         62 my %non_path_col2idx = map { $fields->[$_] => $_ }
1009 52         54 grep { $_ != $path_col_idx }
1010 11         13 (0..$#{$fields});
  11         14  
1011              
1012             my @components_by_row =
1013 11         14 map { my $f = $_->[$path_col_idx]; my $c = $#{$f}; [ @{$f}[1..$c] ] } @{$drpc};
  156         103  
  156         80  
  156         101  
  156         92  
  156         237  
  11         10  
1014 11         16 my $max_components = max( map { scalar(@{$_}) } @components_by_row);
  156         78  
  156         140  
1015 11         14 my @adjacentified = ();
1016 11         10 my %paths_to_id;
1017 11         22 for my $depth (1..$max_components) {
1018 34         54 for (my $r = 0; $r <= $#components_by_row; $r++) {
1019 498 100       290 if (scalar(@{$components_by_row[$r]}) == $depth) {
  498         880  
1020 156         190 my %rowdata = map { $_ => $drpc->[$r]->[$non_path_col2idx{$_}] }
  568         734  
1021             keys %non_path_col2idx;
1022 156         147 my @path_components = @{$drpc->[$r]->[$path_col_idx]};
  156         248  
1023 156         125 my $name = $path_components[-1];
1024 156         228 my $parent_of_name = join('|' =>
1025             @path_components[1 .. ($#path_components -1)]);
1026              
1027 156 100       243 my $candidate_for_path = (length($parent_of_name))
1028             ? join('|' => $parent_of_name, $name)
1029             : $name;
1030              
1031             my %rowhash = (
1032             id => ++$serial,
1033             parent_id => $paths_to_id{$parent_of_name}{id},
1034 156         524 name => $name,
1035             %rowdata,
1036             );
1037 156         316 $paths_to_id{$candidate_for_path}{id} = $rowhash{id};
1038 156 100       272 $paths_to_id{$candidate_for_path}{parent_path} = $parent_of_name
1039             if (length($parent_of_name));
1040 156         503 push @adjacentified, \%rowhash;
1041             }
1042             }
1043             }
1044 11         166 return \@adjacentified;
1045             }
1046              
1047             =head2 C
1048              
1049             =over 4
1050              
1051             =item * Purpose
1052              
1053             Create a CSV-formatted file holding the data returned by C.
1054              
1055             =item * Arguments
1056              
1057             $csv_file = $self->write_adjacentified_to_csv( {
1058             adjacentified => $adjacentified, # output of adjacentify()
1059             csvfile => './t/data/taxonomy_out3.csv',
1060             } );
1061              
1062             Single hash reference. That hash is keyed on:
1063              
1064             =over 4
1065              
1066             =item * C
1067              
1068             B Its value must be the arrayref of hash references returned by
1069             the C method.
1070              
1071             =item * C
1072              
1073             Optional. Path to location where a CSV-formatted text file holding the
1074             taxonomy-by-adjacent-list will be written. Defaults to a file called
1075             F in the current working directory.
1076              
1077             =item * Text::CSV_XS options
1078              
1079             You can also pass through any key-value pairs normally accepted by
1080             F.
1081              
1082             =back
1083              
1084             =item * Return Value
1085              
1086             Returns path to CSV-formatted text file just created.
1087              
1088             =item * Example
1089              
1090             Suppose we have a CSV-formatted file holding the following taxonomy-by-materialized-path:
1091              
1092             "path","is_actionable"
1093             "|Alpha","0"
1094             "|Beta","0"
1095             "|Alpha|Epsilon","0"
1096             "|Alpha|Epsilon|Kappa","1"
1097             "|Alpha|Zeta","0"
1098             "|Alpha|Zeta|Lambda","1"
1099             "|Alpha|Zeta|Mu","0"
1100             "|Beta|Eta","1"
1101             "|Beta|Theta","1"
1102              
1103             After running this file through C, C and
1104             C we will have a new CSV-formatted file holding
1105             this taxonomy-by-adjacent-list:
1106              
1107             id,parent_id,name,is_actionable
1108             1,,Alpha,0
1109             2,,Beta,0
1110             3,1,Epsilon,0
1111             4,1,Zeta,0
1112             5,2,Eta,1
1113             6,2,Theta,1
1114             7,3,Kappa,1
1115             8,4,Lambda,1
1116             9,4,Mu,0
1117              
1118             Note that the C column has been replaced by the C, C and
1119             C columns.
1120              
1121             =back
1122              
1123             =cut
1124              
1125             sub write_adjacentified_to_csv {
1126 14     14 1 6632 my ($self, $args) = @_;
1127 14 100       25 if (defined $args) {
1128 13 100 100     224 croak "Argument to 'adjacentify()' must be hashref"
1129             unless (ref($args) and reftype($args) eq 'HASH');
1130             croak "Argument to 'adjacentify()' must have 'adjacentified' element"
1131 11 100       90 unless exists $args->{adjacentified};
1132             croak "Argument 'adjacentified' must be array reference"
1133             unless (ref($args->{adjacentified}) and
1134 10 100 100     194 reftype($args->{adjacentified}) eq 'ARRAY');
1135             }
1136             else {
1137 1         74 croak "write_adjacentified_to_csv() must be supplied with hashref"
1138             }
1139 8         8 my $adjacentified = $args->{adjacentified};
1140 8         11 delete $args->{adjacentified};
1141              
1142 8         17 my $columns_in = $self->fields;
1143             my @non_path_columns_in =
1144 26         34 map { $columns_in->[$_] }
1145 34         40 grep { $_ != $self->{path_col_idx} }
1146 8         11 (0..$#{$columns_in});
  8         12  
1147 8         12 my @columns_out = (qw| id parent_id name |);
1148 8         10 push @columns_out, @non_path_columns_in;
1149              
1150 8         11283 my $cwd = cwd();
1151             my $csvfile = defined($args->{csvfile})
1152             ? $args->{csvfile}
1153 8 100       49 : "$cwd/taxonomy_out.csv";
1154 8         23 delete $args->{csvfile};
1155              
1156             # By this point, we should have processed all args other than those
1157             # intended for Text::CSV_XS and assigned their contents to variables as
1158             # needed.
1159              
1160 8         33 my $csv_args = { binary => 1 };
1161 8         10 while (my ($k,$v) = each %{$args}) {
  11         45  
1162 3         11 $csv_args->{$k} = $v;
1163             }
1164 8         69 my $csv = Text::CSV_XS->new($csv_args);
1165 8 50   1   1456 open my $OUT, ">:encoding(utf8)", $csvfile
  1         7  
  1         1  
  1         8  
1166             or croak "Unable to open $csvfile for writing";
1167 8 100       8616 $csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
1168 8         255 $csv->print($OUT, [@columns_out]);
1169 8         62 for my $rec (@{$adjacentified}) {
  8         21  
1170             $csv->print(
1171             $OUT,
1172 117         532 [ map { $rec->{$columns_out[$_]} } (0..$#columns_out) ]
  724         1035  
1173             );
1174             }
1175 8 50       339 close $OUT or croak "Unable to close $csvfile after writing";
1176              
1177 8         105 return $csvfile;
1178             }
1179              
1180             1;
1181              
1182             # vim: formatoptions=crqot