File Coverage

lib/Parse/Taxonomy/MaterializedPath.pm
Criterion Covered Total %
statement 326 326 100.0
branch 134 140 95.7
condition 39 40 97.5
subroutine 24 24 100.0
pod 11 11 100.0
total 534 541 98.7


line stmt bran cond sub pod time code
1             package Parse::Taxonomy::MaterializedPath;
2 6     6   15488 use strict;
  6         10  
  6         212  
3 6     6   3210 use parent qw( Parse::Taxonomy );
  6         1443  
  6         31  
4 6     6   246 use Carp;
  6         11  
  6         332  
5 6     6   6180 use Text::CSV_XS;
  6         58172  
  6         394  
6 6     6   55 use Scalar::Util qw( reftype );
  6         10  
  6         328  
7 6     6   29 use List::Util qw( max );
  6         72  
  6         630  
8 6     6   29 use Cwd;
  6         9  
  6         504  
9             our $VERSION = '0.22';
10 6         23072 use Parse::Taxonomy::Auxiliary qw(
11             path_check_fields
12             components_check_fields
13 6     6   2059 );
  6         14  
14              
15             =head1 NAME
16              
17             Parse::Taxonomy::MaterializedPath - Validate a file for use as a path-based taxonomy
18              
19             =head1 SYNOPSIS
20              
21             use Parse::Taxonomy::MaterializedPath;
22              
23             # 'file' interface: reads a CSV file for you
24              
25             $source = "./t/data/alpha.csv";
26             $self = Parse::Taxonomy::MaterializedPath->new( {
27             file => $source,
28             } );
29              
30             # 'components' interface: as if you've already read a
31             # CSV file and now have Perl array references to header and data rows
32              
33             $self = Parse::Taxonomy::MaterializedPath->new( {
34             components => {
35             fields => $fields,
36             data_records => $data_records,
37             }
38             } );
39              
40             =head1 METHODS
41              
42             =head2 C
43              
44             =over 4
45              
46             =item * Purpose
47              
48             Parse::Taxonomy::MaterializedPath constructor.
49              
50             =item * Arguments
51              
52             Single hash reference. There are two possible interfaces: C and C.
53              
54             =over 4
55              
56             =item 1 C interface
57              
58             $source = "./t/data/alpha.csv";
59             $self = Parse::Taxonomy::MaterializedPath->new( {
60             file => $source,
61             path_col_idx => 0,
62             path_col_sep => '|',
63             %TextCSVoptions,
64             } );
65              
66             Elements in the hash reference are keyed on:
67              
68             =over 4
69              
70             =item * C
71              
72             Absolute or relative path to the incoming taxonomy file.
73             B for this interface.
74              
75             =item * C
76              
77             If the column to be used as the "path" column in the incoming taxonomy file is
78             B the first column, this option must be set to the integer representing
79             the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
80              
81             =item * C
82              
83             If the string used to distinguish components of the path in the path column in
84             the incoming taxonomy file is not a pipe (C<|>), this option must be set.
85             Optional; defaults to C<|>.
86              
87             =item * Text::CSV_XS options
88              
89             Any other options which could normally be passed to Cnew()> will
90             be passed through to that module's constructor. On the recommendation of the
91             Text::CSV documentation, C is always set to a true value.
92              
93             =back
94              
95             =item 2 C interface
96              
97             $self = Parse::Taxonomy::MaterializedPath->new( {
98             components => {
99             fields => $fields,
100             data_records => $data_records,
101             }
102             } );
103              
104             Elements in this hash are keyed on:
105              
106             =over 4
107              
108             =item * C
109              
110             This element is B for the
111             C interface. The value of this element is a hash reference with two keys, C and
112             C. C is a reference to an array holding the field or
113             column names for the data set. C is a reference to an array of
114             array references, each of the latter arrayrefs holding one record or row from
115             the data set.
116              
117             =item * C
118              
119             Same as in C interface above.
120              
121             =item * C
122              
123             Same as in C interface above.
124              
125             =back
126              
127             =back
128              
129             =item * Return Value
130              
131             Parse::Taxonomy::MaterializedPath object.
132              
133             =item * Comment
134              
135             C will throw an exception under any of the following conditions:
136              
137             =over 4
138              
139             =item * Argument to C is not a reference.
140              
141             =item * Argument to C is not a hash reference.
142              
143             =item * In the C interface, unable to locate the file which is the value of the C element.
144              
145             =item * Argument to C element is not an integer.
146              
147             =item * Argument to C is greater than the index number of the
148             last element in the header row of the incoming taxonomy file, I the
149             C is wrong.
150              
151             =item * The same field is found more than once in the header row of the
152             incoming taxonomy file.
153              
154             =item * Unable to open or close the incoming taxonomy file for reading.
155              
156             =item * In the column designated as the "path" column, the same value is
157             observed more than once.
158              
159             =item * C, C, C, C and C are reserved terms.
160             One or more columns is named with a reserved term.
161              
162             =item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
163              
164             =item * A data row has a number of fields different from the number of fields
165             in the header row.
166              
167             =back
168              
169             =back
170              
171             =cut
172              
173             sub new {
174 62     62 1 61297 my ($class, $args) = @_;
175 62         110 my $data;
176              
177 62 100 100     941 croak "Argument to 'new()' must be hashref"
178             unless (ref($args) and reftype($args) eq 'HASH');
179 60         91 my $argscount = 0;
180 60 100       182 $argscount++ if $args->{file};
181 60 100       154 $argscount++ if $args->{components};
182 60 100       254 croak "Argument to 'new()' must have either 'file' or 'components' element"
183             if ($argscount == 0);
184 59 100       329 croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
185             if ($argscount == 2);
186              
187 58 100       152 if (exists $args->{path_col_idx}) {
188             croak "Argument to 'path_col_idx' must be integer"
189 4 100       143 unless $args->{path_col_idx} =~ m/^\d+$/;
190             }
191 57   100     330 $data->{path_col_idx} = delete $args->{path_col_idx} || 0;
192             $data->{path_col_sep} = exists $args->{path_col_sep}
193             ? $args->{path_col_sep}
194 57 100       233 : '|';
195 57 100       134 if (exists $args->{path_col_sep}) {
196 5         14 $data->{path_col_sep} = $args->{path_col_sep};
197 5         11 delete $args->{path_col_sep};
198             }
199             else {
200 52         102 $data->{path_col_sep} = '|';
201             }
202              
203 57 100       129 if ($args->{components}) {
204             croak "Value of 'components' element must be hashref"
205 30 100 100     478 unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
206 28         53 for my $k ( qw| fields data_records | ) {
207             croak "Value of 'components' element must have '$k' key-value pair"
208 53 100       393 unless exists $args->{components}->{$k};
209             croak "Value of '$k' element must be arrayref"
210             unless (ref($args->{components}->{$k}) and
211 51 100 100     710 reftype($args->{components}->{$k}) eq 'ARRAY');
212             }
213 23         39 for my $row (@{$args->{components}->{data_records}}) {
  23         57  
214 292 100 100     1677 croak "Each element in 'data_records' array must be arrayref"
215             unless (ref($row) and reftype($row) eq 'ARRAY');
216             }
217             # We don't want to stick $args->{components} into the object as is.
218             # Rather, we want to insert 'fields' and 'data_records' for
219             # consistency with the 'file' interface. But to do that we first need
220             # to impose the same validations that we do for the 'file' interface.
221             # We also need to populate 'path_col'.
222 21         69 _prepare_fields($data, $args->{components}->{fields}, 1);
223 18         36 my $these_data_records = $args->{components}->{data_records};
224 18         35 delete $args->{components};
225 18         56 _prepare_data_records($data, $these_data_records, $args);
226             }
227             else {
228             croak "Cannot locate file '$args->{file}'"
229 27 100       722 unless (-f $args->{file});
230 26         65 $data->{file} = delete $args->{file};
231              
232             # We've now handled all the Parse::Taxonomy::MaterializedPath-specific options.
233             # Any remaining options are assumed to be intended for Text::CSV_XS::new().
234              
235 26         53 $args->{binary} = 1;
236 26 50       139 my $csv = Text::CSV_XS->new ( $args )
237             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
238             open my $IN, "<", $data->{file}
239 26 50       3339 or croak "Unable to open '$data->{file}' for reading";
240 4     4   2677 my $header_ref = $csv->getline($IN);
  4         20950  
  4         146  
  26         1221  
241              
242 26         1299 _prepare_fields($data, $header_ref);
243 23         591 my $data_records = $csv->getline_all($IN);
244 23 50       9275 close $IN or croak "Unable to close after reading";
245 23         78 _prepare_data_records($data, $data_records, $args);
246             }
247              
248 32         56 while (my ($k,$v) = each %{$args}) {
  50         179  
249 18         43 $data->{$k} = $v;
250             }
251 32         62 my %row_analysis = ();
252 32         46 for my $el (@{$data->{data_records}}) {
  32         74  
253 425         636 my $rowkey = $el->[$data->{path_col_idx}];
254 425         1338 $row_analysis{$rowkey} = split(/\Q$data->{path_col_sep}\E/, $rowkey);
255             }
256 32         97 $data->{row_analysis} = \%row_analysis;
257 32         161 return bless $data, $class;
258             }
259              
260             sub _prepare_fields {
261 47     47   90 my ($data, $fields_ref, $components) = @_;
262 47 100       120 if (! $components) {
263 26         61 _check_path_col_idx($data, $fields_ref, 0);
264 25         88 path_check_fields($data, $fields_ref);
265             }
266             else {
267 21         60 _check_path_col_idx($data, $fields_ref, 1);
268 20         68 components_check_fields($data, $fields_ref);
269             }
270              
271 43         93 my %fields_seen = map { $_ => 1 } @{$fields_ref};
  229         508  
  43         86  
272 43         133 my @bad_fields = ();
273 43         102 for my $reserved ( qw| id parent_id name lft rgh | ) {
274 215 100       525 push @bad_fields, $reserved if $fields_seen{$reserved};
275             }
276 43         121 my $msg = "Bad column names: <@bad_fields>. These are reserved for ";
277 43         67 $msg .= "Parse::Taxonomy's internal use; please rename";
278 43 100       379 croak $msg if @bad_fields;
279              
280 41         100 $data->{fields} = $fields_ref;
281 41         119 $data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
282 41         109 return $data;
283             }
284              
285             sub _check_path_col_idx {
286 47     47   71 my ($data, $fields_ref, $components) = @_;
287 47         122 my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
288 47 100       145 $error_msg .= $components
289             ? "'fields' array ref"
290             : "header row in '$data->{file}'";
291              
292 47 100       70 croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
  47         487  
293             }
294              
295             sub _prepare_data_records {
296             # Confirm each row's path starts with path_col_sep:
297             # Confirm no duplicate entries in column holding path:
298             # Confirm all rows have same number of columns as header:
299 41     41   70 my ($data, $data_records, $args) = @_;
300 41         57 my $error_msg;
301 41         68 my @bad_path_cols = ();
302 41         59 my @bad_count_records = ();
303 41         78 my %paths_seen = ();
304 41         53 my $field_count = scalar(@{$data->{fields}});
  41         97  
305 41         53 for my $rec (@{$data_records}) {
  41         141  
306 537 100       2406 unless ($rec->[$data->{path_col_idx}] =~ m/^\Q$data->{path_col_sep}\E/) {
307 6         13 push @bad_path_cols, $rec->[$data->{path_col_idx}];
308             }
309 537         1223 $paths_seen{$rec->[$data->{path_col_idx}]}++;
310 537         569 my $this_row_count = scalar(@{$rec});
  537         791  
311 537 100       1423 if ($this_row_count != $field_count) {
312             push @bad_count_records,
313 5         18 [ $rec->[$data->{path_col_idx}], $this_row_count ];
314             }
315             }
316 41         88 $error_msg = <
317             The value of the column designated as path must start with the path column separator.
318             Rows with the following paths fail to do so:
319             IMPROPER_PATH
320 41         117 for my $path (@bad_path_cols) {
321 6         17 $error_msg .= " $path\n";
322             }
323 41 100       395 croak $error_msg if @bad_path_cols;
324              
325 39         72 my @dupe_paths = ();
326 39         344 for my $path (sort keys %paths_seen) {
327 504 100       1025 push @dupe_paths, $path if $paths_seen{$path} > 1;
328             }
329 39         107 $error_msg = <
330             No duplicate entries are permitted in column designated as path.
331             The following entries appear the number of times shown:
332             ERROR_MSG_DUPE
333 39         79 for my $path (@dupe_paths) {
334 5         26 $error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
335             }
336 39 100       609 croak $error_msg if @dupe_paths;
337              
338 36         92 $error_msg = <
339             Header row has $field_count records. The following records had different counts:
340             ERROR_MSG_WRONG_COUNT
341 36         64 for my $rec (@bad_count_records) {
342 4         14 $error_msg .= " $rec->[0]: $rec->[1]\n";
343             }
344 36 100       346 croak $error_msg if @bad_count_records;
345              
346             # Confirm each node appears in taxonomy:
347 34         53 my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
  19         71  
  34         86  
348 34         90 $path_args->{sep} = $data->{path_col_sep};
349 34 50       180 my $path_csv = Text::CSV_XS->new ( $path_args )
350             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
351 34         3921 my %missing_parents = ();
352 34         246 for my $path (sort keys %paths_seen) {
353 447         1234 my $status = $path_csv->parse($path);
354 447         7193 my @columns = $path_csv->fields();
355 447 100       3423 if (@columns > 2) {
356             my $parent =
357 324         873 join($path_args->{sep} => @columns[0 .. ($#columns - 1)]);
358 324 100       1209 unless (exists $paths_seen{$parent}) {
359 4         15 $missing_parents{$path} = $parent;
360             }
361             }
362             }
363 34         108 $error_msg = <
364             Each node in the taxonomy must have a parent.
365             The following nodes lack the expected parent:
366             ERROR_MSG_ORPHAN
367 34         82 for my $path (sort keys %missing_parents) {
368 4         13 $error_msg .= " $path: $missing_parents{$path}\n";
369             }
370 34 100       361 croak $error_msg if scalar(keys %missing_parents);
371             # BBB end of validations
372 32         63 $data->{data_records} = $data_records;
373              
374 32         423 return $data;
375             }
376              
377             =head2 C
378              
379             =over 4
380              
381             =item * Purpose
382              
383             Identify the names of the columns in the taxonomy.
384              
385             =item * Arguments
386              
387             my $fields = $self->fields();
388              
389             No arguments; the information is already inside the object.
390              
391             =item * Return Value
392              
393             Reference to an array holding a list of the columns as they appear in the
394             header row of the incoming taxonomy file.
395              
396             =item * Comment
397              
398             Read-only.
399              
400             =back
401              
402             =head2 C
403              
404             =over 4
405              
406             =item * Purpose
407              
408             Identify the index position (count starts at 0) of the column in the incoming
409             taxonomy file which serves as the path column.
410              
411             =item * Arguments
412              
413             my $path_col_idx = $self->path_col_idx;
414              
415             No arguments; the information is already inside the object.
416              
417             =item * Return Value
418              
419             Integer in the range from 0 to 1 less than the number of columns in the header
420             row.
421              
422             =item * Comment
423              
424             Read-only.
425              
426             =back
427              
428             =cut
429              
430             sub path_col_idx {
431 15     15 1 3952 my $self = shift;
432 15         33 return $self->{path_col_idx};
433             }
434              
435             =head2 C
436              
437             =over 4
438              
439             =item * Purpose
440              
441             Identify the name of the column in the incoming taxonomy which serves as the
442             path column.
443              
444             =item * Arguments
445              
446             my $path_col = $self->path_col;
447              
448             No arguments; the information is already inside the object.
449              
450             =item * Return Value
451              
452             String.
453              
454             =item * Comment
455              
456             Read-only.
457              
458             =back
459              
460             =cut
461              
462             sub path_col {
463 4     4 1 1522 my $self = shift;
464 4         10 return $self->{path_col};
465             }
466              
467             =head2 C
468              
469             =over 4
470              
471             =item * Purpose
472              
473             Identify the string used to separate path components once the taxonomy has
474             been created. This is just a "getter" and is logically distinct from the
475             option to C which is, in effect, a "setter."
476              
477             =item * Arguments
478              
479             my $path_col_sep = $self->path_col_sep;
480              
481             No arguments; the information is already inside the object.
482              
483             =item * Return Value
484              
485             String.
486              
487             =item * Comment
488              
489             Read-only.
490              
491             =back
492              
493             =cut
494              
495             sub path_col_sep {
496 4     4 1 1505 my $self = shift;
497 4         10 return $self->{path_col_sep};
498             }
499              
500             =head2 C
501              
502             =over 4
503              
504             =item * Purpose
505              
506             Once the taxonomy has been validated, get a list of its data rows as a Perl
507             data structure.
508              
509             =item * Arguments
510              
511             $data_records = $self->data_records;
512              
513             None.
514              
515             =item * Return Value
516              
517             Reference to array of array references. The array will hold the data records
518             found in the incoming taxonomy file in their order in that file.
519              
520             =item * Comment
521              
522             Does not contain any information about the fields in the taxonomy, so you
523             should probably either (a) use in conjunction with C method above;
524             or (b) use C.
525              
526             =back
527              
528             =head2 C
529              
530             =over 4
531              
532             =item * Purpose
533              
534             Once the taxonomy has been validated, get a list of its header and data rows as a Perl
535             data structure.
536              
537             =item * Arguments
538              
539             $data_records = $self->fields_and_data_records;
540              
541             None.
542              
543             =item * Return Value
544              
545             Reference to array of array references. The first element in the array will
546             hold the header row (same as output of C). The remaining elements
547             will hold the data records found in the incoming taxonomy file in their order
548             in that file.
549              
550             =back
551              
552             =cut
553              
554             =head2 C
555              
556             =over 4
557              
558             =item * Purpose
559              
560             Once the taxonomy has been validated, get a list of its data rows as a Perl
561             data structure. In each element of this list, the path is now represented as
562             an array reference rather than a string.
563              
564             =item * Arguments
565              
566             $data_records_path_components = $self->data_records_path_components;
567              
568             None.
569              
570             =item * Return Value
571              
572             Reference to array of array references. The array will hold the data records
573             found in the incoming taxonomy file in their order in that file.
574              
575             =item * Comment
576              
577             Does not contain any information about the fields in the taxonomy, so you may
578             wish to use this method either (a) use in conjunction with C method
579             above; or (b) use C.
580              
581             =back
582              
583             =cut
584              
585             sub data_records_path_components {
586 13     13 1 2109 my $self = shift;
587 13         24 my @all_rows = ();
588 13         16 for my $row (@{$self->{data_records}}) {
  13         51  
589 182         298 my $path_col = $row->[$self->{path_col_idx}];
590 182         646 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
591 182         267 my @rewritten = ();
592 182         251 for (my $i = 0; $i <= $#{$row}; $i++) {
  1062         2289  
593 880 100       1526 if ($i != $self->{path_col_idx}) {
594 698         1325 push @rewritten, $row->[$i];
595             }
596             else {
597 182         381 push @rewritten, \@path_components;
598             }
599             }
600 182         414 push @all_rows, \@rewritten;
601             }
602 13         61 return \@all_rows;
603             }
604              
605             =head2 C
606              
607             =over 4
608              
609             =item * Purpose
610              
611             Once the taxonomy has been validated, get a list of its data rows as a Perl
612             data structure. The first element in this list is an array reference holding
613             the header row. In each data element of this list, the path is now represented as
614             an array reference rather than a string.
615              
616             =item * Arguments
617              
618             $fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
619              
620             None.
621              
622             =item * Return Value
623              
624             Reference to array of array references. The array will hold the data records
625             found in the incoming taxonomy file in their order in that file.
626              
627             =back
628              
629             =cut
630              
631             sub fields_and_data_records_path_components {
632 6     6 1 4230 my $self = shift;
633 6         21 my @all_rows = $self->fields;
634 6         10 for my $row (@{$self->{data_records}}) {
  6         18  
635 78         127 my $path_col = $row->[$self->{path_col_idx}];
636 78         239 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
637 78         102 my @rewritten = ();
638 78         107 for (my $i = 0; $i <= $#{$row}; $i++) {
  546         1095  
639 468 100       815 if ($i != $self->{path_col_idx}) {
640 390         721 push @rewritten, $row->[$i];
641             }
642             else {
643 78         148 push @rewritten, \@path_components;
644             }
645             }
646 78         187 push @all_rows, \@rewritten;
647             }
648 6         17 return \@all_rows;
649             }
650              
651             =head2 C
652              
653             =over 4
654              
655             =item * Purpose
656              
657             Identify the index position of a given field within the header row.
658              
659             =item * Arguments
660              
661             $index = $self->get_field_position('income');
662              
663             Takes a single string holding the name of one of the fields (column names).
664              
665             =item * Return Value
666              
667             Integer representing the index position (counting from C<0>) of the field
668             provided as argument. Throws exception if the argument is not actually a
669             field.
670              
671             =back
672              
673             =cut
674              
675             =head2 C
676              
677             =over 4
678              
679             =item * Purpose
680              
681             Display the number of descendant (multi-generational) nodes each node in the
682             taxonomy has.
683              
684             =item * Arguments
685              
686             $descendant_counts = $self->descendant_counts();
687              
688             $descendant_counts = $self->descendant_counts( { generations => 1 } );
689              
690             None required; one optional hash reference. Currently, the only element
691             honored in that hashref is C, whose value must be a non-negative
692             integer. If, instead of getting the count of all descendants of a node, you
693             only want the count of its first generation, i.e., its immediate children, you
694             provide a value of C<1>. Want the count of only the first and second
695             generations? Provide a value of C<2> -- and so on.
696              
697             =item * Return Value
698              
699             Reference to hash in which each element is keyed on the value of the path
700             column in the incoming taxonomy file.
701              
702             =back
703              
704             =cut
705              
706             sub descendant_counts {
707 9     9 1 5146 my ($self, $args) = @_;
708 9 100       25 if (defined $args) {
709 5 100 100     337 croak "Argument to 'descendant_counts()' must be hashref"
710             unless (ref($args) and reftype($args) eq 'HASH');
711             croak "Value for 'generations' element passed to descendant_counts() must be integer > 0"
712 3 100 100     211 unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
713             }
714 5         8 my %descendant_counts = ();
715 5         15 my $hashified = $self->hashify();
716 5         7 for my $p (keys %{$hashified}) {
  5         23  
717 59         101 $descendant_counts{$p} = 0;
718 59         77 for my $q (
719 707         1496 grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
720 59         175 keys %{$hashified}
721             ) {
722 230 100       1170 if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
723 61 100       117 if (! $args->{generations}) {
724 48         102 $descendant_counts{$p}++;
725             }
726             else {
727 13         47 my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
728 13         45 my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
729             $descendant_counts{$p}++
730 13 100       59 if (scalar(@d) - scalar(@c) <= $args->{generations});
731             }
732             }
733             }
734             }
735 5         16 $self->{descendant_counts} = \%descendant_counts;
736 5         55 return $self->{descendant_counts};
737             }
738              
739             =head2 C
740              
741             =over 4
742              
743             =item * Purpose
744              
745             Get the total number of descendant nodes for one specific node in a validated
746             taxonomy.
747              
748             =item * Arguments
749              
750             $descendant_count = $self->get_descendant_count('|Path|To|Node');
751              
752             $descendant_counts = $self->get_descendant_count('|Path|To|Node', { generations => 1 } );
753              
754             One required: string containing node's path as spelled in the taxonomy.
755              
756             One optional hash reference. Currently, the only element honored in that
757             hashref is C, whose value must be a non-negative integer. If,
758             instead of getting the count of all descendants of a node, you only want the
759             count of its first generation, i.e., its immediate children, you provide a
760             value of C<1>. Want the count of only first and second generations? Provide
761             a value of C<2> -- and so on.
762              
763             =item * Return Value
764              
765             Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
766             leaf node.
767              
768             =item * Comment
769              
770             Will throw an exception if the node does not exist or is misspelled.
771              
772             If C is called with no second (hashref) argument
773             following an invocation of C, it will return a value from
774             an internal cache created during that earlier method call. Otherwise, it will
775             re-create the cache from scratch. (This, of course, assumes that you have not
776             manipulated the object's internal data subsequent to its creation.)
777              
778             =back
779              
780             =cut
781              
782             sub get_descendant_count {
783 26     26 1 11235 my ($self, $node, $args) = @_;
784 26 100       100 if (defined $args) {
785 15 100 100     286 croak "Second argument to 'get_descendant_count()' must be hashref"
786             unless (ref($args) and reftype($args) eq 'HASH');
787             croak "Value for 'generations' element passed to second argument to get_descendant_count() must be integer > 0"
788 13 100 100     332 unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
789             }
790 22 100       48 if (exists $self->{descendant_counts}) {
791 6         10 my $descendant_counts = $self->{descendant_counts};
792 6 100       286 croak "Node '$node' not found" unless exists $descendant_counts->{$node};
793 4         10 return $descendant_counts->{$node};
794             }
795             else {
796 16         25 my %descendant_counts = ();
797 16         31 my $hashified = $self->hashify();
798 16 100       165 croak "Node '$node' not found" unless exists $hashified->{$node};
799 15         24 for my $p ($node) {
800 15         24 $descendant_counts{$p} = 0;
801 15         17 for my $q (
802 231         529 grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
803 15         62 keys %{$hashified}
804             ) {
805 110 100       586 if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
806 36 100       77 if (! $args->{generations}) {
807 10         24 $descendant_counts{$p}++;
808             }
809             else {
810 26         86 my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
811 26         97 my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
812             $descendant_counts{$p}++
813 26 100       117 if (scalar(@d) - scalar(@c) <= $args->{generations});
814             }
815             }
816             }
817             }
818 15         236 return $descendant_counts{$node};
819             }
820             }
821              
822             =head2 C
823              
824             =over 4
825              
826             =item * Purpose
827              
828             Turn a validated taxonomy into a Perl hash keyed on the column designated as
829             the path column.
830              
831             =item * Arguments
832              
833             $hashref = $self->hashify();
834              
835             Takes an optional hashref holding a list of any of the following elements:
836              
837             =over 4
838              
839             =item * C
840              
841             Boolean, defaulting to C<0>. By default, C will spell the
842             key of the hash exactly as the value of the path column is spelled in the
843             taxonomy -- which in turn is the way it was spelled in the incoming file.
844             That is, a path in the taxonomy spelled C<|Alpha|Beta|Gamma> will be spelled
845             as a key in exactly the same way.
846              
847             However, since in many cases (including the example above) the root node of
848             the taxonomy will be empty, the user may wish to remove the first instance of
849             C. The user would do so by setting
850             C to a true value.
851              
852             $hashref = $self->hashify( {
853             remove_leading_path_col_sep => 1,
854             } );
855              
856             In that case they key would now be spelled: C.
857              
858             Note further that if the C switch is set to a true value, any
859             setting to C will be ignored.
860              
861             =item * C
862              
863             A string which will be used in composing the key of the hashref returned by
864             this method. The user may select this key if she does not want to use the
865             value found in the incoming CSV file (which by default will be the pipe
866             character (C<|>) and which may be overridden with the C argument
867             to C.
868              
869             $hashref = $self->hashify( {
870             key_delim => q{ - },
871             } );
872              
873             In the above variant, a path that in the incoming taxonomy file was
874             represented by C<|Alpha|Beta|Gamma> will in C<$hashref> be represented by
875             C< - Alpha - Beta - Gamma>.
876              
877             =item * C
878              
879             A string which will be used in composing the key of the hashref returned by
880             this method. The user will set this switch if she wishes to have the root
881             note explicitly represented. Using this switch will automatically cause
882             C to be ignored.
883              
884             Suppose the user wished to have C be the text for the root
885             node. Suppose further that the user wanted to use the string C< - > as the
886             delimiter within the key.
887              
888             $hashref = $self->hashify( {
889             root_str => q{All Suppliers},
890             key_delim => q{ - },
891             } );
892              
893             Then incoming path C<|Alpha|Beta|Gamma> would be keyed as:
894              
895             All Suppliers - Alpha - Beta - Gamma
896              
897             =back
898              
899             =item * Return Value
900              
901             Hash reference. The number of elements in this hash should be equal to the
902             number of non-header records in the taxonomy.
903              
904             =back
905              
906             =cut
907              
908             sub hashify {
909 33     33 1 20102 my ($self, $args) = @_;
910 33 100       101 if (defined $args) {
911 8 100 100     306 croak "Argument to 'hashify()' must be hashref"
912             unless (ref($args) and reftype($args) eq 'HASH');
913             }
914 31         48 my %hashified = ();
915 31         58 my $fields = $self->{fields};
916 31         45 my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
  170         455  
  31         72  
917 31         77 for my $rec (@{$self->{data_records}}) {
  31         75  
918 427         581 my $rowkey;
919 427 100       757 if ($args->{root_str}) {
920 39         83 $rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
921             }
922             else {
923 388 100       614 if ($args->{remove_leading_path_col_sep}) {
924 26         175 ($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
925             }
926             else {
927 362         676 $rowkey = $rec->[$self->{path_col_idx}];
928             }
929             }
930 427 100       885 if ($args->{key_delim}) {
931 52         229 $rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
932             }
933 427         492 my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
  2402         6173  
  427         766  
934 427         1605 $hashified{$rowkey} = $rowdata;
935             }
936 31         127 return \%hashified;
937             }
938              
939             =head2 C
940              
941             =over 4
942              
943             =item * Purpose
944              
945             Transform a taxonomy-by-materialized-path into a taxonomy-by-adjacent-list.
946              
947             =item * Arguments
948              
949             $adjacentified = $self->adjacentify();
950              
951             $adjacentified = $self->adjacentify( { serial => 500 } );
952             $adjacentified = $self->adjacentify( { floor => 500 } ); # same as serial
953              
954             Optional single hash reference.
955              
956             For that hashref, C supports the key C, which defaults
957             to C<0>. C must be a non-negative integer and sets the "floor" above
958             which new unique IDs will be assigned to the C column. Hence, if
959             C is set to C<500>, the value assigned to the C column of the
960             first record to be processed will be C<501>.
961              
962             Starting with version .19, C will serve as an alternative way of
963             providing the same information to C. If, however, by mistake
964             you provide B C and C elements in the hash, C
965             will take precedence.
966              
967             =item * Return Value
968              
969             Reference to an array of hash references. Each element represents one node in
970             the taxonomy. Each element will have key-value pairs for C, C
971             and C which will hold the adjacentification of the materialized path in the
972             original taxonomy-by-materialized-path. Each element will, as well, have KVPs for the
973             non-materialized-path fields in the records in the original taxonomy-by-materialized-path.
974              
975             =item * Comment
976              
977             See documentation for C for example.
978              
979             Note that the order in which C will assign C and
980             C values to records in the taxonomy-by-adjacent-list will almost
981             certainly B match the order in which elements appear in a CSV file or in
982             the data structure returned by a method such as C.
983              
984             =back
985              
986             =cut
987              
988             sub adjacentify {
989 15     15 1 9045 my ($self, $args) = @_;
990 15         26 my $serial = 0;
991 15 100       38 if (defined $args) {
992 9 100 100     353 croak "Argument to 'adjacentify()' must be hashref"
993             unless (ref($args) and reftype($args) eq 'HASH');
994 7         17 for my $w ('serial', 'floor') {
995 13 100       33 if (exists $args->{$w}) {
996             croak "Element '$w' in argument to 'adjacentify()' must be integer"
997 8 100       260 unless ($args->{$w} =~ m/^\d+$/);
998             }
999             }
1000 5   50     33 $serial = $args->{serial} || $args->{floor} || 0;
1001             }
1002              
1003              
1004 11         58 my $fields = $self->fields();
1005 11         34 my $drpc = $self->data_records_path_components();
1006              
1007 11         31 my $path_col_idx = $self->path_col_idx();
1008 41         96 my %non_path_col2idx = map { $fields->[$_] => $_ }
1009 52         102 grep { $_ != $path_col_idx }
1010 11         20 (0..$#{$fields});
  11         20  
1011              
1012             my @components_by_row =
1013 11         25 map { my $f = $_->[$path_col_idx]; my $c = $#{$f}; [ @{$f}[1..$c] ] } @{$drpc};
  156         199  
  156         164  
  156         228  
  156         206  
  156         436  
  11         22  
1014 11         24 my $max_components = max( map { scalar(@{$_}) } @components_by_row);
  156         165  
  156         257  
1015 11         16 my @adjacentified = ();
1016 11         19 my %paths_to_id;
1017 11         35 for my $depth (1..$max_components) {
1018 34         80 for (my $r = 0; $r <= $#components_by_row; $r++) {
1019 498 100       487 if (scalar(@{$components_by_row[$r]}) == $depth) {
  498         1496  
1020 156         299 my %rowdata = map { $_ => $drpc->[$r]->[$non_path_col2idx{$_}] }
  568         1341  
1021             keys %non_path_col2idx;
1022 156         276 my @path_components = @{$drpc->[$r]->[$path_col_idx]};
  156         396  
1023 156         232 my $name = $path_components[-1];
1024 156         331 my $parent_of_name = join('|' =>
1025             @path_components[1 .. ($#path_components -1)]);
1026              
1027 156 100       348 my $candidate_for_path = (length($parent_of_name))
1028             ? join('|' => $parent_of_name, $name)
1029             : $name;
1030              
1031             my %rowhash = (
1032             id => ++$serial,
1033             parent_id => $paths_to_id{$parent_of_name}{id},
1034 156         885 name => $name,
1035             %rowdata,
1036             );
1037 156         486 $paths_to_id{$candidate_for_path}{id} = $rowhash{id};
1038 156 100       408 $paths_to_id{$candidate_for_path}{parent_path} = $parent_of_name
1039             if (length($parent_of_name));
1040 156         719 push @adjacentified, \%rowhash;
1041             }
1042             }
1043             }
1044 11         285 return \@adjacentified;
1045             }
1046              
1047             =head2 C
1048              
1049             =over 4
1050              
1051             =item * Purpose
1052              
1053             Create a CSV-formatted file holding the data returned by C.
1054              
1055             =item * Arguments
1056              
1057             $csv_file = $self->write_adjacentified_to_csv( {
1058             adjacentified => $adjacentified, # output of adjacentify()
1059             csvfile => './t/data/taxonomy_out3.csv',
1060             } );
1061              
1062             Single hash reference. That hash is keyed on:
1063              
1064             =over 4
1065              
1066             =item * C
1067              
1068             B Its value must be the arrayref of hash references returned by
1069             the C method.
1070              
1071             =item * C
1072              
1073             Optional. Path to location where a CSV-formatted text file holding the
1074             taxonomy-by-adjacent-list will be written. Defaults to a file called
1075             F in the current working directory.
1076              
1077             =item * Text::CSV_XS options
1078              
1079             You can also pass through any key-value pairs normally accepted by
1080             F.
1081              
1082             =back
1083              
1084             =item * Return Value
1085              
1086             Returns path to CSV-formatted text file just created.
1087              
1088             =item * Example
1089              
1090             Suppose we have a CSV-formatted file holding the following taxonomy-by-materialized-path:
1091              
1092             "path","is_actionable"
1093             "|Alpha","0"
1094             "|Beta","0"
1095             "|Alpha|Epsilon","0"
1096             "|Alpha|Epsilon|Kappa","1"
1097             "|Alpha|Zeta","0"
1098             "|Alpha|Zeta|Lambda","1"
1099             "|Alpha|Zeta|Mu","0"
1100             "|Beta|Eta","1"
1101             "|Beta|Theta","1"
1102              
1103             After running this file through C, C and
1104             C we will have a new CSV-formatted file holding
1105             this taxonomy-by-adjacent-list:
1106              
1107             id,parent_id,name,is_actionable
1108             1,,Alpha,0
1109             2,,Beta,0
1110             3,1,Epsilon,0
1111             4,1,Zeta,0
1112             5,2,Eta,1
1113             6,2,Theta,1
1114             7,3,Kappa,1
1115             8,4,Lambda,1
1116             9,4,Mu,0
1117              
1118             Note that the C column has been replaced by the C, C and
1119             C columns.
1120              
1121             =back
1122              
1123             =cut
1124              
1125             sub write_adjacentified_to_csv {
1126 14     14 1 12827 my ($self, $args) = @_;
1127 14 100       38 if (defined $args) {
1128 13 100 100     278 croak "Argument to 'adjacentify()' must be hashref"
1129             unless (ref($args) and reftype($args) eq 'HASH');
1130             croak "Argument to 'adjacentify()' must have 'adjacentified' element"
1131 11 100       127 unless exists $args->{adjacentified};
1132             croak "Argument 'adjacentified' must be array reference"
1133             unless (ref($args->{adjacentified}) and
1134 10 100 100     262 reftype($args->{adjacentified}) eq 'ARRAY');
1135             }
1136             else {
1137 1         107 croak "write_adjacentified_to_csv() must be supplied with hashref"
1138             }
1139 8         16 my $adjacentified = $args->{adjacentified};
1140 8         14 delete $args->{adjacentified};
1141              
1142 8         28 my $columns_in = $self->fields;
1143             my @non_path_columns_in =
1144 26         57 map { $columns_in->[$_] }
1145 34         72 grep { $_ != $self->{path_col_idx} }
1146 8         14 (0..$#{$columns_in});
  8         17  
1147 8         22 my @columns_out = (qw| id parent_id name |);
1148 8         14 push @columns_out, @non_path_columns_in;
1149              
1150 8         42222 my $cwd = cwd();
1151             my $csvfile = defined($args->{csvfile})
1152             ? $args->{csvfile}
1153 8 100       89 : "$cwd/taxonomy_out.csv";
1154 8         66 delete $args->{csvfile};
1155              
1156             # By this point, we should have processed all args other than those
1157             # intended for Text::CSV_XS and assigned their contents to variables as
1158             # needed.
1159              
1160 8         67 my $csv_args = { binary => 1 };
1161 8         21 while (my ($k,$v) = each %{$args}) {
  11         70  
1162 3         17 $csv_args->{$k} = $v;
1163             }
1164 8         153 my $csv = Text::CSV_XS->new($csv_args);
1165 8 50   1   2840 open my $OUT, ">:encoding(utf8)", $csvfile
  1         14  
  1         8  
  1         42  
1166             or croak "Unable to open $csvfile for writing";
1167 8 100       13519 $csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
1168 8         413 $csv->print($OUT, [@columns_out]);
1169 8         83 for my $rec (@{$adjacentified}) {
  8         24  
1170             $csv->print(
1171             $OUT,
1172 117         819 [ map { $rec->{$columns_out[$_]} } (0..$#columns_out) ]
  724         1920  
1173             );
1174             }
1175 8 50       594 close $OUT or croak "Unable to close $csvfile after writing";
1176              
1177 8         148 return $csvfile;
1178             }
1179              
1180             1;
1181              
1182             # vim: formatoptions=crqot