File Coverage

lib/Parse/File/Taxonomy.pm
Criterion Covered Total %
statement 172 172 100.0
branch 58 62 93.5
condition 11 11 100.0
subroutine 17 17 100.0
pod 12 13 92.3
total 270 275 98.1


line stmt bran cond sub pod time code
1             package Parse::File::Taxonomy;
2 4     4   12497 use strict;
  4         9  
  4         171  
3 4     4   24 use Carp;
  4         7  
  4         336  
4 4     4   3000 use Text::CSV;
  4         98487  
  4         28  
5 4     4   200 use Scalar::Util qw( reftype );
  4         7  
  4         15617  
6             our $VERSION = '0.02';
7             #use Data::Dump;
8              
9             =head1 NAME
10              
11             Parse::File::Taxonomy - Validate a file for use as a taxonomy
12              
13             =head1 SYNOPSIS
14              
15             use Parse::File::Taxonomy;
16              
17             $source = "./t/data/alpha.csv";
18             $obj = Parse::File::Taxonomy->new( {
19             file => $source,
20             } );
21              
22             $hashified_taxonomy = $obj->hashify_taxonomy();
23              
24             =head1 DESCRIPTION
25              
26             This module takes as input a plain-text file, verifies that it can be used as
27             a taxonomy, then creates a Perl data structure representing that taxonomy.
28              
29             B
30              
31             =head2 Taxonomy: definition
32              
33             For the purpose of this module, a B is defined as a tree-like data
34             structure with a root node, zero or more branch (child) nodes, and one or more
35             leaf nodes. The root node and each branch node must have at least one child
36             node, but leaf nodes have no child nodes. The number of branches
37             between a leaf node and the root node is variable.
38              
39             B
40              
41             Root
42             |
43             ----------------------------------------------------
44             | | | |
45             Branch Branch Branch Leaf
46             | | |
47             ------------------------- ------------ |
48             | | | | |
49             Branch Branch Leaf Leaf Branch
50             | | |
51             | ------------ |
52             | | | |
53             Leaf Leaf Leaf Leaf
54              
55             =head2 Taxonomy File: definition
56              
57             For the purpose of this module, a B is (a) a CSV file in which
58             one column holds data on the position of each record within the taxonomy and
59             (b) in which each node in the tree other than the root node is uniquely
60             represented by a record within the file.
61              
62             =head3 CSV
63              
64             B<"CSV">, strictly speaking, refers to B:
65              
66             path,nationality,gender,age,income,id_no
67              
68             For the purpose of this module, however, column separators in a taxonomy file
69             may be any user-specified character handled by the F library.
70             Formats frequently observed are B:
71              
72             path nationality gender age income id_no
73              
74             and B:
75              
76             path|nationality|gender|age|income|id_no
77              
78             The documentation for F comments that the CSV format could
79             perhaps better [be] called ASV (anything separated values)">, but we shall for
80             convenience use "CSV" herein regardless of the specific delimiter.
81              
82             Since it is often the case that the characters used as column separators may
83             occur within the data recorded in the columns as well, it is customary to
84             quote either all columns:
85              
86             "path","nationality","gender","age","income","id_no"
87              
88             or, at the very least, all columns which can hold
89             data other than pure integers or floating-point numbers:
90              
91             "path","nationality","gender",age,income,id_no
92              
93             =head3 Tree structure
94              
95             To qualify as a taxonomy file, it is not sufficient for a file to be in CSV
96             format. In each non-header record in that file, one column must hold data
97             capable of exactly specifying the record's position in the taxonomy, I
98             the route or B from the root node to the node being represented by that
99             record. That data must itself be in delimiter-separated format. Each
100             non-root node in the taxonomy must have exactly one record holding its path
101             data. Within that path column the value corresponding to the root node need
102             not be specified, I may be represented by an empty string.
103              
104             Let's rewrite Diagram 1 with values to make this clear.
105              
106             B
107              
108             ""
109             |
110             ----------------------------------------------------
111             | | | |
112             Alpha Beta Gamma Delta
113             | | |
114             ------------------------- ------------ |
115             | | | | |
116             Epsilon Zeta Eta Theta Iota
117             | | |
118             | ------------ |
119             | | | |
120             Kappa Lambda Mu Nu
121              
122             Let us suppose that our taxonomy file held comma-separated, quoted records.
123             Let us further supposed that the column holding taxonomy paths was, not
124             surprisingly, called C and that the separator within the C column
125             was a pipe (C<|>) character. Let us further suppose that for now we are not
126             concerned with the data in any columns other than C so that, for purpose
127             of illustration, they will hold empty (albeit quoted) string.
128              
129             Then the taxonomy file describing the tree in
130             Diagram 2 would look like this:
131              
132             "path","nationality","gender","age","income","id_no"
133             "|Alpha","","","","",""
134             "|Alpha|Epsilon","","","","",""
135             "|Alpha|Epsilon|Kappa","","","","",""
136             "|Alpha|Zeta","","","","",""
137             "|Alpha|Zeta|Lambda","","","","",""
138             "|Alpha|Zeta|Mu","","","","",""
139             "|Beta","","","","",""
140             "|Beta|Eta","","","","",""
141             "|Beta|Theta","","","","",""
142             "|Gamma","","","","",""
143             "|Gamma|Iota","","","","",""
144             "|Gamma|Iota|Nu","","","","",""
145             "|Delta","","","","",""
146              
147             Note that while in the C<|Gamma> branch we ultimately have only one leaf node,
148             C<|Gamma|Iota|Nu>, we require separate records in the taxonomy file for
149             C<|Gamma> and C<|Gamma|Iota>. To put this another way, the existence of a
150             C leaf must not be taken to "auto-vivify" C<|Gamma> and
151             C<|Gamma|Iota> nodes. Each non-root node must be explicitly represented in
152             the taxonomy file for the file to be considered valid.
153              
154             Note further that there is no restriction on the values of the B of
155             the C across records. It only the B path that must be unique.
156             Let us illustrate that by modifying the data in Diagram 2:
157              
158             B
159              
160             ""
161             |
162             ----------------------------------------------------
163             | | | |
164             Alpha Beta Gamma Delta
165             | | |
166             ------------------------- ------------ |
167             | | | | |
168             Epsilon Zeta Eta Theta Iota
169             | | |
170             | ------------ |
171             | | | |
172             Kappa Lambda Mu Delta
173              
174             Here we have two leaf nodes each named C. However, we follow different
175             paths from the root node to get to each of them. The taxonomy file
176             representing this tree would look like this:
177              
178             "path","nationality","gender","age","income","id_no"
179             "|Alpha","","","","",""
180             "|Alpha|Epsilon","","","","",""
181             "|Alpha|Epsilon|Kappa","","","","",""
182             "|Alpha|Zeta","","","","",""
183             "|Alpha|Zeta|Lambda","","","","",""
184             "|Alpha|Zeta|Mu","","","","",""
185             "|Beta","","","","",""
186             "|Beta|Eta","","","","",""
187             "|Beta|Theta","","","","",""
188             "|Gamma","","","","",""
189             "|Gamma|Iota","","","","",""
190             "|Gamma|Iota|Delta","","","","",""
191             "|Delta","","","","",""
192              
193             =head2 Taxonomy Validation
194              
195             The C constructor, C, will probe a taxonomy file
196             provided to it as an argument to determine whether it can be considered a
197             valid taxonomy according to the description provided above.
198              
199             TODO: Cnew() should also be able to accept a
200             reference to an array of CSV records already held in memory.
201              
202             TODO: What would it mean for Cnew() to accept a
203             filehandle as an argument, rather than a file? Would that be difficult to
204             implement?
205              
206             TODO: The user of this library, however, must be permitted to write
207             B which will be applied to a
208             taxonomy by means of a C method called on a
209             Parse::File::Taxonomy object. Should the file fail to meet those rules, the
210             user may choose not to proceed further even though the taxonomy meets the
211             basic validation criteria implemented in the constructor. This method will
212             take a reference to an array of subroutines references as its argument. Each
213             such code reference will be a user-defined rule which the taxonomy must obey.
214             The method will apply each code reference to the taxonomy in sequence and will
215             return with a true value if and only if all the individual criteria return
216             true as well.
217              
218             =head1 METHODS
219              
220             =head2 C
221              
222             =over 4
223              
224             =item * Purpose
225              
226             Parse::File::Taxonomy constructor.
227              
228             =item * Arguments
229              
230             $source = "./t/data/alpha.csv";
231             $obj = Parse::File::Taxonomy->new( {
232             file => $source,
233             } );
234              
235             Single hash reference. Elements in that hash are keyed on:
236              
237             =over 4
238              
239             =item * C
240              
241             Absolute or relative path to the incoming taxonomy file. Currently
242             B (but this may change if we implement ability to use a list of CSV
243             strings instead of a file).
244              
245             =item * C
246              
247             If the column to be used as the "path" column in the incoming taxonomy file is
248             B the first column, this option must be set to the integer representing
249             the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
250              
251             =item * C
252              
253             If the string used to distinguish components of the path in the path column in
254             the incoming taxonomy file is not a pipe (C<|>), this option must be set.
255             Optional; defaults to C<|>.
256              
257             =item * Text::CSV options
258              
259             Any other options which could normally be passed to Cnew()> will
260             be passed through to that module's constructor. On the recommendation of the
261             Text::CSV documentation, C is always set to a true value.
262              
263             =back
264              
265             =item * Return Value
266              
267             Parse::File::Taxonomy object.
268              
269             =item * Comment
270              
271             C will throw an exception under any of the following conditions:
272              
273             =over 4
274              
275             =item * Argument to C is not a reference.
276              
277             =item * Argument to C is not a hash reference.
278              
279             =item * Unable to locate the file which is the value of the C element.
280              
281             =item * Argument to C element is not an integer.
282              
283             =item * Argument to C is greater than the index number of the
284             last element in the header row of the incoming taxonomy file, I the
285             C is wrong.
286              
287             =item * The same field is found more than once in the header row of the
288             incoming taxonomy file.
289              
290             =item * Unable to open or close the incoming taxonomy file for reading.
291              
292             =item * In the column designated as the "path" column, the same value is
293             observed more than once.
294              
295             =item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
296              
297             =item * A data row has a number of fields different from the number of fields
298             in the header row.
299              
300             =back
301              
302             =back
303              
304             =cut
305              
306             sub new {
307 17     17 1 16912 my ($class, $args) = @_;
308 17         38 my %data;
309              
310 17 100 100     762 croak "Argument to 'new()' must be hashref"
311             unless (ref($args) and reftype($args) eq 'HASH');
312 15 100       242 croak "Argument to 'new()' must have 'file' element" unless $args->{file};
313 14 100       550 croak "Cannot locate file '$args->{file}'"
314             unless (-f $args->{file});
315 13         61 $data{file} = delete $args->{file};
316              
317 13 100       59 if (exists $args->{path_col_idx}) {
318 2 100       168 croak "Argument to 'path_col_idx' must be integer"
319             unless $args->{path_col_idx} =~ m/^\d+$/;
320             }
321 12   100     89 $data{path_col_idx} = delete $args->{path_col_idx} || 0;
322 12 100       62 $data{path_col_sep} = exists $args->{path_col_sep}
323             ? $args->{path_col_sep}
324             : '|';
325 12 100       38 if (exists $args->{path_col_sep}) {
326 2         6 $data{path_col_sep} = $args->{path_col_sep};
327 2         6 delete $args->{path_col_sep};
328             }
329             else {
330 10         28 $data{path_col_sep} = '|';
331             }
332              
333             # We've now handled all the Parse::File::Taxonomy-specific options.
334             # Any remaining options are assumed to be intended for Text::CSV::new().
335              
336 12         32 $args->{binary} = 1;
337 12 50       100 my $csv = Text::CSV->new ( $args )
338             or croak "Cannot use CSV: ".Text::CSV->error_diag ();
339 12 50       1938 open my $IN, "<", $data{file}
340             or croak "Unable to open '$data{file}' for reading";
341 12         70 my $header_ref = $csv->getline($IN);
342              
343 12         302 croak "Argument to 'path_col_idx' exceeds index of last field in header row in '$data{file}'"
344 12 100       42134 if $data{path_col_idx} > $#{$header_ref};
345              
346 11         19 my %header_fields_seen;
347 11         19 for (@{$header_ref}) {
  11         396  
348 65 100       109 if (exists $header_fields_seen{$_}) {
349 1         123 croak "Duplicate field '$_' observed in '$data{file}'";
350             }
351             else {
352 64         230 $header_fields_seen{$_}++;
353             }
354             }
355 10         190 $data{fields} = $header_ref;
356 10         22 my $field_count = scalar(@{$data{fields}});
  10         56  
357 10         40 $data{path_col} = $data{fields}->[$data{path_col_idx}];
358              
359 10         131 my $data_records = $csv->getline_all($IN);
360 10 50       47546 close $IN or croak "Unable to close after reading";
361              
362              
363             # Confirm no duplicate entries in column holding path:
364             # Confirm all rows have same number of columns as header:
365 10         33 my @bad_count_records = ();
366 10         30 my %paths_seen = ();
367 10         17 for my $rec (@{$data_records}) {
  10         31  
368 131         250 $paths_seen{$rec->[$data{path_col_idx}]}++;
369 131         111 my $this_row_count = scalar(@{$rec});
  131         138  
370 131 100       245 if ($this_row_count != $field_count) {
371 2         9 push @bad_count_records,
372             [ $rec->[$data{path_col_idx}], $this_row_count ];
373             }
374             }
375 10         28 my @dupe_paths = ();
376 10         117 for my $path (sort keys %paths_seen) {
377 128 100       222 push @dupe_paths, $path if $paths_seen{$path} > 1;
378             }
379 10         36 my $error_msg = <
380             No duplicate entries are permitted in column designated as path.
381             The following entries appear the number of times shown:
382             ERROR_MSG_DUPE
383 10         26 for my $path (@dupe_paths) {
384 2         10 $error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
385             }
386 10 100       246 croak $error_msg if @dupe_paths;
387              
388 9         39 $error_msg = <
389             Header row had $field_count records. The following records had different counts:
390             ERROR_MSG_WRONG_COUNT
391 9         20 for my $rec (@bad_count_records) {
392 2         10 $error_msg .= " $rec->[0]: $rec->[1]\n";
393             }
394 9 100       334 croak $error_msg if @bad_count_records;
395              
396             # Confirm each node appears in taxonomy:
397 8         16 my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
  8         42  
  8         30  
398 8         30 $path_args->{sep_char} = $data{path_col_sep};
399 8 50       70 my $path_csv = Text::CSV->new ( $path_args )
400             or croak "Cannot use CSV: ".Text::CSV->error_diag ();
401 8         872 my %missing_parents = ();
402 8         62 for my $path (sort keys %paths_seen) {
403 102         241 my $status = $path_csv->parse($path);
404 102         15620 my @columns = $path_csv->fields();
405 102 100       811 if (@columns > 2) {
406 70         262 my $parent =
407             join($path_args->{sep_char} => @columns[0 .. ($#columns - 1)]);
408 70 100       267 unless (exists $paths_seen{$parent}) {
409 2         6 $missing_parents{$path} = $parent;
410             }
411             }
412             }
413 8         30 $error_msg = <
414             Each node in the taxonomy must have a parent.
415             The following nodes lack the expected parent:
416             ERROR_MSG_ORPHAN
417 8         45 for my $path (sort keys %missing_parents) {
418 2         5 $error_msg .= " $path: $missing_parents{$path}\n";
419             }
420 8 100       211 croak $error_msg if scalar(keys %missing_parents);
421 7         23 $data{data_records} = $data_records;
422              
423              
424 7         13 while (my ($k,$v) = each %{$args}) {
  14         85  
425 7         22 $data{$k} = $v;
426             }
427 7         435 return bless \%data, $class;
428             }
429              
430             =head2 C
431              
432             =over 4
433              
434             =item * Purpose
435              
436             Identify the names of the columns in the taxonomy.
437              
438             =item * Arguments
439              
440             my $fields = $self->fields();
441              
442             No arguments; the information is already inside the object.
443              
444             =item * Return Value
445              
446             Reference to an array holding a list of the columns as they appear in the
447             header row of the incoming taxonomy file.
448              
449             =item * Comment
450              
451             Read-only.
452              
453             =back
454              
455             =cut
456              
457             sub fields {
458 4     4 1 2737 my $self = shift;
459 4         21 return $self->{fields};
460             }
461              
462             =head2 C
463              
464             =over 4
465              
466             =item * Purpose
467              
468             Identify the index position (count starts at 0) of the column in the incoming
469             taxonomy file which serves as the path column.
470              
471             =item * Arguments
472              
473             my $path_col_idx = $self->path_col_idx;
474              
475             No arguments; the information is already inside the object.
476              
477             =item * Return Value
478              
479             Integer in the range from 0 to 1 less than the number of columns in the header
480             row.
481              
482             =item * Comment
483              
484             Read-only.
485              
486             =back
487              
488             =cut
489              
490             sub path_col_idx {
491 2     2 1 3042 my $self = shift;
492 2         10 return $self->{path_col_idx};
493             }
494              
495             =head2 C
496              
497             =over 4
498              
499             =item * Purpose
500              
501             Identify the name of the column in the incoming taxonomy which serves as the
502             path column.
503              
504             =item * Arguments
505              
506             my $path_col = $self->path_col;
507              
508             No arguments; the information is already inside the object.
509              
510             =item * Return Value
511              
512             String.
513              
514             =item * Comment
515              
516             Read-only.
517              
518             =back
519              
520             =cut
521              
522             sub path_col {
523 2     2 1 1347 my $self = shift;
524 2         12 return $self->{path_col};
525             }
526              
527             =head2 C
528              
529             =over 4
530              
531             =item * Purpose
532              
533             Identify the string used to separate path components once the taxonomy has
534             been created. This is just a "getter" and is logically distinct from the
535             option to C which is, in effect, a "setter."
536              
537             =item * Arguments
538              
539             my $path_col_sep = $self->path_col_sep;
540              
541             No arguments; the information is already inside the object.
542              
543             =item * Return Value
544              
545             String.
546              
547             =item * Comment
548              
549             Read-only.
550              
551             =back
552              
553             =cut
554              
555             sub path_col_sep {
556 2     2 1 1235 my $self = shift;
557 2         9 return $self->{path_col_sep};
558             }
559              
560             =head2 C
561              
562             =over 4
563              
564             =item * Purpose
565              
566             Once the taxonomy has been validated, get a list of its data rows as a Perl
567             data structure.
568              
569             =item * Arguments
570              
571             $data_records = $self->data_records;
572              
573             None.
574              
575             =item * Return Value
576              
577             Reference to array of array references. The array will hold the data records
578             found in the incoming taxonomy file in their order in that file.
579              
580             =item * Comment
581              
582             Does not contain any information about the fields in the taxonomy, so you
583             should probably either (a) use in conjunction with C method above;
584             or (b) use C.
585              
586             =back
587              
588             =cut
589              
590             sub data_records {
591 2     2 1 603 my $self = shift;
592 2         9 return $self->{data_records};
593             }
594              
595             =head2 C
596              
597             =over 4
598              
599             =item * Purpose
600              
601             Once the taxonomy has been validated, get a list of its header and data rows as a Perl
602             data structure.
603              
604             =item * Arguments
605              
606             $data_records = $self->fields_and_data_records;
607              
608             None.
609              
610             =item * Return Value
611              
612             Reference to array of array references. The first element in the array will
613             hold the header row (same as output of C). The remaining elements
614             will hold the data records found in the incoming taxonomy file in their order
615             in that file.
616              
617             =back
618              
619             =cut
620              
621             sub fields_and_data_records {
622 1     1 1 1452 my $self = shift;
623 1         5 my @all_rows = $self->fields;
624 1         3 for my $row (@{$self->data_records}) {
  1         4  
625 13         18 push @all_rows, $row;
626             }
627 1         4 return \@all_rows;
628             }
629              
630             =head2 C
631              
632             =over 4
633              
634             =item * Purpose
635              
636             Once the taxonomy has been validated, get a list of its data rows as a Perl
637             data structure. In each element of this list, the path is now represented as
638             an array reference rather than a string.
639              
640             =item * Arguments
641              
642             $data_records_path_components = $self->data_records_path_components;
643              
644             None.
645              
646             =item * Return Value
647              
648             Reference to array of array references. The array will hold the data records
649             found in the incoming taxonomy file in their order in that file.
650              
651             =item * Comment
652              
653             Does not contain any information about the fields in the taxonomy, so you
654             should probably either (a) use in conjunction with C method above;
655             or (b) use C.
656              
657             =back
658              
659             =cut
660              
661             sub data_records_path_components {
662 1     1 1 1585 my $self = shift;
663 1         3 my @all_rows = ();
664 1         2 for my $row (@{$self->{data_records}}) {
  1         4  
665 13         18 my $path_col = $row->[$self->{path_col_idx}];
666 13         55 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
667 13         15 my @rewritten = ();
668 13         16 for (my $i = 0; $i <= $#{$row}; $i++) {
  91         155  
669 78 100       109 if ($i != $self->{path_col_idx}) {
670 65         115 push @rewritten, $row->[$i];
671             }
672             else {
673 13         24 push @rewritten, \@path_components;
674             }
675             }
676 13         35 push @all_rows, \@rewritten;
677             }
678 1         5 return \@all_rows;
679             }
680              
681             =head2 C
682              
683             =over 4
684              
685             =item * Purpose
686              
687             Once the taxonomy has been validated, get a list of its data rows as a Perl
688             data structure. The first element in this list is an array reference holding
689             the header row. In each data element of this list, the path is now represented as
690             an array reference rather than a string.
691              
692             =item * Arguments
693              
694             $fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
695              
696             None.
697              
698             =item * Return Value
699              
700             Reference to array of array references. The array will hold the data records
701             found in the incoming taxonomy file in their order in that file.
702              
703             =back
704              
705             =cut
706              
707             sub fields_and_data_records_path_components {
708 1     1 1 1447 my $self = shift;
709 1         4 my @all_rows = $self->fields;
710 1         2 for my $row (@{$self->{data_records}}) {
  1         3  
711 13         18 my $path_col = $row->[$self->{path_col_idx}];
712 13         47 my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
713 13         17 my @rewritten = ();
714 13         15 for (my $i = 0; $i <= $#{$row}; $i++) {
  91         160  
715 78 100       110 if ($i != $self->{path_col_idx}) {
716 65         107 push @rewritten, $row->[$i];
717             }
718             else {
719 13         22 push @rewritten, \@path_components;
720             }
721             }
722 13         46 push @all_rows, \@rewritten;
723             }
724 1         6 return \@all_rows;
725             }
726              
727             =head2 C
728              
729             =over 4
730              
731             =item * Purpose
732              
733             Display the number of descendant (multi-generational) nodes each node in the
734             taxonomy has.
735              
736             =item * Arguments
737              
738             $child_counts = $self->child_counts();
739              
740             None.
741              
742             =item * Return Value
743              
744             Reference to hash in which each element is keyed on the value of the path
745             column in the incoming taxonomy file.
746              
747             =back
748              
749             =cut
750              
751             sub child_counts {
752 4     4 1 2334 my $self = shift;
753 4         7 my %child_counts = map { $_->[$self->{path_col_idx}] => 0 } @{$self->{data_records}};
  52         111  
  4         21  
754 4         22 for my $node (keys %child_counts) {
755 52         145 for my $other_node ( grep { ! m/^\Q$node\E$/ } keys %child_counts) {
  676         2116  
756 624 100       2438 $child_counts{$node}++
757             if $other_node =~ m/^\Q$node$self->{path_col_sep}\E/;
758             }
759             }
760 4         20 return \%child_counts;
761             }
762              
763             =head2 C
764              
765             =over 4
766              
767             =item * Purpose
768              
769             Get the total number of descendant nodes for one specific node in a validated
770             taxonomy.
771              
772             =item * Arguments
773              
774             $child_count = $self->get_child_count('|Path|To|Node');
775              
776             String containing node's path as spelled in the taxonomy.
777              
778             =item * Return Value
779              
780             Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
781             leaf node.
782              
783             =item * Comment
784              
785             Will throw an exception if the node does not exist or is misspelled.
786              
787             =back
788              
789             =cut
790              
791             sub get_child_count {
792 3     3 1 2162 my ($self, $node) = @_;
793 3         9 my $child_counts = $self->child_counts();
794 3 100       201 croak "Node '$node' not found" unless exists $child_counts->{$node};
795 2         12 return $child_counts->{$node};
796             }
797              
798             =head2 C
799              
800             =over 4
801              
802             =item * Purpose
803              
804             Turn a validated taxonomy into a Perl hash keyed on the column designated as
805             the path column.
806              
807             =item * Arguments
808              
809             $hashref = $self->hashify_taxonomy();
810              
811             Takes an optional hashref holding a list of any of the following elements:
812              
813             =over 4
814              
815             =item * C
816              
817             Boolean, defaulting to C<0>. By default, C will spell the
818             key of the hash exactly as the value of the path column is spelled in the
819             taxonomy -- which in turn is the way it was spelled in the incoming file.
820             That is, a path in the taxonomy spelled C<|Alpha|Beta|Gamma> will be spelled
821             as a key in exactly the same way.
822              
823             However, since in many cases (including the example above) the root node of the taxonomy will be empty, the
824             user may wish to remove the first instance of C. The user would
825             do so by setting C to a true value.
826              
827             $hashref = $self->hashify_taxonomy( {
828             remove_leading_path_col_sep => 1,
829             } );
830              
831             In that case they key would now be spelled: C.
832              
833             Note further that if the C switch is set to a true value, any
834             setting to C will be ignored.
835              
836             =item * C
837              
838             A string which will be used in composing the key of the hashref returned by
839             this method. The user may select this key if she does not want to use the
840             value found in the incoming CSV file (which by default will be the pipe
841             character (C<|>) and which may be overridden with the C argument
842             to C.
843              
844             $hashref = $self->hashify_taxonomy( {
845             key_delim => q{ - },
846             } );
847              
848             In the above variant, a path that in the incoming taxonomy file was
849             represented by C<|Alpha|Beta|Gamma> will in C<$hashref> be represented by
850             C< - Alpha - Beta - Gamma>.
851              
852             =item * C
853              
854             A string which will be used in composing the key of the hashref returned by
855             this method. The user will set this switch if she wishes to have the root
856             note explicitly represented. Using this switch will automatically cause
857             C to be ignored.
858              
859             Suppose the user wished to have C be the text for the root
860             node. Suppose further that the user wanted to use the string C< - > as the
861             delimiter within the key.
862              
863             $hashref = $self->hashify_taxonomy( {
864             root_str => q{All Suppliers},
865             key_delim => q{ - },
866             } );
867              
868             Then incoming path C<|Alpha|Beta|Gamma> would be keyed as:
869              
870             All Suppliers - Alpha - Beta - Gamma
871              
872             =back
873              
874             =item * Return Value
875              
876             Hash reference. The number of elements in this hash should be equal to the
877             number of non-header records in the taxonomy.
878              
879             =back
880              
881             =cut
882              
883             sub hashify_taxonomy {
884 8     8 1 19623 my ($self, $args) = @_;
885 8 100       40 if (defined $args) {
886 7 100 100     380 croak "Argument to 'new()' must be hashref"
887             unless (ref($args) and reftype($args) eq 'HASH');
888             }
889 6         12 my %hashified = ();
890 6         21 my $fields = $self->{fields};
891 6         15 my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
  36         117  
  6         17  
892 6         17 for my $rec (@{$self->{data_records}}) {
  6         20  
893 78         56 my $rowkey;
894 78 100       108 if ($args->{root_str}) {
895 26         44 $rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
896             }
897             else {
898 52 100       65 if ($args->{remove_leading_path_col_sep}) {
899 26         180 ($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
900             }
901             else {
902 26         36 $rowkey = $rec->[$self->{path_col_idx}];
903             }
904             }
905 78 100       138 if ($args->{key_delim}) {
906 39         158 $rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
907             }
908 78         69 my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
  468         867  
  78         113  
909 78         211 $hashified{$rowkey} = $rowdata;
910             }
911 6         32 return \%hashified;
912             }
913              
914             sub local_validate {
915 4     4 0 3587 my ($self, $args) = @_;
916              
917 4 100 100     351 croak "Argument to local_validate() must be an array ref"
918             unless defined $args and ref($args) eq 'ARRAY';
919 2         3 foreach my $rule (@{$args}) {
  2         6  
920 4 100       154 croak "Each element in arrayref of arguments to local_validate() must be a code ref"
921             unless ref($rule) eq 'CODE';
922             }
923             # TODO: implementation; documentation
924              
925 1         6 return 1;
926             }
927              
928             1;
929              
930             # vim: formatoptions=crqot