File Coverage

lib/Parse/Taxonomy/AdjacentList.pm
Criterion Covered Total %
statement 258 258 100.0
branch 103 108 95.3
condition 36 36 100.0
subroutine 21 21 100.0
pod 6 9 66.6
total 424 432 98.1


line stmt bran cond sub pod time code
1             package Parse::Taxonomy::AdjacentList;
2 4     4   9549 use strict;
  4         7  
  4         107  
3 4     4   1902 use parent qw( Parse::Taxonomy );
  4         860  
  4         23  
4 4     4   175 use Carp;
  4         8  
  4         208  
5 4     4   3565 use Text::CSV_XS;
  4         30887  
  4         217  
6 4     4   31 use Scalar::Util qw( reftype );
  4         8  
  4         187  
7 4     4   19 use Cwd;
  4         33  
  4         310  
8             our $VERSION = '0.22';
9 4         11877 use Parse::Taxonomy::Auxiliary qw(
10             path_check_fields
11             components_check_fields
12 4     4   1320 );
  4         10  
13              
14             =head1 NAME
15              
16             Parse::Taxonomy::AdjacentList - Extract a taxonomy from a hierarchy inside a CSV file
17              
18             =head1 SYNOPSIS
19              
20             use Parse::Taxonomy::AdjacentList;
21              
22             $source = "./t/data/alpha.csv";
23             $self = Parse::Taxonomy::AdjacentList->new( {
24             file => $source,
25             } );
26              
27             =cut
28              
29             =head1 METHODS
30              
31             =head2 C
32              
33             =over 4
34              
35             =item * Purpose
36              
37             Parse::Taxonomy::AdjacentList constructor.
38              
39             =item * Arguments
40              
41             Single hash reference. There are two possible interfaces: C and C.
42              
43             =over 4
44              
45             =item 1 C interface
46              
47             $source = "./t/data/delta.csv";
48             $self = Parse::Taxonomy::AdjacentList->new( {
49             file => $source,
50             } );
51              
52             Elements in the hash reference are keyed on:
53              
54             =over 4
55              
56             =item * C
57              
58             Absolute or relative path to the incoming taxonomy file.
59             B for this interface.
60              
61             =item * C
62              
63             The name of the column in the header row under which each data record's unique
64             ID can be found. Defaults to C.
65              
66             =item * C
67              
68             The name of the column in the header row under which each data record's parent
69             ID can be found. (Will be empty in the case of top-level nodes, as they have
70             no parent.) Defaults to C.
71              
72             =item * C
73              
74             The name of the column in the header row under which, in each data record, there
75             is a found a string which differentiates that record from all other records
76             with the same parent ID. Defaults to C.
77              
78             =item * Text::CSV_XS options
79              
80             Any other options which could normally be passed to Cnew()> will
81             be passed through to that module's constructor. On the recommendation of the
82             Text::CSV documentation, C is always set to a true value.
83              
84             =back
85              
86             =item 2 C interface
87              
88             $self = Parse::Taxonomy::AdjacentList->new( {
89             components => {
90             fields => $fields,
91             data_records => $data_records,
92             }
93             } );
94              
95             Elements in this hash are keyed on:
96              
97             =over 4
98              
99             =item * C
100              
101             This element is B for the C interface. The value of this
102             element is a hash reference with two keys, C and C.
103             C is a reference to an array holding the field or column names for the
104             data set. C is a reference to an array of array references,
105             each of the latter arrayrefs holding one record or row from the data set.
106              
107             =back
108              
109             =back
110              
111             =item * Return Value
112              
113             Parse::Taxonomy::AdjacentList object.
114              
115             =item * Exceptions
116              
117             C will throw an exception under any of the following conditions:
118              
119             =over 4
120              
121             =item * Argument to C is not a reference.
122              
123             =item * Argument to C is not a hash reference.
124              
125             =item * Argument to C must have either 'file' or 'components' element but not both.
126              
127             =item * Lack columns in header row to match requirements.
128              
129             =item * Non-numeric entry in C or C column.
130              
131             =item * Duplicate entries in C column.
132              
133             =item * Number of fields in a data record does not match number in header row.
134              
135             =item * Empty string in a C column of a record.
136              
137             =item * Unable to locate a record whose C is the C of a different record.
138              
139             =item * No records with same C may share value of C column.
140              
141             =item * C interface
142              
143             =over 4
144              
145             =item * In the C interface, unable to locate the file which is the value of the C element.
146              
147             =item * The same field is found more than once in the header row of the
148             incoming taxonomy file.
149              
150             =item * Unable to open or close the incoming taxonomy file for reading.
151              
152             =back
153              
154             =item * C interface
155              
156             =over 4
157              
158             =item * In the C interface, C element must be a hash reference with C and C elements.
159              
160             =item * C element must be array reference.
161              
162             =item * C element must be reference to array of array references.
163              
164             =item * No duplicate fields in C element's array reference.
165              
166             =back
167              
168             =back
169              
170             =back
171              
172             =cut
173              
174             sub new {
175 51     51 1 53107 my ($class, $args) = @_;
176 51         71 my $data;
177              
178 51 100 100     793 croak "Argument to 'new()' must be hashref"
179             unless (ref($args) and reftype($args) eq 'HASH');
180 49         69 my $argscount = 0;
181 49 100       139 $argscount++ if $args->{file};
182 49 100       133 $argscount++ if $args->{components};
183 49 100       223 croak "Argument to 'new()' must have either 'file' or 'components' element"
184             if ($argscount == 0);
185 48 100       221 croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
186             if ($argscount == 2);
187              
188             $data->{id_col} = $args->{id_col}
189             ? delete $args->{id_col}
190 47 100       170 : 'id';
191             $data->{parent_id_col} = $args->{parent_id_col}
192             ? delete $args->{parent_id_col}
193 47 100       131 : 'parent_id';
194             $data->{leaf_col} = $args->{leaf_col}
195             ? delete $args->{leaf_col}
196 47 100       108 : 'name';
197              
198 47 100       97 if ($args->{components}) {
199             croak "Value of 'components' element must be hashref"
200 22 100 100     367 unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
201 20         34 for my $k ( qw| fields data_records | ) {
202             croak "Value of 'components' element must have '$k' key-value pair"
203 37 100       320 unless exists $args->{components}->{$k};
204             croak "Value of '$k' element must be arrayref"
205             unless (ref($args->{components}->{$k}) and
206 35 100 100     542 reftype($args->{components}->{$k}) eq 'ARRAY');
207             }
208 15         21 for my $row (@{$args->{components}->{data_records}}) {
  15         36  
209 173 100 100     993 croak "Each element in 'data_records' array must be arrayref"
210             unless (ref($row) and reftype($row) eq 'ARRAY');
211             }
212 13         37 _prepare_fields($data, $args->{components}->{fields}, 1);
213 9         16 my $these_data_records = $args->{components}->{data_records};
214 9         15 delete $args->{components};
215 9         21 _prepare_data_records($data, $these_data_records, $args);
216             }
217             else {
218             croak "Cannot locate file '$args->{file}'"
219 25 100       680 unless (-f $args->{file});
220 24         103 $data->{file} = delete $args->{file};
221 24         49 $args->{binary} = 1;
222 24 50       137 my $csv = Text::CSV_XS->new ( $args )
223             or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
224             open my $IN, "<", $data->{file}
225 24 50       2914 or croak "Unable to open '$data->{file}' for reading";
226 24     4   1133 my $header_ref = $csv->getline($IN);
  4         2621  
  4         20925  
  4         124  
227 24         1283 _prepare_fields($data, $header_ref);
228              
229 20         513 my $data_records = $csv->getline_all($IN);
230 20 50       7880 close $IN or croak "Unable to close after reading";
231 20         63 _prepare_data_records($data, $data_records, $args);
232             }
233              
234 18         37 while (my ($k,$v) = each %{$args}) {
  31         109  
235 13         28 $data->{$k} = $v;
236             }
237 18         91 return bless $data, $class;
238             }
239              
240             sub _prepare_fields {
241 37     37   71 my ($data, $fields_ref, $components) = @_;
242 37 100       92 if (! $components) {
243 24         86 path_check_fields($data, $fields_ref);
244 23         115 _check_required_columns($data, $fields_ref);
245             }
246             else { # 'components' interface
247 13         105 components_check_fields($data, $fields_ref);
248 12         26 _check_required_columns($data, $fields_ref);
249             }
250 29         50 $data->{fields} = $fields_ref;
251 29         45 return $data;
252             }
253              
254             sub _check_required_columns {
255 35     35   58 my ($data, $fields_ref) = @_;
256 35         53 my %col2idx = map { $fields_ref->[$_] => $_ } (0 .. $#{$fields_ref});
  255         578  
  35         79  
257 35         109 my %missing_columns = ();
258 35         56 my %main_columns = map { $_ => 1 } ( qw| id_col parent_id_col leaf_col | );
  105         252  
259 35         110 for my $c ( keys %main_columns ) {
260 105 100       281 if (! exists $col2idx{$data->{$c}}) {
261 6         14 $missing_columns{$c} = $data->{$c};
262             }
263             }
264 35         73 my $error_msg = "Could not locate columns in header to match required arguments:";
265 35         100 for my $c (sort keys %missing_columns) {
266 6         17 $error_msg .= "\n $c: $missing_columns{$c}";
267             }
268 35 100       909 croak $error_msg if scalar keys %missing_columns;
269 29         72 $data->{fields} = $fields_ref;
270 29         60 for my $c (keys %main_columns) {
271 87         260 $data->{$c.'_idx'} = $col2idx{$data->{$c}};
272             }
273 29         116 return $data;
274             }
275              
276             sub _prepare_data_records {
277 29     29   52 my ($data, $data_records, $args) = @_;
278             # Confirm no duplicate entries in 'id_col'. DONE
279             # Confirm all rows have same number of columns as header. DONE
280 29         44 my $error_msg = '';
281 29         37 my $field_count = scalar(@{$data->{fields}});
  29         61  
282 29         50 my @non_numeric_id_records = ();
283 29         56 my %ids_seen = ();
284 29         37 my @bad_count_records = ();
285 29         45 my @nameless_component_records = ();
286 29         32 for my $rec (@{$data_records}) {
  29         65  
287 373 100       1199 if ($rec->[$data->{id_col_idx}] !~ m/^\d+$/) {
288 2         7 push @non_numeric_id_records, [ $rec->[$data->{id_col_idx}], '' ];
289             }
290 373 100 100     1751 if (length($rec->[$data->{parent_id_col_idx}]) and
291             $rec->[$data->{parent_id_col_idx}] !~ m/^\d+$/
292             ) {
293             push @non_numeric_id_records, [
294             $rec->[$data->{id_col_idx}],
295 2         6 $rec->[$data->{parent_id_col_idx}]
296             ];
297             }
298 373         766 $ids_seen{$rec->[$data->{id_col_idx}]}++;
299 373         383 my $this_row_count = scalar(@{$rec});
  373         515  
300 373 100       717 if ($this_row_count != $field_count) {
301             push @bad_count_records,
302 6         14 [ $rec->[$data->{id_col_idx}], $this_row_count ];
303             }
304 373 100       938 if (! length($rec->[$data->{leaf_col_idx}])) {
305 3         7 push @nameless_component_records, $rec->[$data->{id_col_idx}];
306             }
307             }
308 29         98 $error_msg = <
309             Non-numeric entries are not permitted in the '$data->{id_col}' or '$data->{parent_id_col}' columns.
310             The following records each violate this restriction one or two times:
311             NON_NUMERIC_IDS
312 29         57 for my $rec (@non_numeric_id_records) {
313 4         14 $error_msg .= " $data->{id_col}: $rec->[0]\t$data->{parent_id_col}: $rec->[1]\n";
314             }
315 29 100       203 croak $error_msg if @non_numeric_id_records;
316              
317 28         51 my @dupe_ids = ();
318 28         198 for my $id (sort keys %ids_seen) {
319 356 100       744 push @dupe_ids, $id if $ids_seen{$id} > 1;
320             }
321 28         91 $error_msg = <
322             No duplicate entries are permitted in the '$data->{id_col}'column.
323             The following entries appear the number of times shown:
324             ERROR_MSG_DUPE
325 28         88 for my $id (@dupe_ids) {
326 4         19 $error_msg .= " $id:" . sprintf(" %6s\n" => $ids_seen{$id});
327             }
328 28 100       331 croak $error_msg if @dupe_ids;
329              
330 26         76 $error_msg = <
331             Header row has $field_count columns. The following records
332             (identified by the value in their '$data->{id_col}' columns)
333             have different counts:
334             ERROR_MSG_WRONG_COUNT
335 26         45 for my $rec (@bad_count_records) {
336 6         15 $error_msg .= " $rec->[0]: $rec->[1]\n";
337             }
338 26 100       309 croak $error_msg if @bad_count_records;
339              
340 24         57 $error_msg = <
341             Each data record must have a non-empty string in its 'leaf' column.
342             The following records (identified by the value in their '$data->{id_col}' columns)
343             lack valid leaves:
344             NAMELESS_LEAF
345 24         43 for my $rec (@nameless_component_records) {
346 3         6 $error_msg .= " id: $rec\n";
347             }
348 24 100       179 croak $error_msg if @nameless_component_records;
349              
350 23         48 my %ids_missing_parents = ();
351 23         27 for my $rec (@{$data_records}) {
  23         45  
352 295         415 my $parent_id = $rec->[$data->{parent_id_col_idx}];
353 295 100 100     1147 if ( (length($parent_id)) and (! $ids_seen{$parent_id}) ) {
354 4         11 $ids_missing_parents{$rec->[$data->{id_col_idx}]} = $parent_id;
355             }
356             }
357 23         79 $error_msg = <
358             For each record with a non-null value in the '$data->{parent_id_col}' column,
359             there must be a record with that value in the '$data->{id_col}' column.
360             The following records (identified by the value in their '$data->{id_col}' columns)
361             appear to to have parent IDs which do not have records of their own:
362             ERROR_MSG_MISSING_PARENT
363 23         66 for my $k (sort {$a <=> $b} keys %ids_missing_parents) {
  2         10  
364 4         10 $error_msg .= " $k: $ids_missing_parents{$k}\n";
365             }
366 23 100       328 croak $error_msg if scalar keys %ids_missing_parents;
367              
368 21         33 my %families = ();
369 21         27 for my $rec (@{$data_records}) {
  21         41  
370 269 100       607 if (length($rec->[$data->{parent_id_col_idx}])) {
371 200         581 $families{$rec->[$data->{parent_id_col_idx}]}{$rec->[$data->{leaf_col_idx}]}++;
372             }
373             }
374 21         63 $error_msg = <
375             No record with a non-null value in the '$data->{parent_id_col}' column
376             may have two children with the same value in the '$data->{leaf_col}' column.
377             The following are violations:
378             ERROR_MSG_SIBLINGS_NAMED_SAME
379              
380 21         32 my $same_names = 0;
381 21         88 for my $k (sort {$a <=> $b} keys %families) {
  212         300  
382 123         142 for my $l (sort keys %{$families{$k}}) {
  123         351  
383 195 100       485 if ($families{$k}{$l} > 1) {
384 5         19 $error_msg .= " $data->{parent_id_col}: $k|$data->{leaf_col}: $l|count of $data->{leaf_col}s: $families{$k}{$l}\n";
385 5         12 $same_names++;
386             }
387             }
388             }
389 21 100       450 croak $error_msg if $same_names;
390              
391 18         47 $data->{data_records} = $data_records;
392 18         236 return $data;
393             }
394              
395             =head2 C
396              
397             =over 4
398              
399             =item * Purpose
400              
401             Identify the names of the columns in the taxonomy.
402              
403             =item * Arguments
404              
405             my $fields = $self->fields();
406              
407             No arguments; the information is already inside the object.
408              
409             =item * Return Value
410              
411             Reference to an array holding a list of the columns as they appear in the
412             header row of the incoming taxonomy file.
413              
414             =item * Comment
415              
416             Read-only.
417              
418             =back
419              
420             =head2 C
421              
422             =over 4
423              
424             =item * Purpose
425              
426             Once the taxonomy has been validated, get a list of its data rows as a Perl
427             data structure.
428              
429             =item * Arguments
430              
431             $data_records = $self->data_records;
432              
433             None.
434              
435             =item * Return Value
436              
437             Reference to array of array references. The array will hold the data records
438             found in the incoming taxonomy file in their order in that file.
439              
440             =item * Comment
441              
442             Does not contain any information about the fields in the taxonomy, so you
443             should probably either (a) use in conjunction with C method above;
444             or (b) use C.
445              
446             =back
447              
448             =cut
449              
450             =head2 C
451              
452             =over 4
453              
454             =item * Purpose
455              
456             Identify the index position of a given field within the header row.
457              
458             =item * Arguments
459              
460             $index = $self->get_field_position('income');
461              
462             Takes a single string holding the name of one of the fields (column names).
463              
464             =item * Return Value
465              
466             Integer representing the index position (counting from C<0>) of the field
467             provided as argument. Throws exception if the argument is not actually a
468             field.
469              
470             =back
471              
472             =cut
473              
474             =head2 Accessors
475              
476             The following methods provide information about key columns in a
477             Parse::Taxonomy::MaterializedPath object. The key columns are those which hold the
478             ID, parent ID and component information. They take no arguments. The methods
479             whose names end in C<_idx> return integers, as they return the index position
480             of the column in the header row. The other methods return strings.
481              
482             $index_of_id_column = $self->id_col_idx;
483              
484             $name_of_id_column = $self->id_col;
485              
486             $index_of_parent_id_column = $self->parent_id_col_idx;
487              
488             $name_of_parent_id_column = $self->parent_id_col;
489              
490             $index_of_leaf_column = $self->leaf_col_idx;
491              
492             $name_of_leaf_column = $self->leaf_col;
493              
494             =cut
495              
496             sub id_col_idx {
497 403     403 0 1090 my $self = shift;
498 403         825 return $self->{id_col_idx};
499             }
500              
501             sub id_col {
502 101     101 1 126 my $self = shift;
503 101         360 return $self->{id_col};
504             }
505              
506             sub parent_id_col_idx {
507 205     205 0 231 my $self = shift;
508 205         476 return $self->{parent_id_col_idx};
509             }
510              
511             sub parent_id_col {
512 85     85 1 104 my $self = shift;
513 85         374 return $self->{parent_id_col};
514             }
515              
516             sub leaf_col_idx {
517 205     205 0 252 my $self = shift;
518 205         796 return $self->{leaf_col_idx};
519             }
520              
521             sub leaf_col {
522 69     69 1 86 my $self = shift;
523 69         250 return $self->{leaf_col};
524             }
525              
526             =head2 C
527              
528             =over 4
529              
530             =item * Purpose
531              
532             Generate a new Perl data structure which holds the same information as a
533             Parse::Taxonomy::AdjacentList object but which expresses the route from the
534             root node to a given branch or leaf node as either a separator-delimited
535             string (as in the C column of a Parse::Taxonomy::MaterializedPath object) or
536             as an array reference holding the list of names which delineate that route.
537              
538             Another way of expressing this: Transform a taxonomy-by-adjacent-list to a
539             taxonomy-by-materialized-path.
540              
541             Example: Suppose we have a CSV file which serves as a taxonomy-by-adjacent-list for
542             this data:
543              
544             "id","parent_id","name","is_actionable"
545             "1","","Alpha","0"
546             "2","","Beta","0"
547             "3","1","Epsilon","0"
548             "4","3","Kappa","1"
549             "5","1","Zeta","0"
550             "6","5","Lambda","1"
551             "7","5","Mu","0"
552             "8","2","Eta","1"
553             "9","2","Theta","1"
554              
555             Instead of having the route from the root node to a given node be represented
556             B by following Cs up the tree, suppose we want that
557             route to be represented by a string. Assuming that we work with default
558             column names, that would mean representing the information currently spread
559             out among the C, C and C columns in a single C
560             column which, by default, would hold an array reference.
561              
562             $source = "./t/data/theta.csv";
563             $self = Parse::Taxonomy::AdjacentList->new( {
564             file => $source,
565             } );
566              
567             $taxonomy_with_path_as_array = $self->pathify;
568              
569             Yielding:
570              
571             [
572             ["path", "is_actionable"],
573             [["", "Alpha"], 0],
574             [["", "Beta"], 0],
575             [["", "Alpha", "Epsilon"], 0],
576             [["", "Alpha", "Epsilon", "Kappa"], 1],
577             [["", "Alpha", "Zeta"], 0],
578             [["", "Alpha", "Zeta", "Lambda"], 1],
579             [["", "Alpha", "Zeta", "Mu"], 0],
580             [["", "Beta", "Eta"], 1],
581             [["", "Beta", "Theta"], 1],
582             ]
583              
584             If we wanted the path information represented as a string rather than an array
585             reference, we would say:
586              
587             $taxonomy_with_path_as_string = $self->pathify( { as_string => 1 } );
588              
589             Yielding:
590              
591             [
592             ["path", "is_actionable"],
593             ["|Alpha", 0],
594             ["|Beta", 0],
595             ["|Alpha|Epsilon", 0],
596             ["|Alpha|Epsilon|Kappa", 1],
597             ["|Alpha|Zeta", 0],
598             ["|Alpha|Zeta|Lambda", 1],
599             ["|Alpha|Zeta|Mu", 0],
600             ["|Beta|Eta", 1],
601             ["|Beta|Theta", 1],
602             ]
603              
604             If we are providing a true value to the C key, we also get to
605             choose what character to use as the separator in the C column.
606              
607             $taxonomy_with_path_as_string_different_path_col_sep =
608             $self->pathify( {
609             as_string => 1,
610             path_col_sep => '~~',
611             } );
612              
613             Yields:
614              
615             [
616             ["path", "is_actionable"],
617             ["~~Alpha", 0],
618             ["~~Beta", 0],
619             ["~~Alpha~~Epsilon", 0],
620             ["~~Alpha~~Epsilon~~Kappa", 1],
621             ["~~Alpha~~Zeta", 0],
622             ["~~Alpha~~Zeta~~Lambda", 1],
623             ["~~Alpha~~Zeta~~Mu", 0],
624             ["~~Beta~~Eta", 1],
625             ["~~Beta~~Theta", 1],
626             ]
627              
628             Finally, should we want the C column in the returned arrayref to be
629             named something other than I, we can provide a value to the C
630             key.
631              
632             [
633             ["foo", "is_actionable"],
634             [["", "Alpha"], 0],
635             [["", "Beta"], 0],
636             [["", "Alpha", "Epsilon"], 0],
637             [["", "Alpha", "Epsilon", "Kappa"], 1],
638             [["", "Alpha", "Zeta"], 0],
639             [["", "Alpha", "Zeta", "Lambda"], 1],
640             [["", "Alpha", "Zeta", "Mu"], 0],
641             [["", "Beta", "Eta"], 1],
642             [["", "Beta", "Theta"], 1],
643             ]
644              
645             item * Arguments
646              
647             Optional single hash reference. If provided, the following keys may be used:
648              
649             =over 4
650              
651             =item * C
652              
653             User-supplied name for column holding path information in the returned array
654             reference. Defaults to C.
655              
656             =item * C
657              
658             Boolean. If supplied with a true value, path information will be represented
659             as a separator-delimited string rather than an array reference.
660              
661             =item * C
662              
663             User-supplied string to be used to separate the parts of the route when
664             C is called with a true value. Not meaningful unless C
665             is true.
666              
667             =back
668              
669             =item * Return Value
670              
671             Reference to an array of array references. The first element in the array
672             will be a reference to an array of field names. Each succeeding element will
673             be a reference to an array holding data for one record in the original
674             taxonomy. The path data will be represented, by default, as an array
675             reference built up from the component (C) column in the original
676             taxonomy, but if C is selected, the path data in all non-header
677             elements will be a separator-delimited string.
678              
679             =back
680              
681             =cut
682              
683             sub pathify {
684 20     20 1 20056 my ($self, $args) = @_;
685 20 100       96 if (defined $args) {
686 12 100 100     88 unless (ref($args) and (reftype($args) eq 'HASH')) {
687 2         269 croak "Argument to pathify() must be hash ref";
688             }
689 10         17 my %permissible_args = map { $_ => 1 } ( qw| path_col as_string path_col_sep | );
  30         77  
690 10         16 for my $k (keys %{$args}) {
  10         56  
691             croak "'$k' is not a recognized key for pathify() argument hashref"
692 14 100       136 unless $permissible_args{$k};
693             }
694 9 100 100     45 if ($args->{path_col_sep} and not $args->{as_string}) {
695 1         96 croak "Supplying a value for key 'path_col_step' is only valid when also supplying true value for 'as_string'";
696             }
697             }
698 16 100       58 $args->{path_col} = defined($args->{path_col}) ? $args->{path_col} : 'path';
699 16 100       41 if ($args->{as_string}) {
700 6 100       19 $args->{path_col_sep} = defined($args->{path_col_sep}) ? $args->{path_col_sep} : '|';
701             }
702              
703 16         29 my @rewritten = ();
704 16         26 my @fields_in = @{$self->fields};
  16         70  
705 16         37 my @fields_out = ( $args->{path_col} );
706 16         32 for my $f (@fields_in) {
707 94 100 100     184 unless (
      100        
708             ($f eq $self->id_col) or
709             ($f eq $self->parent_id_col) or
710             ($f eq $self->leaf_col)
711             ) {
712 46         104 push @fields_out, $f;
713             }
714             }
715 16         30 push @rewritten, \@fields_out;
716              
717 16         39 my %colsin2idx = map { $fields_in[$_] => $_ } (0 .. $#fields_in);
  94         218  
718              
719 198         402 my %hashed_data = map { $_->[$self->id_col_idx] => {
720             parent_id => $_->[$self->parent_id_col_idx],
721             leaf => $_->[$self->leaf_col_idx],
722 16         36 } } @{$self->data_records};
  16         58  
723              
724 16         48 my @this_path = ();
725 16         19 my $code;
726             $code = sub {
727 447     447   609 my $id = shift;
728 447         811 push @this_path, $hashed_data{$id}{leaf};
729 447         662 my $parent_id = $hashed_data{$id}{parent_id};
730 447 100       711 if (length($parent_id)) {
731 249         278 &{$code}($parent_id);
  249         502  
732             }
733             else {
734 198         306 push @this_path, '';
735             }
736 447         650 return;
737 16         63 };
738 16         25 for my $rec (@{$self->data_records}) {
  16         42  
739 198         213 my @new_record;
740 198         394 &{$code}($rec->[$self->id_col_idx]);
  198         372  
741 198         485 my $path_as_array_ref = [ reverse @this_path ];
742 198 100       414 if ($args->{as_string}) {
743             push @new_record,
744 62         84 join($args->{path_col_sep} => @{$path_as_array_ref});
  62         155  
745             }
746             else {
747 136         221 push @new_record, $path_as_array_ref;
748             }
749 198         363 for my $f (grep { $_ ne $args->{path_col} } @fields_out) {
  804         1514  
750 606         1258 push @new_record, $rec->[$colsin2idx{$f}];
751             }
752 198         317 push @rewritten, \@new_record;
753 198         457 @this_path = ();
754             }
755 16         83 return \@rewritten;
756             }
757              
758             =head2 C
759              
760             =over 4
761              
762             =item * Purpose
763              
764             Create a CSV-formatted file holding the data returned by C.
765              
766             =item * Arguments
767              
768             $csv_file = $self->write_pathified_to_csv( {
769             pathified => $pathified, # output of pathify()
770             csvfile => './t/data/taxonomy_out5.csv',
771             } );
772              
773             Single hash reference. That hash is keyed on:
774              
775             =over 4
776              
777             =item * C
778              
779             B Its value must be the arrayref of hash references returned by
780             the C method.
781              
782             =item * C
783              
784             Optional. Path to location where a CSV-formatted text file holding the
785             taxonomy-by-adjacent-list will be written. Defaults to a file called
786             F in the current working directory.
787              
788             =item * Text::CSV_XS options
789              
790             You can also pass through any key-value pairs normally accepted by
791             F.
792              
793             =back
794              
795             =item * Return Value
796              
797             Returns path to CSV-formatted text file just created.
798              
799             =item * Example
800              
801             Suppose we have a CSV-formatted file holding the following taxonomy-by-adjacent-list:
802              
803             "id","parent_id","name","is_actionable"
804             "1","","Alpha","0"
805             "2","","Beta","0"
806             "3","1","Epsilon","0"
807             "4","3","Kappa","1"
808             "5","1","Zeta","0"
809             "6","5","Lambda","1"
810             "7","5","Mu","0"
811             "8","2","Eta","1"
812             "9","2","Theta","1"
813              
814             After running this file through C, C and
815             C we will have a new CSV-formatted file holding
816             this taxonomy-by-materialized-path:
817              
818             path,is_actionable
819             |Alpha,0
820             |Beta,0
821             |Alpha|Epsilon,0
822             |Alpha|Epsilon|Kappa,1
823             |Alpha|Zeta,0
824             |Alpha|Zeta|Lambda,1
825             |Alpha|Zeta|Mu,0
826             |Beta|Eta,1
827             |Beta|Theta,1
828              
829             Note that the C, C and C columns have been replaced by the column.
830              
831             =back
832              
833             =cut
834              
835             sub write_pathified_to_csv {
836 8     8 1 3175 my ($self, $args) = @_;
837 8 100       17 if (defined $args) {
838 7 100 100     241 croak "Argument to 'pathify()' must be hashref"
839             unless (ref($args) and reftype($args) eq 'HASH');
840             croak "Argument to 'pathify()' must have 'pathified' element"
841 5 100       133 unless exists $args->{pathified};
842             croak "Argument 'pathified' must be array reference"
843             unless (ref($args->{pathified}) and
844 4 100 100     222 reftype($args->{pathified}) eq 'ARRAY');
845             }
846             else {
847 1         155 croak "write_pathified_to_csv() must be supplied with hashref"
848             }
849 2         4 my $pathified = $args->{pathified};
850 2         5 delete $args->{pathified};
851              
852             # Test whether we're working with first element array ref or first element
853             # string
854 2 100       7 my $path_as_string = (! ref($pathified->[1]->[0])) ? 1 : 0;
855              
856 2         9 my $columns_in = $self->fields;
857 6         17 my %path_columns = map {$_ => 1} (
858             $self->{id_col},
859             $self->{parent_id_col},
860             $self->{leaf_col},
861 2         7 );
862             my @non_path_columns_in =
863 2         6 map { $columns_in->[$_] }
864 8         22 grep { ! $path_columns{$columns_in->[$_]} }
865 2         6 (0..$#{$columns_in});
  2         6  
866 2         7 my @columns_out = (qw| path |);
867 2         4 push @columns_out, @non_path_columns_in;
868              
869 2         8954 my $cwd = cwd();
870             my $csvfile = defined($args->{csvfile})
871             ? $args->{csvfile}
872 2 100       34 : "$cwd/taxonomy_out.csv";
873 2         15 delete $args->{csvfile};
874              
875             # By this point, we should have processed all args other than those
876             # intended for Text::CSV_XS and assigned their contents to variables as
877             # needed.
878              
879 2         23 my $csv_args = { binary => 1 };
880 2         8 while (my ($k,$v) = each %{$args}) {
  4         32  
881 2         13 $csv_args->{$k} = $v;
882             }
883 2         56 my $csv = Text::CSV_XS->new($csv_args);
884 2 50       665 open my $OUT, ">:encoding(utf8)", $csvfile
885             or croak "Unable to open $csvfile for writing";
886 2 100       13444 $csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
887 2         118 $csv->print($OUT, [@columns_out]);
888 2         28 for my $rec (@{$pathified}[1..$#{$pathified}]) {
  2         12  
  2         8  
889             $csv->print(
890             $OUT,
891             $path_as_string
892             ? $rec
893             : [
894 9         32 join('|' => @{$rec->[0]}),
895 18 100       228 @{$rec}[1..$#columns_out]
  9         52  
896             ]
897             );
898             }
899 2 50       138 close $OUT or croak "Unable to close $csvfile after writing";
900              
901 2         58 return $csvfile;
902             }
903              
904             1;
905              
906             # vim: formatoptions=crqot