File Coverage

lib/Parse/Taxonomy/MaterializedPath.pm

Criterion	Covered	Total	%
statement	326	326	100.0
branch	134	140	95.7
condition	39	40	97.5
subroutine	24	24	100.0
pod	11	11	100.0
total	534	541	98.7

line	stmt	bran	cond	sub	pod	time	code
1							package Parse::Taxonomy::MaterializedPath;
2	6			6		15488	use strict;
	6					10
	6					212
3	6			6		3210	use parent qw( Parse::Taxonomy );
	6					1443
	6					31
4	6			6		246	use Carp;
	6					11
	6					332
5	6			6		6180	use Text::CSV_XS;
	6					58172
	6					394
6	6			6		55	use Scalar::Util qw( reftype );
	6					10
	6					328
7	6			6		29	use List::Util qw( max );
	6					72
	6					630
8	6			6		29	use Cwd;
	6					9
	6					504
9							our $VERSION = '0.22';
10	6					23072	use Parse::Taxonomy::Auxiliary qw(
11							path_check_fields
12							components_check_fields
13	6			6		2059	);
	6					14
14
15							=head1 NAME
16
17							Parse::Taxonomy::MaterializedPath - Validate a file for use as a path-based taxonomy
18
19							=head1 SYNOPSIS
20
21							use Parse::Taxonomy::MaterializedPath;
22
23							# 'file' interface: reads a CSV file for you
24
25							$source = "./t/data/alpha.csv";
26							$self = Parse::Taxonomy::MaterializedPath->new( {
27							file => $source,
28							} );
29
30							# 'components' interface: as if you've already read a
31							# CSV file and now have Perl array references to header and data rows
32
33							$self = Parse::Taxonomy::MaterializedPath->new( {
34							components => {
35							fields => $fields,
36							data_records => $data_records,
37							}
38							} );
39
40							=head1 METHODS
41
42							=head2 C
43
44							=over 4
45
46							=item * Purpose
47
48							Parse::Taxonomy::MaterializedPath constructor.
49
50							=item * Arguments
51
52							Single hash reference. There are two possible interfaces: C and C.
53
54							=over 4
55
56							=item 1 C interface
57
58							$source = "./t/data/alpha.csv";
59							$self = Parse::Taxonomy::MaterializedPath->new( {
60							file => $source,
61							path_col_idx => 0,
62							path_col_sep => '\|',
63							%TextCSVoptions,
64							} );
65
66							Elements in the hash reference are keyed on:
67
68							=over 4
69
70							=item * C
71
72							Absolute or relative path to the incoming taxonomy file.
73							B for this interface.
74
75							=item * C
76
77							If the column to be used as the "path" column in the incoming taxonomy file is
78							B the first column, this option must be set to the integer representing
79							the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
80
81							=item * C
82
83							If the string used to distinguish components of the path in the path column in
84							the incoming taxonomy file is not a pipe (C<\|>), this option must be set.
85							Optional; defaults to C<\|>.
86
87							=item * Text::CSV_XS options
88
89							Any other options which could normally be passed to Cnew()> will
90							be passed through to that module's constructor. On the recommendation of the
91							Text::CSV documentation, C is always set to a true value.
92
93							=back
94
95							=item 2 C interface
96
97							$self = Parse::Taxonomy::MaterializedPath->new( {
98							components => {
99							fields => $fields,
100							data_records => $data_records,
101							}
102							} );
103
104							Elements in this hash are keyed on:
105
106							=over 4
107
108							=item * C
109
110							This element is B for the
111							C interface. The value of this element is a hash reference with two keys, C and
112							C. C is a reference to an array holding the field or
113							column names for the data set. C is a reference to an array of
114							array references, each of the latter arrayrefs holding one record or row from
115							the data set.
116
117							=item * C
118
119							Same as in C interface above.
120
121							=item * C
122
123							Same as in C interface above.
124
125							=back
126
127							=back
128
129							=item * Return Value
130
131							Parse::Taxonomy::MaterializedPath object.
132
133							=item * Comment
134
135							C will throw an exception under any of the following conditions:
136
137							=over 4
138
139							=item * Argument to C is not a reference.
140
141							=item * Argument to C is not a hash reference.
142
143							=item * In the C interface, unable to locate the file which is the value of the C element.
144
145							=item * Argument to C element is not an integer.
146
147							=item * Argument to C is greater than the index number of the
148							last element in the header row of the incoming taxonomy file, I the
149							C is wrong.
150
151							=item * The same field is found more than once in the header row of the
152							incoming taxonomy file.
153
154							=item * Unable to open or close the incoming taxonomy file for reading.
155
156							=item * In the column designated as the "path" column, the same value is
157							observed more than once.
158
159							=item * C, C, C, C and C are reserved terms.
160							One or more columns is named with a reserved term.
161
162							=item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
163
164							=item * A data row has a number of fields different from the number of fields
165							in the header row.
166
167							=back
168
169							=back
170
171							=cut
172
173							sub new {
174	62			62	1	61297	my ($class, $args) = @_;
175	62					110	my $data;
176
177	62	100	100			941	croak "Argument to 'new()' must be hashref"
178							unless (ref($args) and reftype($args) eq 'HASH');
179	60					91	my $argscount = 0;
180	60	100				182	$argscount++ if $args->{file};
181	60	100				154	$argscount++ if $args->{components};
182	60	100				254	croak "Argument to 'new()' must have either 'file' or 'components' element"
183							if ($argscount == 0);
184	59	100				329	croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
185							if ($argscount == 2);
186
187	58	100				152	if (exists $args->{path_col_idx}) {
188							croak "Argument to 'path_col_idx' must be integer"
189	4	100				143	unless $args->{path_col_idx} =~ m/^\d+$/;
190							}
191	57		100			330	$data->{path_col_idx} = delete $args->{path_col_idx} \|\| 0;
192							$data->{path_col_sep} = exists $args->{path_col_sep}
193							? $args->{path_col_sep}
194	57	100				233	: '\|';
195	57	100				134	if (exists $args->{path_col_sep}) {
196	5					14	$data->{path_col_sep} = $args->{path_col_sep};
197	5					11	delete $args->{path_col_sep};
198							}
199							else {
200	52					102	$data->{path_col_sep} = '\|';
201							}
202
203	57	100				129	if ($args->{components}) {
204							croak "Value of 'components' element must be hashref"
205	30	100	100			478	unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
206	28					53	for my $k ( qw\| fields data_records \| ) {
207							croak "Value of 'components' element must have '$k' key-value pair"
208	53	100				393	unless exists $args->{components}->{$k};
209							croak "Value of '$k' element must be arrayref"
210							unless (ref($args->{components}->{$k}) and
211	51	100	100			710	reftype($args->{components}->{$k}) eq 'ARRAY');
212							}
213	23					39	for my $row (@{$args->{components}->{data_records}}) {
	23					57
214	292	100	100			1677	croak "Each element in 'data_records' array must be arrayref"
215							unless (ref($row) and reftype($row) eq 'ARRAY');
216							}
217							# We don't want to stick $args->{components} into the object as is.
218							# Rather, we want to insert 'fields' and 'data_records' for
219							# consistency with the 'file' interface. But to do that we first need
220							# to impose the same validations that we do for the 'file' interface.
221							# We also need to populate 'path_col'.
222	21					69	_prepare_fields($data, $args->{components}->{fields}, 1);
223	18					36	my $these_data_records = $args->{components}->{data_records};
224	18					35	delete $args->{components};
225	18					56	_prepare_data_records($data, $these_data_records, $args);
226							}
227							else {
228							croak "Cannot locate file '$args->{file}'"
229	27	100				722	unless (-f $args->{file});
230	26					65	$data->{file} = delete $args->{file};
231
232							# We've now handled all the Parse::Taxonomy::MaterializedPath-specific options.
233							# Any remaining options are assumed to be intended for Text::CSV_XS::new().
234
235	26					53	$args->{binary} = 1;
236	26	50				139	my $csv = Text::CSV_XS->new ( $args )
237							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
238							open my $IN, "<", $data->{file}
239	26	50				3339	or croak "Unable to open '$data->{file}' for reading";
240	4			4		2677	my $header_ref = $csv->getline($IN);
	4					20950
	4					146
	26					1221
241
242	26					1299	_prepare_fields($data, $header_ref);
243	23					591	my $data_records = $csv->getline_all($IN);
244	23	50				9275	close $IN or croak "Unable to close after reading";
245	23					78	_prepare_data_records($data, $data_records, $args);
246							}
247
248	32					56	while (my ($k,$v) = each %{$args}) {
	50					179
249	18					43	$data->{$k} = $v;
250							}
251	32					62	my %row_analysis = ();
252	32					46	for my $el (@{$data->{data_records}}) {
	32					74
253	425					636	my $rowkey = $el->[$data->{path_col_idx}];
254	425					1338	$row_analysis{$rowkey} = split(/\Q$data->{path_col_sep}\E/, $rowkey);
255							}
256	32					97	$data->{row_analysis} = \%row_analysis;
257	32					161	return bless $data, $class;
258							}
259
260							sub _prepare_fields {
261	47			47		90	my ($data, $fields_ref, $components) = @_;
262	47	100				120	if (! $components) {
263	26					61	_check_path_col_idx($data, $fields_ref, 0);
264	25					88	path_check_fields($data, $fields_ref);
265							}
266							else {
267	21					60	_check_path_col_idx($data, $fields_ref, 1);
268	20					68	components_check_fields($data, $fields_ref);
269							}
270
271	43					93	my %fields_seen = map { $_ => 1 } @{$fields_ref};
	229					508
	43					86
272	43					133	my @bad_fields = ();
273	43					102	for my $reserved ( qw\| id parent_id name lft rgh \| ) {
274	215	100				525	push @bad_fields, $reserved if $fields_seen{$reserved};
275							}
276	43					121	my $msg = "Bad column names: <@bad_fields>. These are reserved for ";
277	43					67	$msg .= "Parse::Taxonomy's internal use; please rename";
278	43	100				379	croak $msg if @bad_fields;
279
280	41					100	$data->{fields} = $fields_ref;
281	41					119	$data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
282	41					109	return $data;
283							}
284
285							sub _check_path_col_idx {
286	47			47		71	my ($data, $fields_ref, $components) = @_;
287	47					122	my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
288	47	100				145	$error_msg .= $components
289							? "'fields' array ref"
290							: "header row in '$data->{file}'";
291
292	47	100				70	croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
	47					487
293							}
294
295							sub _prepare_data_records {
296							# Confirm each row's path starts with path_col_sep:
297							# Confirm no duplicate entries in column holding path:
298							# Confirm all rows have same number of columns as header:
299	41			41		70	my ($data, $data_records, $args) = @_;
300	41					57	my $error_msg;
301	41					68	my @bad_path_cols = ();
302	41					59	my @bad_count_records = ();
303	41					78	my %paths_seen = ();
304	41					53	my $field_count = scalar(@{$data->{fields}});
	41					97
305	41					53	for my $rec (@{$data_records}) {
	41					141
306	537	100				2406	unless ($rec->[$data->{path_col_idx}] =~ m/^\Q$data->{path_col_sep}\E/) {
307	6					13	push @bad_path_cols, $rec->[$data->{path_col_idx}];
308							}
309	537					1223	$paths_seen{$rec->[$data->{path_col_idx}]}++;
310	537					569	my $this_row_count = scalar(@{$rec});
	537					791
311	537	100				1423	if ($this_row_count != $field_count) {
312							push @bad_count_records,
313	5					18	[ $rec->[$data->{path_col_idx}], $this_row_count ];
314							}
315							}
316	41					88	$error_msg = <
317							The value of the column designated as path must start with the path column separator.
318							Rows with the following paths fail to do so:
319							IMPROPER_PATH
320	41					117	for my $path (@bad_path_cols) {
321	6					17	$error_msg .= " $path\n";
322							}
323	41	100				395	croak $error_msg if @bad_path_cols;
324
325	39					72	my @dupe_paths = ();
326	39					344	for my $path (sort keys %paths_seen) {
327	504	100				1025	push @dupe_paths, $path if $paths_seen{$path} > 1;
328							}
329	39					107	$error_msg = <
330							No duplicate entries are permitted in column designated as path.
331							The following entries appear the number of times shown:
332							ERROR_MSG_DUPE
333	39					79	for my $path (@dupe_paths) {
334	5					26	$error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
335							}
336	39	100				609	croak $error_msg if @dupe_paths;
337
338	36					92	$error_msg = <
339							Header row has $field_count records. The following records had different counts:
340							ERROR_MSG_WRONG_COUNT
341	36					64	for my $rec (@bad_count_records) {
342	4					14	$error_msg .= " $rec->[0]: $rec->[1]\n";
343							}
344	36	100				346	croak $error_msg if @bad_count_records;
345
346							# Confirm each node appears in taxonomy:
347	34					53	my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
	19					71
	34					86
348	34					90	$path_args->{sep} = $data->{path_col_sep};
349	34	50				180	my $path_csv = Text::CSV_XS->new ( $path_args )
350							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
351	34					3921	my %missing_parents = ();
352	34					246	for my $path (sort keys %paths_seen) {
353	447					1234	my $status = $path_csv->parse($path);
354	447					7193	my @columns = $path_csv->fields();
355	447	100				3423	if (@columns > 2) {
356							my $parent =
357	324					873	join($path_args->{sep} => @columns[0 .. ($#columns - 1)]);
358	324	100				1209	unless (exists $paths_seen{$parent}) {
359	4					15	$missing_parents{$path} = $parent;
360							}
361							}
362							}
363	34					108	$error_msg = <
364							Each node in the taxonomy must have a parent.
365							The following nodes lack the expected parent:
366							ERROR_MSG_ORPHAN
367	34					82	for my $path (sort keys %missing_parents) {
368	4					13	$error_msg .= " $path: $missing_parents{$path}\n";
369							}
370	34	100				361	croak $error_msg if scalar(keys %missing_parents);
371							# BBB end of validations
372	32					63	$data->{data_records} = $data_records;
373
374	32					423	return $data;
375							}
376
377							=head2 C
378
379							=over 4
380
381							=item * Purpose
382
383							Identify the names of the columns in the taxonomy.
384
385							=item * Arguments
386
387							my $fields = $self->fields();
388
389							No arguments; the information is already inside the object.
390
391							=item * Return Value
392
393							Reference to an array holding a list of the columns as they appear in the
394							header row of the incoming taxonomy file.
395
396							=item * Comment
397
398							Read-only.
399
400							=back
401
402							=head2 C
403
404							=over 4
405
406							=item * Purpose
407
408							Identify the index position (count starts at 0) of the column in the incoming
409							taxonomy file which serves as the path column.
410
411							=item * Arguments
412
413							my $path_col_idx = $self->path_col_idx;
414
415							No arguments; the information is already inside the object.
416
417							=item * Return Value
418
419							Integer in the range from 0 to 1 less than the number of columns in the header
420							row.
421
422							=item * Comment
423
424							Read-only.
425
426							=back
427
428							=cut
429
430							sub path_col_idx {
431	15			15	1	3952	my $self = shift;
432	15					33	return $self->{path_col_idx};
433							}
434
435							=head2 C
436
437							=over 4
438
439							=item * Purpose
440
441							Identify the name of the column in the incoming taxonomy which serves as the
442							path column.
443
444							=item * Arguments
445
446							my $path_col = $self->path_col;
447
448							No arguments; the information is already inside the object.
449
450							=item * Return Value
451
452							String.
453
454							=item * Comment
455
456							Read-only.
457
458							=back
459
460							=cut
461
462							sub path_col {
463	4			4	1	1522	my $self = shift;
464	4					10	return $self->{path_col};
465							}
466
467							=head2 C
468
469							=over 4
470
471							=item * Purpose
472
473							Identify the string used to separate path components once the taxonomy has
474							been created. This is just a "getter" and is logically distinct from the
475							option to C which is, in effect, a "setter."
476
477							=item * Arguments
478
479							my $path_col_sep = $self->path_col_sep;
480
481							No arguments; the information is already inside the object.
482
483							=item * Return Value
484
485							String.
486
487							=item * Comment
488
489							Read-only.
490
491							=back
492
493							=cut
494
495							sub path_col_sep {
496	4			4	1	1505	my $self = shift;
497	4					10	return $self->{path_col_sep};
498							}
499
500							=head2 C
501
502							=over 4
503
504							=item * Purpose
505
506							Once the taxonomy has been validated, get a list of its data rows as a Perl
507							data structure.
508
509							=item * Arguments
510
511							$data_records = $self->data_records;
512
513							None.
514
515							=item * Return Value
516
517							Reference to array of array references. The array will hold the data records
518							found in the incoming taxonomy file in their order in that file.
519
520							=item * Comment
521
522							Does not contain any information about the fields in the taxonomy, so you
523							should probably either (a) use in conjunction with C method above;
524							or (b) use C.
525
526							=back
527
528							=head2 C
529
530							=over 4
531
532							=item * Purpose
533
534							Once the taxonomy has been validated, get a list of its header and data rows as a Perl
535							data structure.
536
537							=item * Arguments
538
539							$data_records = $self->fields_and_data_records;
540
541							None.
542
543							=item * Return Value
544
545							Reference to array of array references. The first element in the array will
546							hold the header row (same as output of C). The remaining elements
547							will hold the data records found in the incoming taxonomy file in their order
548							in that file.
549
550							=back
551
552							=cut
553
554							=head2 C
555
556							=over 4
557
558							=item * Purpose
559
560							Once the taxonomy has been validated, get a list of its data rows as a Perl
561							data structure. In each element of this list, the path is now represented as
562							an array reference rather than a string.
563
564							=item * Arguments
565
566							$data_records_path_components = $self->data_records_path_components;
567
568							None.
569
570							=item * Return Value
571
572							Reference to array of array references. The array will hold the data records
573							found in the incoming taxonomy file in their order in that file.
574
575							=item * Comment
576
577							Does not contain any information about the fields in the taxonomy, so you may
578							wish to use this method either (a) use in conjunction with C method
579							above; or (b) use C.
580
581							=back
582
583							=cut
584
585							sub data_records_path_components {
586	13			13	1	2109	my $self = shift;
587	13					24	my @all_rows = ();
588	13					16	for my $row (@{$self->{data_records}}) {
	13					51
589	182					298	my $path_col = $row->[$self->{path_col_idx}];
590	182					646	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
591	182					267	my @rewritten = ();
592	182					251	for (my $i = 0; $i <= $#{$row}; $i++) {
	1062					2289
593	880	100				1526	if ($i != $self->{path_col_idx}) {
594	698					1325	push @rewritten, $row->[$i];
595							}
596							else {
597	182					381	push @rewritten, \@path_components;
598							}
599							}
600	182					414	push @all_rows, \@rewritten;
601							}
602	13					61	return \@all_rows;
603							}
604
605							=head2 C
606
607							=over 4
608
609							=item * Purpose
610
611							Once the taxonomy has been validated, get a list of its data rows as a Perl
612							data structure. The first element in this list is an array reference holding
613							the header row. In each data element of this list, the path is now represented as
614							an array reference rather than a string.
615
616							=item * Arguments
617
618							$fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
619
620							None.
621
622							=item * Return Value
623
624							Reference to array of array references. The array will hold the data records
625							found in the incoming taxonomy file in their order in that file.
626
627							=back
628
629							=cut
630
631							sub fields_and_data_records_path_components {
632	6			6	1	4230	my $self = shift;
633	6					21	my @all_rows = $self->fields;
634	6					10	for my $row (@{$self->{data_records}}) {
	6					18
635	78					127	my $path_col = $row->[$self->{path_col_idx}];
636	78					239	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
637	78					102	my @rewritten = ();
638	78					107	for (my $i = 0; $i <= $#{$row}; $i++) {
	546					1095
639	468	100				815	if ($i != $self->{path_col_idx}) {
640	390					721	push @rewritten, $row->[$i];
641							}
642							else {
643	78					148	push @rewritten, \@path_components;
644							}
645							}
646	78					187	push @all_rows, \@rewritten;
647							}
648	6					17	return \@all_rows;
649							}
650
651							=head2 C
652
653							=over 4
654
655							=item * Purpose
656
657							Identify the index position of a given field within the header row.
658
659							=item * Arguments
660
661							$index = $self->get_field_position('income');
662
663							Takes a single string holding the name of one of the fields (column names).
664
665							=item * Return Value
666
667							Integer representing the index position (counting from C<0>) of the field
668							provided as argument. Throws exception if the argument is not actually a
669							field.
670
671							=back
672
673							=cut
674
675							=head2 C
676
677							=over 4
678
679							=item * Purpose
680
681							Display the number of descendant (multi-generational) nodes each node in the
682							taxonomy has.
683
684							=item * Arguments
685
686							$descendant_counts = $self->descendant_counts();
687
688							$descendant_counts = $self->descendant_counts( { generations => 1 } );
689
690							None required; one optional hash reference. Currently, the only element
691							honored in that hashref is C, whose value must be a non-negative
692							integer. If, instead of getting the count of all descendants of a node, you
693							only want the count of its first generation, i.e., its immediate children, you
694							provide a value of C<1>. Want the count of only the first and second
695							generations? Provide a value of C<2> -- and so on.
696
697							=item * Return Value
698
699							Reference to hash in which each element is keyed on the value of the path
700							column in the incoming taxonomy file.
701
702							=back
703
704							=cut
705
706							sub descendant_counts {
707	9			9	1	5146	my ($self, $args) = @_;
708	9	100				25	if (defined $args) {
709	5	100	100			337	croak "Argument to 'descendant_counts()' must be hashref"
710							unless (ref($args) and reftype($args) eq 'HASH');
711							croak "Value for 'generations' element passed to descendant_counts() must be integer > 0"
712	3	100	100			211	unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
713							}
714	5					8	my %descendant_counts = ();
715	5					15	my $hashified = $self->hashify();
716	5					7	for my $p (keys %{$hashified}) {
	5					23
717	59					101	$descendant_counts{$p} = 0;
718	59					77	for my $q (
719	707					1496	grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
720	59					175	keys %{$hashified}
721							) {
722	230	100				1170	if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
723	61	100				117	if (! $args->{generations}) {
724	48					102	$descendant_counts{$p}++;
725							}
726							else {
727	13					47	my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
728	13					45	my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
729							$descendant_counts{$p}++
730	13	100				59	if (scalar(@d) - scalar(@c) <= $args->{generations});
731							}
732							}
733							}
734							}
735	5					16	$self->{descendant_counts} = \%descendant_counts;
736	5					55	return $self->{descendant_counts};
737							}
738
739							=head2 C
740
741							=over 4
742
743							=item * Purpose
744
745							Get the total number of descendant nodes for one specific node in a validated
746							taxonomy.
747
748							=item * Arguments
749
750							$descendant_count = $self->get_descendant_count('\|Path\|To\|Node');
751
752							$descendant_counts = $self->get_descendant_count('\|Path\|To\|Node', { generations => 1 } );
753
754							One required: string containing node's path as spelled in the taxonomy.
755
756							One optional hash reference. Currently, the only element honored in that
757							hashref is C, whose value must be a non-negative integer. If,
758							instead of getting the count of all descendants of a node, you only want the
759							count of its first generation, i.e., its immediate children, you provide a
760							value of C<1>. Want the count of only first and second generations? Provide
761							a value of C<2> -- and so on.
762
763							=item * Return Value
764
765							Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
766							leaf node.
767
768							=item * Comment
769
770							Will throw an exception if the node does not exist or is misspelled.
771
772							If C is called with no second (hashref) argument
773							following an invocation of C, it will return a value from
774							an internal cache created during that earlier method call. Otherwise, it will
775							re-create the cache from scratch. (This, of course, assumes that you have not
776							manipulated the object's internal data subsequent to its creation.)
777
778							=back
779
780							=cut
781
782							sub get_descendant_count {
783	26			26	1	11235	my ($self, $node, $args) = @_;
784	26	100				100	if (defined $args) {
785	15	100	100			286	croak "Second argument to 'get_descendant_count()' must be hashref"
786							unless (ref($args) and reftype($args) eq 'HASH');
787							croak "Value for 'generations' element passed to second argument to get_descendant_count() must be integer > 0"
788	13	100	100			332	unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
789							}
790	22	100				48	if (exists $self->{descendant_counts}) {
791	6					10	my $descendant_counts = $self->{descendant_counts};
792	6	100				286	croak "Node '$node' not found" unless exists $descendant_counts->{$node};
793	4					10	return $descendant_counts->{$node};
794							}
795							else {
796	16					25	my %descendant_counts = ();
797	16					31	my $hashified = $self->hashify();
798	16	100				165	croak "Node '$node' not found" unless exists $hashified->{$node};
799	15					24	for my $p ($node) {
800	15					24	$descendant_counts{$p} = 0;
801	15					17	for my $q (
802	231					529	grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
803	15					62	keys %{$hashified}
804							) {
805	110	100				586	if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
806	36	100				77	if (! $args->{generations}) {
807	10					24	$descendant_counts{$p}++;
808							}
809							else {
810	26					86	my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
811	26					97	my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
812							$descendant_counts{$p}++
813	26	100				117	if (scalar(@d) - scalar(@c) <= $args->{generations});
814							}
815							}
816							}
817							}
818	15					236	return $descendant_counts{$node};
819							}
820							}
821
822							=head2 C
823
824							=over 4
825
826							=item * Purpose
827
828							Turn a validated taxonomy into a Perl hash keyed on the column designated as
829							the path column.
830
831							=item * Arguments
832
833							$hashref = $self->hashify();
834
835							Takes an optional hashref holding a list of any of the following elements:
836
837							=over 4
838
839							=item * C
840
841							Boolean, defaulting to C<0>. By default, C will spell the
842							key of the hash exactly as the value of the path column is spelled in the
843							taxonomy -- which in turn is the way it was spelled in the incoming file.
844							That is, a path in the taxonomy spelled C<\|Alpha\|Beta\|Gamma> will be spelled
845							as a key in exactly the same way.
846
847							However, since in many cases (including the example above) the root node of
848							the taxonomy will be empty, the user may wish to remove the first instance of
849							C. The user would do so by setting
850							C to a true value.
851
852							$hashref = $self->hashify( {
853							remove_leading_path_col_sep => 1,
854							} );
855
856							In that case they key would now be spelled: C.
857
858							Note further that if the C switch is set to a true value, any
859							setting to C will be ignored.
860
861							=item * C
862
863							A string which will be used in composing the key of the hashref returned by
864							this method. The user may select this key if she does not want to use the
865							value found in the incoming CSV file (which by default will be the pipe
866							character (C<\|>) and which may be overridden with the C argument
867							to C.
868
869							$hashref = $self->hashify( {
870							key_delim => q{ - },
871							} );
872
873							In the above variant, a path that in the incoming taxonomy file was
874							represented by C<\|Alpha\|Beta\|Gamma> will in C<$hashref> be represented by
875							C< - Alpha - Beta - Gamma>.
876
877							=item * C
878
879							A string which will be used in composing the key of the hashref returned by
880							this method. The user will set this switch if she wishes to have the root
881							note explicitly represented. Using this switch will automatically cause
882							C to be ignored.
883
884							Suppose the user wished to have C be the text for the root
885							node. Suppose further that the user wanted to use the string C< - > as the
886							delimiter within the key.
887
888							$hashref = $self->hashify( {
889							root_str => q{All Suppliers},
890							key_delim => q{ - },
891							} );
892
893							Then incoming path C<\|Alpha\|Beta\|Gamma> would be keyed as:
894
895							All Suppliers - Alpha - Beta - Gamma
896
897							=back
898
899							=item * Return Value
900
901							Hash reference. The number of elements in this hash should be equal to the
902							number of non-header records in the taxonomy.
903
904							=back
905
906							=cut
907
908							sub hashify {
909	33			33	1	20102	my ($self, $args) = @_;
910	33	100				101	if (defined $args) {
911	8	100	100			306	croak "Argument to 'hashify()' must be hashref"
912							unless (ref($args) and reftype($args) eq 'HASH');
913							}
914	31					48	my %hashified = ();
915	31					58	my $fields = $self->{fields};
916	31					45	my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
	170					455
	31					72
917	31					77	for my $rec (@{$self->{data_records}}) {
	31					75
918	427					581	my $rowkey;
919	427	100				757	if ($args->{root_str}) {
920	39					83	$rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
921							}
922							else {
923	388	100				614	if ($args->{remove_leading_path_col_sep}) {
924	26					175	($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
925							}
926							else {
927	362					676	$rowkey = $rec->[$self->{path_col_idx}];
928							}
929							}
930	427	100				885	if ($args->{key_delim}) {
931	52					229	$rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
932							}
933	427					492	my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
	2402					6173
	427					766
934	427					1605	$hashified{$rowkey} = $rowdata;
935							}
936	31					127	return \%hashified;
937							}
938
939							=head2 C
940
941							=over 4
942
943							=item * Purpose
944
945							Transform a taxonomy-by-materialized-path into a taxonomy-by-adjacent-list.
946
947							=item * Arguments
948
949							$adjacentified = $self->adjacentify();
950
951							$adjacentified = $self->adjacentify( { serial => 500 } );
952							$adjacentified = $self->adjacentify( { floor => 500 } ); # same as serial
953
954							Optional single hash reference.
955
956							For that hashref, C supports the key C, which defaults
957							to C<0>. C must be a non-negative integer and sets the "floor" above
958							which new unique IDs will be assigned to the C column. Hence, if
959							C is set to C<500>, the value assigned to the C column of the
960							first record to be processed will be C<501>.
961
962							Starting with version .19, C will serve as an alternative way of
963							providing the same information to C. If, however, by mistake
964							you provide B C and C elements in the hash, C
965							will take precedence.
966
967							=item * Return Value
968
969							Reference to an array of hash references. Each element represents one node in
970							the taxonomy. Each element will have key-value pairs for C, C
971							and C which will hold the adjacentification of the materialized path in the
972							original taxonomy-by-materialized-path. Each element will, as well, have KVPs for the
973							non-materialized-path fields in the records in the original taxonomy-by-materialized-path.
974
975							=item * Comment
976
977							See documentation for C for example.
978
979							Note that the order in which C will assign C and
980							C values to records in the taxonomy-by-adjacent-list will almost
981							certainly B match the order in which elements appear in a CSV file or in
982							the data structure returned by a method such as C.
983
984							=back
985
986							=cut
987
988							sub adjacentify {
989	15			15	1	9045	my ($self, $args) = @_;
990	15					26	my $serial = 0;
991	15	100				38	if (defined $args) {
992	9	100	100			353	croak "Argument to 'adjacentify()' must be hashref"
993							unless (ref($args) and reftype($args) eq 'HASH');
994	7					17	for my $w ('serial', 'floor') {
995	13	100				33	if (exists $args->{$w}) {
996							croak "Element '$w' in argument to 'adjacentify()' must be integer"
997	8	100				260	unless ($args->{$w} =~ m/^\d+$/);
998							}
999							}
1000	5		50			33	$serial = $args->{serial} \|\| $args->{floor} \|\| 0;
1001							}
1002
1003
1004	11					58	my $fields = $self->fields();
1005	11					34	my $drpc = $self->data_records_path_components();
1006
1007	11					31	my $path_col_idx = $self->path_col_idx();
1008	41					96	my %non_path_col2idx = map { $fields->[$_] => $_ }
1009	52					102	grep { $_ != $path_col_idx }
1010	11					20	(0..$#{$fields});
	11					20
1011
1012							my @components_by_row =
1013	11					25	map { my $f = $_->[$path_col_idx]; my $c = $#{$f}; [ @{$f}[1..$c] ] } @{$drpc};
	156					199
	156					164
	156					228
	156					206
	156					436
	11					22
1014	11					24	my $max_components = max( map { scalar(@{$_}) } @components_by_row);
	156					165
	156					257
1015	11					16	my @adjacentified = ();
1016	11					19	my %paths_to_id;
1017	11					35	for my $depth (1..$max_components) {
1018	34					80	for (my $r = 0; $r <= $#components_by_row; $r++) {
1019	498	100				487	if (scalar(@{$components_by_row[$r]}) == $depth) {
	498					1496
1020	156					299	my %rowdata = map { $_ => $drpc->[$r]->[$non_path_col2idx{$_}] }
	568					1341
1021							keys %non_path_col2idx;
1022	156					276	my @path_components = @{$drpc->[$r]->[$path_col_idx]};
	156					396
1023	156					232	my $name = $path_components[-1];
1024	156					331	my $parent_of_name = join('\|' =>
1025							@path_components[1 .. ($#path_components -1)]);
1026
1027	156	100				348	my $candidate_for_path = (length($parent_of_name))
1028							? join('\|' => $parent_of_name, $name)
1029							: $name;
1030
1031							my %rowhash = (
1032							id => ++$serial,
1033							parent_id => $paths_to_id{$parent_of_name}{id},
1034	156					885	name => $name,
1035							%rowdata,
1036							);
1037	156					486	$paths_to_id{$candidate_for_path}{id} = $rowhash{id};
1038	156	100				408	$paths_to_id{$candidate_for_path}{parent_path} = $parent_of_name
1039							if (length($parent_of_name));
1040	156					719	push @adjacentified, \%rowhash;
1041							}
1042							}
1043							}
1044	11					285	return \@adjacentified;
1045							}
1046
1047							=head2 C
1048
1049							=over 4
1050
1051							=item * Purpose
1052
1053							Create a CSV-formatted file holding the data returned by C.
1054
1055							=item * Arguments
1056
1057							$csv_file = $self->write_adjacentified_to_csv( {
1058							adjacentified => $adjacentified, # output of adjacentify()
1059							csvfile => './t/data/taxonomy_out3.csv',
1060							} );
1061
1062							Single hash reference. That hash is keyed on:
1063
1064							=over 4
1065
1066							=item * C
1067
1068							B Its value must be the arrayref of hash references returned by
1069							the C method.
1070
1071							=item * C
1072
1073							Optional. Path to location where a CSV-formatted text file holding the
1074							taxonomy-by-adjacent-list will be written. Defaults to a file called
1075							F in the current working directory.
1076
1077							=item * Text::CSV_XS options
1078
1079							You can also pass through any key-value pairs normally accepted by
1080							F.
1081
1082							=back
1083
1084							=item * Return Value
1085
1086							Returns path to CSV-formatted text file just created.
1087
1088							=item * Example
1089
1090							Suppose we have a CSV-formatted file holding the following taxonomy-by-materialized-path:
1091
1092							"path","is_actionable"
1093							"\|Alpha","0"
1094							"\|Beta","0"
1095							"\|Alpha\|Epsilon","0"
1096							"\|Alpha\|Epsilon\|Kappa","1"
1097							"\|Alpha\|Zeta","0"
1098							"\|Alpha\|Zeta\|Lambda","1"
1099							"\|Alpha\|Zeta\|Mu","0"
1100							"\|Beta\|Eta","1"
1101							"\|Beta\|Theta","1"
1102
1103							After running this file through C, C and
1104							C we will have a new CSV-formatted file holding
1105							this taxonomy-by-adjacent-list:
1106
1107							id,parent_id,name,is_actionable
1108							1,,Alpha,0
1109							2,,Beta,0
1110							3,1,Epsilon,0
1111							4,1,Zeta,0
1112							5,2,Eta,1
1113							6,2,Theta,1
1114							7,3,Kappa,1
1115							8,4,Lambda,1
1116							9,4,Mu,0
1117
1118							Note that the C column has been replaced by the C, C and
1119							C columns.
1120
1121							=back
1122
1123							=cut
1124
1125							sub write_adjacentified_to_csv {
1126	14			14	1	12827	my ($self, $args) = @_;
1127	14	100				38	if (defined $args) {
1128	13	100	100			278	croak "Argument to 'adjacentify()' must be hashref"
1129							unless (ref($args) and reftype($args) eq 'HASH');
1130							croak "Argument to 'adjacentify()' must have 'adjacentified' element"
1131	11	100				127	unless exists $args->{adjacentified};
1132							croak "Argument 'adjacentified' must be array reference"
1133							unless (ref($args->{adjacentified}) and
1134	10	100	100			262	reftype($args->{adjacentified}) eq 'ARRAY');
1135							}
1136							else {
1137	1					107	croak "write_adjacentified_to_csv() must be supplied with hashref"
1138							}
1139	8					16	my $adjacentified = $args->{adjacentified};
1140	8					14	delete $args->{adjacentified};
1141
1142	8					28	my $columns_in = $self->fields;
1143							my @non_path_columns_in =
1144	26					57	map { $columns_in->[$_] }
1145	34					72	grep { $_ != $self->{path_col_idx} }
1146	8					14	(0..$#{$columns_in});
	8					17
1147	8					22	my @columns_out = (qw\| id parent_id name \|);
1148	8					14	push @columns_out, @non_path_columns_in;
1149
1150	8					42222	my $cwd = cwd();
1151							my $csvfile = defined($args->{csvfile})
1152							? $args->{csvfile}
1153	8	100				89	: "$cwd/taxonomy_out.csv";
1154	8					66	delete $args->{csvfile};
1155
1156							# By this point, we should have processed all args other than those
1157							# intended for Text::CSV_XS and assigned their contents to variables as
1158							# needed.
1159
1160	8					67	my $csv_args = { binary => 1 };
1161	8					21	while (my ($k,$v) = each %{$args}) {
	11					70
1162	3					17	$csv_args->{$k} = $v;
1163							}
1164	8					153	my $csv = Text::CSV_XS->new($csv_args);
1165	8	50		1		2840	open my $OUT, ">:encoding(utf8)", $csvfile
	1					14
	1					8
	1					42
1166							or croak "Unable to open $csvfile for writing";
1167	8	100				13519	$csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
1168	8					413	$csv->print($OUT, [@columns_out]);
1169	8					83	for my $rec (@{$adjacentified}) {
	8					24
1170							$csv->print(
1171							$OUT,
1172	117					819	[ map { $rec->{$columns_out[$_]} } (0..$#columns_out) ]
	724					1920
1173							);
1174							}
1175	8	50				594	close $OUT or croak "Unable to close $csvfile after writing";
1176
1177	8					148	return $csvfile;
1178							}
1179
1180							1;
1181
1182							# vim: formatoptions=crqot