File Coverage

lib/Parse/Taxonomy/MaterializedPath.pm

Criterion	Covered	Total	%
statement	326	326	100.0
branch	134	140	95.7
condition	39	40	97.5
subroutine	24	24	100.0
pod	11	11	100.0
total	534	541	98.7

line	stmt	bran	cond	sub	pod	time	code
1							package Parse::Taxonomy::MaterializedPath;
2	6			6		13007	use strict;
	6					8
	6					157
3	6			6		1845	use parent qw( Parse::Taxonomy );
	6					1127
	6					22
4	6			6		185	use Carp;
	6					5
	6					249
5	6			6		3550	use Text::CSV_XS;
	6					36044
	6					263
6	6			6		29	use Scalar::Util qw( reftype );
	6					8
	6					239
7	6			6		19	use List::Util qw( max );
	6					38
	6					454
8	6			6		23	use Cwd;
	6					7
	6					379
9							our $VERSION = '0.23';
10	6					14563	use Parse::Taxonomy::Auxiliary qw(
11							path_check_fields
12							components_check_fields
13	6			6		1481	);
	6					8
14
15							=head1 NAME
16
17							Parse::Taxonomy::MaterializedPath - Validate a file for use as a path-based taxonomy
18
19							=head1 SYNOPSIS
20
21							use Parse::Taxonomy::MaterializedPath;
22
23							# 'file' interface: reads a CSV file for you
24
25							$source = "./t/data/alpha.csv";
26							$self = Parse::Taxonomy::MaterializedPath->new( {
27							file => $source,
28							} );
29
30							# 'components' interface: as if you've already read a
31							# CSV file and now have Perl array references to header and data rows
32
33							$self = Parse::Taxonomy::MaterializedPath->new( {
34							components => {
35							fields => $fields,
36							data_records => $data_records,
37							}
38							} );
39
40							=head1 METHODS
41
42							=head2 C
43
44							=over 4
45
46							=item * Purpose
47
48							Parse::Taxonomy::MaterializedPath constructor.
49
50							=item * Arguments
51
52							Single hash reference. There are two possible interfaces: C and C.
53
54							=over 4
55
56							=item 1 C interface
57
58							$source = "./t/data/alpha.csv";
59							$self = Parse::Taxonomy::MaterializedPath->new( {
60							file => $source,
61							path_col_idx => 0,
62							path_col_sep => '\|',
63							%TextCSVoptions,
64							} );
65
66							Elements in the hash reference are keyed on:
67
68							=over 4
69
70							=item * C
71
72							Absolute or relative path to the incoming taxonomy file.
73							B for this interface.
74
75							=item * C
76
77							If the column to be used as the "path" column in the incoming taxonomy file is
78							B the first column, this option must be set to the integer representing
79							the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
80
81							=item * C
82
83							If the string used to distinguish components of the path in the path column in
84							the incoming taxonomy file is not a pipe (C<\|>), this option must be set.
85							Optional; defaults to C<\|>.
86
87							=item * Text::CSV_XS options
88
89							Any other options which could normally be passed to Cnew()> will
90							be passed through to that module's constructor. On the recommendation of the
91							Text::CSV documentation, C is always set to a true value.
92
93							=back
94
95							=item 2 C interface
96
97							$self = Parse::Taxonomy::MaterializedPath->new( {
98							components => {
99							fields => $fields,
100							data_records => $data_records,
101							}
102							} );
103
104							Elements in this hash are keyed on:
105
106							=over 4
107
108							=item * C
109
110							This element is B for the
111							C interface. The value of this element is a hash reference with two keys, C and
112							C. C is a reference to an array holding the field or
113							column names for the data set. C is a reference to an array of
114							array references, each of the latter arrayrefs holding one record or row from
115							the data set.
116
117							=item * C
118
119							Same as in C interface above.
120
121							=item * C
122
123							Same as in C interface above.
124
125							=back
126
127							=back
128
129							=item * Return Value
130
131							Parse::Taxonomy::MaterializedPath object.
132
133							=item * Comment
134
135							C will throw an exception under any of the following conditions:
136
137							=over 4
138
139							=item * Argument to C is not a reference.
140
141							=item * Argument to C is not a hash reference.
142
143							=item * In the C interface, unable to locate the file which is the value of the C element.
144
145							=item * Argument to C element is not an integer.
146
147							=item * Argument to C is greater than the index number of the
148							last element in the header row of the incoming taxonomy file, I the
149							C is wrong.
150
151							=item * The same field is found more than once in the header row of the
152							incoming taxonomy file.
153
154							=item * Unable to open or close the incoming taxonomy file for reading.
155
156							=item * In the column designated as the "path" column, the same value is
157							observed more than once.
158
159							=item * C, C, C, C and C are reserved terms.
160							One or more columns is named with a reserved term.
161
162							=item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
163
164							=item * A data row has a number of fields different from the number of fields
165							in the header row.
166
167							=back
168
169							=back
170
171							=cut
172
173							sub new {
174	57			57	1	32597	my ($class, $args) = @_;
175	57					58	my $data;
176
177	57	100	100			629	croak "Argument to 'new()' must be hashref"
178							unless (ref($args) and reftype($args) eq 'HASH');
179	55					50	my $argscount = 0;
180	55	100				110	$argscount++ if $args->{file};
181	55	100				102	$argscount++ if $args->{components};
182	55	100				178	croak "Argument to 'new()' must have either 'file' or 'components' element"
183							if ($argscount == 0);
184	54	100				226	croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
185							if ($argscount == 2);
186
187	53	100				84	if (exists $args->{path_col_idx}) {
188							croak "Argument to 'path_col_idx' must be integer"
189	4	100				107	unless $args->{path_col_idx} =~ m/^\d+$/;
190							}
191	52		100			206	$data->{path_col_idx} = delete $args->{path_col_idx} \|\| 0;
192							$data->{path_col_sep} = exists $args->{path_col_sep}
193							? $args->{path_col_sep}
194	52	100				142	: '\|';
195	52	100				63	if (exists $args->{path_col_sep}) {
196	5					8	$data->{path_col_sep} = $args->{path_col_sep};
197	5					6	delete $args->{path_col_sep};
198							}
199							else {
200	47					58	$data->{path_col_sep} = '\|';
201							}
202
203	52	100				67	if ($args->{components}) {
204							croak "Value of 'components' element must be hashref"
205	25	100	100			296	unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
206	23					69	for my $k ( qw\| fields data_records \| ) {
207							croak "Value of 'components' element must have '$k' key-value pair"
208	43	100				253	unless exists $args->{components}->{$k};
209							croak "Value of '$k' element must be arrayref"
210							unless (ref($args->{components}->{$k}) and
211	41	100	100			430	reftype($args->{components}->{$k}) eq 'ARRAY');
212							}
213	18					19	for my $row (@{$args->{components}->{data_records}}) {
	18					69
214	231	100	100			818	croak "Each element in 'data_records' array must be arrayref"
215							unless (ref($row) and reftype($row) eq 'ARRAY');
216							}
217							# We don't want to stick $args->{components} into the object as is.
218							# Rather, we want to insert 'fields' and 'data_records' for
219							# consistency with the 'file' interface. But to do that we first need
220							# to impose the same validations that we do for the 'file' interface.
221							# We also need to populate 'path_col'.
222	16					33	_prepare_fields($data, $args->{components}->{fields}, 1);
223	13					15	my $these_data_records = $args->{components}->{data_records};
224	13					15	delete $args->{components};
225	13					26	_prepare_data_records($data, $these_data_records, $args);
226							}
227							else {
228							croak "Cannot locate file '$args->{file}'"
229	27	100				500	unless (-f $args->{file});
230	26					41	$data->{file} = delete $args->{file};
231
232							# We've now handled all the Parse::Taxonomy::MaterializedPath-specific options.
233							# Any remaining options are assumed to be intended for Text::CSV_XS::new().
234
235	26					32	$args->{binary} = 1;
236	26	50				107	my $csv = Text::CSV_XS->new ( $args )
237							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
238							open my $IN, "<", $data->{file}
239	26	50				2359	or croak "Unable to open '$data->{file}' for reading";
240	4			4		1478	my $header_ref = $csv->getline($IN);
	4					13251
	4					118
	26					893
241
242	26					874	_prepare_fields($data, $header_ref);
243	23					454	my $data_records = $csv->getline_all($IN);
244	23	50				5459	close $IN or croak "Unable to close after reading";
245	23					52	_prepare_data_records($data, $data_records, $args);
246							}
247
248	27					28	while (my ($k,$v) = each %{$args}) {
	45					102
249	18					31	$data->{$k} = $v;
250							}
251	27					31	my %row_analysis = ();
252	27					24	for my $el (@{$data->{data_records}}) {
	27					47
253	364					268	my $rowkey = $el->[$data->{path_col_idx}];
254	364					633	$row_analysis{$rowkey} = split(/\Q$data->{path_col_sep}\E/, $rowkey);
255							}
256	27					52	$data->{row_analysis} = \%row_analysis;
257	27					98	return bless $data, $class;
258							}
259
260							sub _prepare_fields {
261	42			42		53	my ($data, $fields_ref, $components) = @_;
262	42	100				71	if (! $components) {
263	26					65	_check_path_col_idx($data, $fields_ref, 0);
264	25					59	path_check_fields($data, $fields_ref);
265							}
266							else {
267	16					26	_check_path_col_idx($data, $fields_ref, 1);
268	15					35	components_check_fields($data, $fields_ref);
269							}
270
271	38					37	my %fields_seen = map { $_ => 1 } @{$fields_ref};
	203					239
	38					51
272	38					69	my @bad_fields = ();
273	38					50	for my $reserved ( qw\| id parent_id name lft rgh \| ) {
274	190	100				260	push @bad_fields, $reserved if $fields_seen{$reserved};
275							}
276	38					88	my $msg = "Bad column names: <@bad_fields>. These are reserved for ";
277	38					29	$msg .= "Parse::Taxonomy's internal use; please rename";
278	38	100				252	croak $msg if @bad_fields;
279
280	36					47	$data->{fields} = $fields_ref;
281	36					59	$data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
282	36					52	return $data;
283							}
284
285							sub _check_path_col_idx {
286	42			42		37	my ($data, $fields_ref, $components) = @_;
287	42					44	my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
288	42	100				90	$error_msg .= $components
289							? "'fields' array ref"
290							: "header row in '$data->{file}'";
291
292	42	100				35	croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
	42					313
293							}
294
295							sub _prepare_data_records {
296							# Confirm each row's path starts with path_col_sep:
297							# Confirm no duplicate entries in column holding path:
298							# Confirm all rows have same number of columns as header:
299	36			36		40	my ($data, $data_records, $args) = @_;
300	36					33	my $error_msg;
301	36					40	my @bad_path_cols = ();
302	36					26	my @bad_count_records = ();
303	36					43	my %paths_seen = ();
304	36					31	my $field_count = scalar(@{$data->{fields}});
	36					58
305	36					27	for my $rec (@{$data_records}) {
	36					63
306	476	100				1276	unless ($rec->[$data->{path_col_idx}] =~ m/^\Q$data->{path_col_sep}\E/) {
307	6					9	push @bad_path_cols, $rec->[$data->{path_col_idx}];
308							}
309	476					550	$paths_seen{$rec->[$data->{path_col_idx}]}++;
310	476					277	my $this_row_count = scalar(@{$rec});
	476					342
311	476	100				655	if ($this_row_count != $field_count) {
312							push @bad_count_records,
313	5					11	[ $rec->[$data->{path_col_idx}], $this_row_count ];
314							}
315							}
316	36					46	$error_msg = <
317							The value of the column designated as path must start with the path column separator.
318							Rows with the following paths fail to do so:
319							IMPROPER_PATH
320	36					41	for my $path (@bad_path_cols) {
321	6					10	$error_msg .= " $path\n";
322							}
323	36	100				276	croak $error_msg if @bad_path_cols;
324
325	34					33	my @dupe_paths = ();
326	34					210	for my $path (sort keys %paths_seen) {
327	443	100				509	push @dupe_paths, $path if $paths_seen{$path} > 1;
328							}
329	34					52	$error_msg = <
330							No duplicate entries are permitted in column designated as path.
331							The following entries appear the number of times shown:
332							ERROR_MSG_DUPE
333	34					36	for my $path (@dupe_paths) {
334	5					19	$error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
335							}
336	34	100				346	croak $error_msg if @dupe_paths;
337
338	31					49	$error_msg = <
339							Header row has $field_count records. The following records had different counts:
340							ERROR_MSG_WRONG_COUNT
341	31					38	for my $rec (@bad_count_records) {
342	4					8	$error_msg .= " $rec->[0]: $rec->[1]\n";
343							}
344	31	100				227	croak $error_msg if @bad_count_records;
345
346							# Confirm each node appears in taxonomy:
347	29					28	my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
	19					49
	29					67
348	29					48	$path_args->{sep} = $data->{path_col_sep};
349	29	50				117	my $path_csv = Text::CSV_XS->new ( $path_args )
350							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
351	29					2173	my %missing_parents = ();
352	29					151	for my $path (sort keys %paths_seen) {
353	386					548	my $status = $path_csv->parse($path);
354	386					3321	my @columns = $path_csv->fields();
355	386	100				1655	if (@columns > 2) {
356							my $parent =
357	281					450	join($path_args->{sep} => @columns[0 .. ($#columns - 1)]);
358	281	100				538	unless (exists $paths_seen{$parent}) {
359	4					7	$missing_parents{$path} = $parent;
360							}
361							}
362							}
363	29					41	$error_msg = <
364							Each node in the taxonomy must have a parent.
365							The following nodes lack the expected parent:
366							ERROR_MSG_ORPHAN
367	29					56	for my $path (sort keys %missing_parents) {
368	4					7	$error_msg .= " $path: $missing_parents{$path}\n";
369							}
370	29	100				238	croak $error_msg if scalar(keys %missing_parents);
371							# BBB end of validations
372	27					37	$data->{data_records} = $data_records;
373
374	27					244	return $data;
375							}
376
377							=head2 C
378
379							=over 4
380
381							=item * Purpose
382
383							Identify the names of the columns in the taxonomy.
384
385							=item * Arguments
386
387							my $fields = $self->fields();
388
389							No arguments; the information is already inside the object.
390
391							=item * Return Value
392
393							Reference to an array holding a list of the columns as they appear in the
394							header row of the incoming taxonomy file.
395
396							=item * Comment
397
398							Read-only.
399
400							=back
401
402							=head2 C
403
404							=over 4
405
406							=item * Purpose
407
408							Identify the index position (count starts at 0) of the column in the incoming
409							taxonomy file which serves as the path column.
410
411							=item * Arguments
412
413							my $path_col_idx = $self->path_col_idx;
414
415							No arguments; the information is already inside the object.
416
417							=item * Return Value
418
419							Integer in the range from 0 to 1 less than the number of columns in the header
420							row.
421
422							=item * Comment
423
424							Read-only.
425
426							=back
427
428							=cut
429
430							sub path_col_idx {
431	15			15	1	2479	my $self = shift;
432	15					20	return $self->{path_col_idx};
433							}
434
435							=head2 C
436
437							=over 4
438
439							=item * Purpose
440
441							Identify the name of the column in the incoming taxonomy which serves as the
442							path column.
443
444							=item * Arguments
445
446							my $path_col = $self->path_col;
447
448							No arguments; the information is already inside the object.
449
450							=item * Return Value
451
452							String.
453
454							=item * Comment
455
456							Read-only.
457
458							=back
459
460							=cut
461
462							sub path_col {
463	4			4	1	1355	my $self = shift;
464	4					8	return $self->{path_col};
465							}
466
467							=head2 C
468
469							=over 4
470
471							=item * Purpose
472
473							Identify the string used to separate path components once the taxonomy has
474							been created. This is just a "getter" and is logically distinct from the
475							option to C which is, in effect, a "setter."
476
477							=item * Arguments
478
479							my $path_col_sep = $self->path_col_sep;
480
481							No arguments; the information is already inside the object.
482
483							=item * Return Value
484
485							String.
486
487							=item * Comment
488
489							Read-only.
490
491							=back
492
493							=cut
494
495							sub path_col_sep {
496	4			4	1	972	my $self = shift;
497	4					6	return $self->{path_col_sep};
498							}
499
500							=head2 C
501
502							=over 4
503
504							=item * Purpose
505
506							Once the taxonomy has been validated, get a list of its data rows as a Perl
507							data structure.
508
509							=item * Arguments
510
511							$data_records = $self->data_records;
512
513							None.
514
515							=item * Return Value
516
517							Reference to array of array references. The array will hold the data records
518							found in the incoming taxonomy file in their order in that file.
519
520							=item * Comment
521
522							Does not contain any information about the fields in the taxonomy, so you
523							should probably either (a) use in conjunction with C method above;
524							or (b) use C.
525
526							=back
527
528							=head2 C
529
530							=over 4
531
532							=item * Purpose
533
534							Once the taxonomy has been validated, get a list of its header and data rows as a Perl
535							data structure.
536
537							=item * Arguments
538
539							$data_records = $self->fields_and_data_records;
540
541							None.
542
543							=item * Return Value
544
545							Reference to array of array references. The first element in the array will
546							hold the header row (same as output of C). The remaining elements
547							will hold the data records found in the incoming taxonomy file in their order
548							in that file.
549
550							=back
551
552							=cut
553
554							=head2 C
555
556							=over 4
557
558							=item * Purpose
559
560							Once the taxonomy has been validated, get a list of its data rows as a Perl
561							data structure. In each element of this list, the path is now represented as
562							an array reference rather than a string.
563
564							=item * Arguments
565
566							$data_records_path_components = $self->data_records_path_components;
567
568							None.
569
570							=item * Return Value
571
572							Reference to array of array references. The array will hold the data records
573							found in the incoming taxonomy file in their order in that file.
574
575							=item * Comment
576
577							Does not contain any information about the fields in the taxonomy, so you may
578							wish to use this method either (a) use in conjunction with C method
579							above; or (b) use C.
580
581							=back
582
583							=cut
584
585							sub data_records_path_components {
586	13			13	1	1340	my $self = shift;
587	13					19	my @all_rows = ();
588	13					8	for my $row (@{$self->{data_records}}) {
	13					28
589	182					144	my $path_col = $row->[$self->{path_col_idx}];
590	182					388	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
591	182					125	my @rewritten = ();
592	182					119	for (my $i = 0; $i <= $#{$row}; $i++) {
	1062					1256
593	880	100				777	if ($i != $self->{path_col_idx}) {
594	698					652	push @rewritten, $row->[$i];
595							}
596							else {
597	182					221	push @rewritten, \@path_components;
598							}
599							}
600	182					214	push @all_rows, \@rewritten;
601							}
602	13					40	return \@all_rows;
603							}
604
605							=head2 C
606
607							=over 4
608
609							=item * Purpose
610
611							Once the taxonomy has been validated, get a list of its data rows as a Perl
612							data structure. The first element in this list is an array reference holding
613							the header row. In each data element of this list, the path is now represented as
614							an array reference rather than a string.
615
616							=item * Arguments
617
618							$fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
619
620							None.
621
622							=item * Return Value
623
624							Reference to array of array references. The array will hold the data records
625							found in the incoming taxonomy file in their order in that file.
626
627							=back
628
629							=cut
630
631							sub fields_and_data_records_path_components {
632	2			2	1	1220	my $self = shift;
633	2					4	my @all_rows = $self->fields;
634	2					3	for my $row (@{$self->{data_records}}) {
	2					4
635	26					22	my $path_col = $row->[$self->{path_col_idx}];
636	26					57	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
637	26					20	my @rewritten = ();
638	26					18	for (my $i = 0; $i <= $#{$row}; $i++) {
	182					207
639	156	100				137	if ($i != $self->{path_col_idx}) {
640	130					137	push @rewritten, $row->[$i];
641							}
642							else {
643	26					31	push @rewritten, \@path_components;
644							}
645							}
646	26					31	push @all_rows, \@rewritten;
647							}
648	2					4	return \@all_rows;
649							}
650
651							=head2 C
652
653							=over 4
654
655							=item * Purpose
656
657							Identify the index position of a given field within the header row.
658
659							=item * Arguments
660
661							$index = $self->get_field_position('income');
662
663							Takes a single string holding the name of one of the fields (column names).
664
665							=item * Return Value
666
667							Integer representing the index position (counting from C<0>) of the field
668							provided as argument. Throws exception if the argument is not actually a
669							field.
670
671							=back
672
673							=cut
674
675							=head2 C
676
677							=over 4
678
679							=item * Purpose
680
681							Display the number of descendant (multi-generational) nodes each node in the
682							taxonomy has.
683
684							=item * Arguments
685
686							$descendant_counts = $self->descendant_counts();
687
688							$descendant_counts = $self->descendant_counts( { generations => 1 } );
689
690							None required; one optional hash reference. Currently, the only element
691							honored in that hashref is C, whose value must be a non-negative
692							integer. If, instead of getting the count of all descendants of a node, you
693							only want the count of its first generation, i.e., its immediate children, you
694							provide a value of C<1>. Want the count of only the first and second
695							generations? Provide a value of C<2> -- and so on.
696
697							=item * Return Value
698
699							Reference to hash in which each element is keyed on the value of the path
700							column in the incoming taxonomy file.
701
702							=back
703
704							=cut
705
706							sub descendant_counts {
707	9			9	1	3395	my ($self, $args) = @_;
708	9	100				19	if (defined $args) {
709	5	100	100			238	croak "Argument to 'descendant_counts()' must be hashref"
710							unless (ref($args) and reftype($args) eq 'HASH');
711							croak "Value for 'generations' element passed to descendant_counts() must be integer > 0"
712	3	100	100			159	unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
713							}
714	5					8	my %descendant_counts = ();
715	5					8	my $hashified = $self->hashify();
716	5					7	for my $p (keys %{$hashified}) {
	5					14
717	59					46	$descendant_counts{$p} = 0;
718	59					44	for my $q (
719	707					661	grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
720	59					96	keys %{$hashified}
721							) {
722	230	100				700	if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
723	61	100				64	if (! $args->{generations}) {
724	48					50	$descendant_counts{$p}++;
725							}
726							else {
727	13					41	my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
728	13					28	my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
729							$descendant_counts{$p}++
730	13	100				34	if (scalar(@d) - scalar(@c) <= $args->{generations});
731							}
732							}
733							}
734							}
735	5					9	$self->{descendant_counts} = \%descendant_counts;
736	5					46	return $self->{descendant_counts};
737							}
738
739							=head2 C
740
741							=over 4
742
743							=item * Purpose
744
745							Get the total number of descendant nodes for one specific node in a validated
746							taxonomy.
747
748							=item * Arguments
749
750							$descendant_count = $self->get_descendant_count('\|Path\|To\|Node');
751
752							$descendant_counts = $self->get_descendant_count('\|Path\|To\|Node', { generations => 1 } );
753
754							One required: string containing node's path as spelled in the taxonomy.
755
756							One optional hash reference. Currently, the only element honored in that
757							hashref is C, whose value must be a non-negative integer. If,
758							instead of getting the count of all descendants of a node, you only want the
759							count of its first generation, i.e., its immediate children, you provide a
760							value of C<1>. Want the count of only first and second generations? Provide
761							a value of C<2> -- and so on.
762
763							=item * Return Value
764
765							Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
766							leaf node.
767
768							=item * Comment
769
770							Will throw an exception if the node does not exist or is misspelled.
771
772							If C is called with no second (hashref) argument
773							following an invocation of C, it will return a value from
774							an internal cache created during that earlier method call. Otherwise, it will
775							re-create the cache from scratch. (This, of course, assumes that you have not
776							manipulated the object's internal data subsequent to its creation.)
777
778							=back
779
780							=cut
781
782							sub get_descendant_count {
783	26			26	1	7609	my ($self, $node, $args) = @_;
784	26	100				53	if (defined $args) {
785	15	100	100			218	croak "Second argument to 'get_descendant_count()' must be hashref"
786							unless (ref($args) and reftype($args) eq 'HASH');
787							croak "Value for 'generations' element passed to second argument to get_descendant_count() must be integer > 0"
788	13	100	100			203	unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
789							}
790	22	100				33	if (exists $self->{descendant_counts}) {
791	6					6	my $descendant_counts = $self->{descendant_counts};
792	6	100				178	croak "Node '$node' not found" unless exists $descendant_counts->{$node};
793	4					9	return $descendant_counts->{$node};
794							}
795							else {
796	16					22	my %descendant_counts = ();
797	16					22	my $hashified = $self->hashify();
798	16	100				104	croak "Node '$node' not found" unless exists $hashified->{$node};
799	15					16	for my $p ($node) {
800	15					16	$descendant_counts{$p} = 0;
801	15					9	for my $q (
802	231					248	grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
803	15					43	keys %{$hashified}
804							) {
805	110	100				337	if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
806	36	100				41	if (! $args->{generations}) {
807	10					13	$descendant_counts{$p}++;
808							}
809							else {
810	26					56	my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
811	26					57	my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
812							$descendant_counts{$p}++
813	26	100				67	if (scalar(@d) - scalar(@c) <= $args->{generations});
814							}
815							}
816							}
817							}
818	15					151	return $descendant_counts{$node};
819							}
820							}
821
822							=head2 C
823
824							=over 4
825
826							=item * Purpose
827
828							Turn a validated taxonomy into a Perl hash keyed on the column designated as
829							the path column.
830
831							=item * Arguments
832
833							$hashref = $self->hashify();
834
835							Takes an optional hashref holding a list of any of the following elements:
836
837							=over 4
838
839							=item * C
840
841							Boolean, defaulting to C<0>. By default, C will spell the
842							key of the hash exactly as the value of the path column is spelled in the
843							taxonomy -- which in turn is the way it was spelled in the incoming file.
844							That is, a path in the taxonomy spelled C<\|Alpha\|Beta\|Gamma> will be spelled
845							as a key in exactly the same way.
846
847							However, since in many cases (including the example above) the root node of
848							the taxonomy will be empty, the user may wish to remove the first instance of
849							C. The user would do so by setting
850							C to a true value.
851
852							$hashref = $self->hashify( {
853							remove_leading_path_col_sep => 1,
854							} );
855
856							In that case they key would now be spelled: C.
857
858							Note further that if the C switch is set to a true value, any
859							setting to C will be ignored.
860
861							=item * C
862
863							A string which will be used in composing the key of the hashref returned by
864							this method. The user may select this key if she does not want to use the
865							value found in the incoming CSV file (which by default will be the pipe
866							character (C<\|>) and which may be overridden with the C argument
867							to C.
868
869							$hashref = $self->hashify( {
870							key_delim => q{ - },
871							} );
872
873							In the above variant, a path that in the incoming taxonomy file was
874							represented by C<\|Alpha\|Beta\|Gamma> will in C<$hashref> be represented by
875							C< - Alpha - Beta - Gamma>.
876
877							=item * C
878
879							A string which will be used in composing the key of the hashref returned by
880							this method. The user will set this switch if she wishes to have the root
881							note explicitly represented. Using this switch will automatically cause
882							C to be ignored.
883
884							Suppose the user wished to have C be the text for the root
885							node. Suppose further that the user wanted to use the string C< - > as the
886							delimiter within the key.
887
888							$hashref = $self->hashify( {
889							root_str => q{All Suppliers},
890							key_delim => q{ - },
891							} );
892
893							Then incoming path C<\|Alpha\|Beta\|Gamma> would be keyed as:
894
895							All Suppliers - Alpha - Beta - Gamma
896
897							=back
898
899							=item * Return Value
900
901							Hash reference. The number of elements in this hash should be equal to the
902							number of non-header records in the taxonomy.
903
904							=back
905
906							=cut
907
908							sub hashify {
909	33			33	1	12134	my ($self, $args) = @_;
910	33	100				58	if (defined $args) {
911	8	100	100			272	croak "Argument to 'hashify()' must be hashref"
912							unless (ref($args) and reftype($args) eq 'HASH');
913							}
914	31					30	my %hashified = ();
915	31					35	my $fields = $self->{fields};
916	31					31	my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
	170					246
	31					52
917	31					44	for my $rec (@{$self->{data_records}}) {
	31					54
918	427					215	my $rowkey;
919	427	100				424	if ($args->{root_str}) {
920	39					48	$rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
921							}
922							else {
923	388	100				343	if ($args->{remove_leading_path_col_sep}) {
924	26					127	($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
925							}
926							else {
927	362					301	$rowkey = $rec->[$self->{path_col_idx}];
928							}
929							}
930	427	100				487	if ($args->{key_delim}) {
931	52					149	$rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
932							}
933	427					250	my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
	2402					2946
	427					378
934	427					755	$hashified{$rowkey} = $rowdata;
935							}
936	31					76	return \%hashified;
937							}
938
939							=head2 C
940
941							=over 4
942
943							=item * Purpose
944
945							Transform a taxonomy-by-materialized-path into a taxonomy-by-adjacent-list.
946
947							=item * Arguments
948
949							$adjacentified = $self->adjacentify();
950
951							$adjacentified = $self->adjacentify( { serial => 500 } );
952							$adjacentified = $self->adjacentify( { floor => 500 } ); # same as serial
953
954							Optional single hash reference.
955
956							For that hashref, C supports the key C, which defaults
957							to C<0>. C must be a non-negative integer and sets the "floor" above
958							which new unique IDs will be assigned to the C column. Hence, if
959							C is set to C<500>, the value assigned to the C column of the
960							first record to be processed will be C<501>.
961
962							Starting with version .19, C will serve as an alternative way of
963							providing the same information to C. If, however, by mistake
964							you provide B C and C elements in the hash, C
965							will take precedence.
966
967							=item * Return Value
968
969							Reference to an array of hash references. Each element represents one node in
970							the taxonomy. Each element will have key-value pairs for C, C
971							and C which will hold the adjacentification of the materialized path in the
972							original taxonomy-by-materialized-path. Each element will, as well, have KVPs for the
973							non-materialized-path fields in the records in the original taxonomy-by-materialized-path.
974
975							=item * Comment
976
977							See documentation for C for example.
978
979							Note that the order in which C will assign C and
980							C values to records in the taxonomy-by-adjacent-list will almost
981							certainly B match the order in which elements appear in a CSV file or in
982							the data structure returned by a method such as C.
983
984							=back
985
986							=cut
987
988							sub adjacentify {
989	15			15	1	5649	my ($self, $args) = @_;
990	15					17	my $serial = 0;
991	15	100				38	if (defined $args) {
992	9	100	100			292	croak "Argument to 'adjacentify()' must be hashref"
993							unless (ref($args) and reftype($args) eq 'HASH');
994	7					10	for my $w ('serial', 'floor') {
995	13	100				19	if (exists $args->{$w}) {
996							croak "Element '$w' in argument to 'adjacentify()' must be integer"
997	8	100				189	unless ($args->{$w} =~ m/^\d+$/);
998							}
999							}
1000	5		50			18	$serial = $args->{serial} \|\| $args->{floor} \|\| 0;
1001							}
1002
1003
1004	11					41	my $fields = $self->fields();
1005	11					22	my $drpc = $self->data_records_path_components();
1006
1007	11					22	my $path_col_idx = $self->path_col_idx();
1008	41					60	my %non_path_col2idx = map { $fields->[$_] => $_ }
1009	52					52	grep { $_ != $path_col_idx }
1010	11					16	(0..$#{$fields});
	11					13
1011
1012							my @components_by_row =
1013	11					14	map { my $f = $_->[$path_col_idx]; my $c = $#{$f}; [ @{$f}[1..$c] ] } @{$drpc};
	156					97
	156					80
	156					98
	156					110
	156					232
	11					15
1014	11					17	my $max_components = max( map { scalar(@{$_}) } @components_by_row);
	156					76
	156					142
1015	11					12	my @adjacentified = ();
1016	11					10	my %paths_to_id;
1017	11					30	for my $depth (1..$max_components) {
1018	34					66	for (my $r = 0; $r <= $#components_by_row; $r++) {
1019	498	100				255	if (scalar(@{$components_by_row[$r]}) == $depth) {
	498					788
1020	156					162	my %rowdata = map { $_ => $drpc->[$r]->[$non_path_col2idx{$_}] }
	568					628
1021							keys %non_path_col2idx;
1022	156					127	my @path_components = @{$drpc->[$r]->[$path_col_idx]};
	156					221
1023	156					104	my $name = $path_components[-1];
1024	156					208	my $parent_of_name = join('\|' =>
1025							@path_components[1 .. ($#path_components -1)]);
1026
1027	156	100				200	my $candidate_for_path = (length($parent_of_name))
1028							? join('\|' => $parent_of_name, $name)
1029							: $name;
1030
1031							my %rowhash = (
1032							id => ++$serial,
1033							parent_id => $paths_to_id{$parent_of_name}{id},
1034	156					476	name => $name,
1035							%rowdata,
1036							);
1037	156					262	$paths_to_id{$candidate_for_path}{id} = $rowhash{id};
1038	156	100				232	$paths_to_id{$candidate_for_path}{parent_path} = $parent_of_name
1039							if (length($parent_of_name));
1040	156					412	push @adjacentified, \%rowhash;
1041							}
1042							}
1043							}
1044	11					147	return \@adjacentified;
1045							}
1046
1047							=head2 C
1048
1049							=over 4
1050
1051							=item * Purpose
1052
1053							Create a CSV-formatted file holding the data returned by C.
1054
1055							=item * Arguments
1056
1057							$csv_file = $self->write_adjacentified_to_csv( {
1058							adjacentified => $adjacentified, # output of adjacentify()
1059							csvfile => './t/data/taxonomy_out3.csv',
1060							} );
1061
1062							Single hash reference. That hash is keyed on:
1063
1064							=over 4
1065
1066							=item * C
1067
1068							B Its value must be the arrayref of hash references returned by
1069							the C method.
1070
1071							=item * C
1072
1073							Optional. Path to location where a CSV-formatted text file holding the
1074							taxonomy-by-adjacent-list will be written. Defaults to a file called
1075							F in the current working directory.
1076
1077							=item * Text::CSV_XS options
1078
1079							You can also pass through any key-value pairs normally accepted by
1080							F.
1081
1082							=back
1083
1084							=item * Return Value
1085
1086							Returns path to CSV-formatted text file just created.
1087
1088							=item * Example
1089
1090							Suppose we have a CSV-formatted file holding the following taxonomy-by-materialized-path:
1091
1092							"path","is_actionable"
1093							"\|Alpha","0"
1094							"\|Beta","0"
1095							"\|Alpha\|Epsilon","0"
1096							"\|Alpha\|Epsilon\|Kappa","1"
1097							"\|Alpha\|Zeta","0"
1098							"\|Alpha\|Zeta\|Lambda","1"
1099							"\|Alpha\|Zeta\|Mu","0"
1100							"\|Beta\|Eta","1"
1101							"\|Beta\|Theta","1"
1102
1103							After running this file through C, C and
1104							C we will have a new CSV-formatted file holding
1105							this taxonomy-by-adjacent-list:
1106
1107							id,parent_id,name,is_actionable
1108							1,,Alpha,0
1109							2,,Beta,0
1110							3,1,Epsilon,0
1111							4,1,Zeta,0
1112							5,2,Eta,1
1113							6,2,Theta,1
1114							7,3,Kappa,1
1115							8,4,Lambda,1
1116							9,4,Mu,0
1117
1118							Note that the C column has been replaced by the C, C and
1119							C columns.
1120
1121							=back
1122
1123							=cut
1124
1125							sub write_adjacentified_to_csv {
1126	14			14	1	6988	my ($self, $args) = @_;
1127	14	100				25	if (defined $args) {
1128	13	100	100			214	croak "Argument to 'adjacentify()' must be hashref"
1129							unless (ref($args) and reftype($args) eq 'HASH');
1130							croak "Argument to 'adjacentify()' must have 'adjacentified' element"
1131	11	100				88	unless exists $args->{adjacentified};
1132							croak "Argument 'adjacentified' must be array reference"
1133							unless (ref($args->{adjacentified}) and
1134	10	100	100			194	reftype($args->{adjacentified}) eq 'ARRAY');
1135							}
1136							else {
1137	1					98	croak "write_adjacentified_to_csv() must be supplied with hashref"
1138							}
1139	8					9	my $adjacentified = $args->{adjacentified};
1140	8					10	delete $args->{adjacentified};
1141
1142	8					21	my $columns_in = $self->fields;
1143							my @non_path_columns_in =
1144	26					35	map { $columns_in->[$_] }
1145	34					41	grep { $_ != $self->{path_col_idx} }
1146	8					12	(0..$#{$columns_in});
	8					14
1147	8					13	my @columns_out = (qw\| id parent_id name \|);
1148	8					14	push @columns_out, @non_path_columns_in;
1149
1150	8					16547	my $cwd = cwd();
1151							my $csvfile = defined($args->{csvfile})
1152							? $args->{csvfile}
1153	8	100				58	: "$cwd/taxonomy_out.csv";
1154	8					33	delete $args->{csvfile};
1155
1156							# By this point, we should have processed all args other than those
1157							# intended for Text::CSV_XS and assigned their contents to variables as
1158							# needed.
1159
1160	8					38	my $csv_args = { binary => 1 };
1161	8					12	while (my ($k,$v) = each %{$args}) {
	11					55
1162	3					9	$csv_args->{$k} = $v;
1163							}
1164	8					84	my $csv = Text::CSV_XS->new($csv_args);
1165	8	50		1		1672	open my $OUT, ">:encoding(utf8)", $csvfile
	1					10
	1					1
	1					9
1166							or croak "Unable to open $csvfile for writing";
1167	8	100				8750	$csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
1168	8					282	$csv->print($OUT, [@columns_out]);
1169	8					65	for my $rec (@{$adjacentified}) {
	8					18
1170							$csv->print(
1171							$OUT,
1172	117					442	[ map { $rec->{$columns_out[$_]} } (0..$#columns_out) ]
	724					963
1173							);
1174							}
1175	8	50				370	close $OUT or croak "Unable to close $csvfile after writing";
1176
1177	8					112	return $csvfile;
1178							}
1179
1180							1;
1181
1182							# vim: formatoptions=crqot