File Coverage

lib/Parse/Taxonomy/MaterializedPath.pm

Criterion	Covered	Total	%
statement	326	326	100.0
branch	134	140	95.7
condition	39	40	97.5
subroutine	24	24	100.0
pod	11	11	100.0
total	534	541	98.7

line	stmt	bran	cond	sub	pod	time	code
1							package Parse::Taxonomy::MaterializedPath;
2	6			6		10254	use strict;
	6					6
	6					160
3	6			6		2005	use parent qw( Parse::Taxonomy );
	6					1262
	6					23
4	6			6		186	use Carp;
	6					6
	6					274
5	6			6		3481	use Text::CSV_XS;
	6					36382
	6					371
6	6			6		36	use Scalar::Util qw( reftype );
	6					6
	6					260
7	6			6		21	use List::Util qw( max );
	6					39
	6					450
8	6			6		19	use Cwd;
	6					8
	6					337
9							our $VERSION = '0.24';
10	6					15186	use Parse::Taxonomy::Auxiliary qw(
11							path_check_fields
12							components_check_fields
13	6			6		1400	);
	6					8
14
15							=head1 NAME
16
17							Parse::Taxonomy::MaterializedPath - Validate a file for use as a path-based taxonomy
18
19							=head1 SYNOPSIS
20
21							use Parse::Taxonomy::MaterializedPath;
22
23							# 'file' interface: reads a CSV file for you
24
25							$source = "./t/data/alpha.csv";
26							$self = Parse::Taxonomy::MaterializedPath->new( {
27							file => $source,
28							} );
29
30							# 'components' interface: as if you've already read a
31							# CSV file and now have Perl array references to header and data rows
32
33							$self = Parse::Taxonomy::MaterializedPath->new( {
34							components => {
35							fields => $fields,
36							data_records => $data_records,
37							}
38							} );
39
40							=head1 METHODS
41
42							=head2 C
43
44							=over 4
45
46							=item * Purpose
47
48							Parse::Taxonomy::MaterializedPath constructor.
49
50							=item * Arguments
51
52							Single hash reference. There are two possible interfaces: C and C.
53
54							=over 4
55
56							=item 1 C interface
57
58							$source = "./t/data/alpha.csv";
59							$self = Parse::Taxonomy::MaterializedPath->new( {
60							file => $source,
61							path_col_idx => 0,
62							path_col_sep => '\|',
63							%TextCSVoptions,
64							} );
65
66							Elements in the hash reference are keyed on:
67
68							=over 4
69
70							=item * C
71
72							Absolute or relative path to the incoming taxonomy file.
73							B for this interface.
74
75							=item * C
76
77							If the column to be used as the "path" column in the incoming taxonomy file is
78							B the first column, this option must be set to the integer representing
79							the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
80
81							=item * C
82
83							If the string used to distinguish components of the path in the path column in
84							the incoming taxonomy file is not a pipe (C<\|>), this option must be set.
85							Optional; defaults to C<\|>.
86
87							=item * Text::CSV_XS options
88
89							Any other options which could normally be passed to Cnew()> will
90							be passed through to that module's constructor. On the recommendation of the
91							Text::CSV documentation, C is always set to a true value.
92
93							=back
94
95							=item 2 C interface
96
97							$self = Parse::Taxonomy::MaterializedPath->new( {
98							components => {
99							fields => $fields,
100							data_records => $data_records,
101							}
102							} );
103
104							Elements in this hash are keyed on:
105
106							=over 4
107
108							=item * C
109
110							This element is B for the
111							C interface. The value of this element is a hash reference with two keys, C and
112							C. C is a reference to an array holding the field or
113							column names for the data set. C is a reference to an array of
114							array references, each of the latter arrayrefs holding one record or row from
115							the data set.
116
117							=item * C
118
119							Same as in C interface above.
120
121							=item * C
122
123							Same as in C interface above.
124
125							=back
126
127							=back
128
129							=item * Return Value
130
131							Parse::Taxonomy::MaterializedPath object.
132
133							=item * Comment
134
135							C will throw an exception under any of the following conditions:
136
137							=over 4
138
139							=item * Argument to C is not a reference.
140
141							=item * Argument to C is not a hash reference.
142
143							=item * In the C interface, unable to locate the file which is the value of the C element.
144
145							=item * Argument to C element is not an integer.
146
147							=item * Argument to C is greater than the index number of the
148							last element in the header row of the incoming taxonomy file, I the
149							C is wrong.
150
151							=item * The same field is found more than once in the header row of the
152							incoming taxonomy file.
153
154							=item * Unable to open or close the incoming taxonomy file for reading.
155
156							=item * In the column designated as the "path" column, the same value is
157							observed more than once.
158
159							=item * C, C, C, C and C are reserved terms.
160							One or more columns is named with a reserved term.
161
162							=item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
163
164							=item * A data row has a number of fields different from the number of fields
165							in the header row.
166
167							=back
168
169							=back
170
171							=cut
172
173							sub new {
174	62			62	1	36876	my ($class, $args) = @_;
175	62					63	my $data;
176
177	62	100	100			713	croak "Argument to 'new()' must be hashref"
178							unless (ref($args) and reftype($args) eq 'HASH');
179	60					57	my $argscount = 0;
180	60	100				124	$argscount++ if $args->{file};
181	60	100				102	$argscount++ if $args->{components};
182	60	100				187	croak "Argument to 'new()' must have either 'file' or 'components' element"
183							if ($argscount == 0);
184	59	100				245	croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
185							if ($argscount == 2);
186
187	58	100				101	if (exists $args->{path_col_idx}) {
188							croak "Argument to 'path_col_idx' must be integer"
189	4	100				104	unless $args->{path_col_idx} =~ m/^\d+$/;
190							}
191	57		100			220	$data->{path_col_idx} = delete $args->{path_col_idx} \|\| 0;
192							$data->{path_col_sep} = exists $args->{path_col_sep}
193							? $args->{path_col_sep}
194	57	100				154	: '\|';
195	57	100				75	if (exists $args->{path_col_sep}) {
196	5					6	$data->{path_col_sep} = $args->{path_col_sep};
197	5					8	delete $args->{path_col_sep};
198							}
199							else {
200	52					60	$data->{path_col_sep} = '\|';
201							}
202
203	57	100				79	if ($args->{components}) {
204							croak "Value of 'components' element must be hashref"
205	30	100	100			321	unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
206	28					42	for my $k ( qw\| fields data_records \| ) {
207							croak "Value of 'components' element must have '$k' key-value pair"
208	53	100				260	unless exists $args->{components}->{$k};
209							croak "Value of '$k' element must be arrayref"
210							unless (ref($args->{components}->{$k}) and
211	51	100	100			484	reftype($args->{components}->{$k}) eq 'ARRAY');
212							}
213	23					29	for my $row (@{$args->{components}->{data_records}}) {
	23					53
214	292	100	100			1020	croak "Each element in 'data_records' array must be arrayref"
215							unless (ref($row) and reftype($row) eq 'ARRAY');
216							}
217							# We don't want to stick $args->{components} into the object as is.
218							# Rather, we want to insert 'fields' and 'data_records' for
219							# consistency with the 'file' interface. But to do that we first need
220							# to impose the same validations that we do for the 'file' interface.
221							# We also need to populate 'path_col'.
222	21					43	_prepare_fields($data, $args->{components}->{fields}, 1);
223	18					20	my $these_data_records = $args->{components}->{data_records};
224	18					22	delete $args->{components};
225	18					34	_prepare_data_records($data, $these_data_records, $args);
226							}
227							else {
228							croak "Cannot locate file '$args->{file}'"
229	27	100				599	unless (-f $args->{file});
230	26					43	$data->{file} = delete $args->{file};
231
232							# We've now handled all the Parse::Taxonomy::MaterializedPath-specific options.
233							# Any remaining options are assumed to be intended for Text::CSV_XS::new().
234
235	26					34	$args->{binary} = 1;
236	26	50				107	my $csv = Text::CSV_XS->new ( $args )
237							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
238							open my $IN, "<", $data->{file}
239	26	50				2519	or croak "Unable to open '$data->{file}' for reading";
240	4			4		1711	my $header_ref = $csv->getline($IN);
	4					13989
	4					105
	26					895
241
242	26					847	_prepare_fields($data, $header_ref);
243	23					423	my $data_records = $csv->getline_all($IN);
244	23	50				5678	close $IN or croak "Unable to close after reading";
245	23					46	_prepare_data_records($data, $data_records, $args);
246							}
247
248	32					32	while (my ($k,$v) = each %{$args}) {
	50					122
249	18					29	$data->{$k} = $v;
250							}
251	32					41	my %row_analysis = ();
252	32					77	for my $el (@{$data->{data_records}}) {
	32					64
253	425					322	my $rowkey = $el->[$data->{path_col_idx}];
254	425					714	$row_analysis{$rowkey} = split(/\Q$data->{path_col_sep}\E/, $rowkey);
255							}
256	32					63	$data->{row_analysis} = \%row_analysis;
257	32					116	return bless $data, $class;
258							}
259
260							sub _prepare_fields {
261	47			47		54	my ($data, $fields_ref, $components) = @_;
262	47	100				80	if (! $components) {
263	26					41	_check_path_col_idx($data, $fields_ref, 0);
264	25					57	path_check_fields($data, $fields_ref);
265							}
266							else {
267	21					36	_check_path_col_idx($data, $fields_ref, 1);
268	20					51	components_check_fields($data, $fields_ref);
269							}
270
271	43					41	my %fields_seen = map { $_ => 1 } @{$fields_ref};
	229					280
	43					52
272	43					74	my @bad_fields = ();
273	43					58	for my $reserved ( qw\| id parent_id name lft rgh \| ) {
274	215	100				296	push @bad_fields, $reserved if $fields_seen{$reserved};
275							}
276	43					87	my $msg = "Bad column names: <@bad_fields>. These are reserved for ";
277	43					40	$msg .= "Parse::Taxonomy's internal use; please rename";
278	43	100				257	croak $msg if @bad_fields;
279
280	41					57	$data->{fields} = $fields_ref;
281	41					65	$data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
282	41					68	return $data;
283							}
284
285							sub _check_path_col_idx {
286	47			47		45	my ($data, $fields_ref, $components) = @_;
287	47					49	my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
288	47	100				101	$error_msg .= $components
289							? "'fields' array ref"
290							: "header row in '$data->{file}'";
291
292	47	100				37	croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
	47					343
293							}
294
295							sub _prepare_data_records {
296							# Confirm each row's path starts with path_col_sep:
297							# Confirm no duplicate entries in column holding path:
298							# Confirm all rows have same number of columns as header:
299	41			41		47	my ($data, $data_records, $args) = @_;
300	41					24	my $error_msg;
301	41					44	my @bad_path_cols = ();
302	41					35	my @bad_count_records = ();
303	41					54	my %paths_seen = ();
304	41					24	my $field_count = scalar(@{$data->{fields}});
	41					62
305	41					34	for my $rec (@{$data_records}) {
	41					56
306	537	100				1499	unless ($rec->[$data->{path_col_idx}] =~ m/^\Q$data->{path_col_sep}\E/) {
307	6					9	push @bad_path_cols, $rec->[$data->{path_col_idx}];
308							}
309	537					684	$paths_seen{$rec->[$data->{path_col_idx}]}++;
310	537					291	my $this_row_count = scalar(@{$rec});
	537					407
311	537	100				791	if ($this_row_count != $field_count) {
312							push @bad_count_records,
313	5					10	[ $rec->[$data->{path_col_idx}], $this_row_count ];
314							}
315							}
316	41					51	$error_msg = <
317							The value of the column designated as path must start with the path column separator.
318							Rows with the following paths fail to do so:
319							IMPROPER_PATH
320	41					48	for my $path (@bad_path_cols) {
321	6					9	$error_msg .= " $path\n";
322							}
323	41	100				287	croak $error_msg if @bad_path_cols;
324
325	39					40	my @dupe_paths = ();
326	39					238	for my $path (sort keys %paths_seen) {
327	504	100				589	push @dupe_paths, $path if $paths_seen{$path} > 1;
328							}
329	39					60	$error_msg = <
330							No duplicate entries are permitted in column designated as path.
331							The following entries appear the number of times shown:
332							ERROR_MSG_DUPE
333	39					43	for my $path (@dupe_paths) {
334	5					17	$error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
335							}
336	39	100				343	croak $error_msg if @dupe_paths;
337
338	36					60	$error_msg = <
339							Header row has $field_count records. The following records had different counts:
340							ERROR_MSG_WRONG_COUNT
341	36					44	for my $rec (@bad_count_records) {
342	4					8	$error_msg .= " $rec->[0]: $rec->[1]\n";
343							}
344	36	100				237	croak $error_msg if @bad_count_records;
345
346							# Confirm each node appears in taxonomy:
347	34					34	my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
	19					51
	34					71
348	34					52	$path_args->{sep} = $data->{path_col_sep};
349	34	50				137	my $path_csv = Text::CSV_XS->new ( $path_args )
350							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
351	34					2644	my %missing_parents = ();
352	34					187	for my $path (sort keys %paths_seen) {
353	447					623	my $status = $path_csv->parse($path);
354	447					3869	my @columns = $path_csv->fields();
355	447	100				1891	if (@columns > 2) {
356							my $parent =
357	324					507	join($path_args->{sep} => @columns[0 .. ($#columns - 1)]);
358	324	100				625	unless (exists $paths_seen{$parent}) {
359	4					11	$missing_parents{$path} = $parent;
360							}
361							}
362							}
363	34					51	$error_msg = <
364							Each node in the taxonomy must have a parent.
365							The following nodes lack the expected parent:
366							ERROR_MSG_ORPHAN
367	34					65	for my $path (sort keys %missing_parents) {
368	4					10	$error_msg .= " $path: $missing_parents{$path}\n";
369							}
370	34	100				289	croak $error_msg if scalar(keys %missing_parents);
371							# BBB end of validations
372	32					39	$data->{data_records} = $data_records;
373
374	32					280	return $data;
375							}
376
377							=head2 C
378
379							=over 4
380
381							=item * Purpose
382
383							Identify the names of the columns in the taxonomy.
384
385							=item * Arguments
386
387							my $fields = $self->fields();
388
389							No arguments; the information is already inside the object.
390
391							=item * Return Value
392
393							Reference to an array holding a list of the columns as they appear in the
394							header row of the incoming taxonomy file.
395
396							=item * Comment
397
398							Read-only.
399
400							=back
401
402							=head2 C
403
404							=over 4
405
406							=item * Purpose
407
408							Identify the index position (count starts at 0) of the column in the incoming
409							taxonomy file which serves as the path column.
410
411							=item * Arguments
412
413							my $path_col_idx = $self->path_col_idx;
414
415							No arguments; the information is already inside the object.
416
417							=item * Return Value
418
419							Integer in the range from 0 to 1 less than the number of columns in the header
420							row.
421
422							=item * Comment
423
424							Read-only.
425
426							=back
427
428							=cut
429
430							sub path_col_idx {
431	15			15	1	2812	my $self = shift;
432	15					20	return $self->{path_col_idx};
433							}
434
435							=head2 C
436
437							=over 4
438
439							=item * Purpose
440
441							Identify the name of the column in the incoming taxonomy which serves as the
442							path column.
443
444							=item * Arguments
445
446							my $path_col = $self->path_col;
447
448							No arguments; the information is already inside the object.
449
450							=item * Return Value
451
452							String.
453
454							=item * Comment
455
456							Read-only.
457
458							=back
459
460							=cut
461
462							sub path_col {
463	4			4	1	1112	my $self = shift;
464	4					8	return $self->{path_col};
465							}
466
467							=head2 C
468
469							=over 4
470
471							=item * Purpose
472
473							Identify the string used to separate path components once the taxonomy has
474							been created. This is just a "getter" and is logically distinct from the
475							option to C which is, in effect, a "setter."
476
477							=item * Arguments
478
479							my $path_col_sep = $self->path_col_sep;
480
481							No arguments; the information is already inside the object.
482
483							=item * Return Value
484
485							String.
486
487							=item * Comment
488
489							Read-only.
490
491							=back
492
493							=cut
494
495							sub path_col_sep {
496	4			4	1	962	my $self = shift;
497	4					6	return $self->{path_col_sep};
498							}
499
500							=head2 C
501
502							=over 4
503
504							=item * Purpose
505
506							Once the taxonomy has been validated, get a list of its data rows as a Perl
507							data structure.
508
509							=item * Arguments
510
511							$data_records = $self->data_records;
512
513							None.
514
515							=item * Return Value
516
517							Reference to array of array references. The array will hold the data records
518							found in the incoming taxonomy file in their order in that file.
519
520							=item * Comment
521
522							Does not contain any information about the fields in the taxonomy, so you
523							should probably either (a) use in conjunction with C method above;
524							or (b) use C.
525
526							=back
527
528							=head2 C
529
530							=over 4
531
532							=item * Purpose
533
534							Once the taxonomy has been validated, get a list of its header and data rows as a Perl
535							data structure.
536
537							=item * Arguments
538
539							$data_records = $self->fields_and_data_records;
540
541							None.
542
543							=item * Return Value
544
545							Reference to array of array references. The first element in the array will
546							hold the header row (same as output of C). The remaining elements
547							will hold the data records found in the incoming taxonomy file in their order
548							in that file.
549
550							=back
551
552							=cut
553
554							=head2 C
555
556							=over 4
557
558							=item * Purpose
559
560							Once the taxonomy has been validated, get a list of its data rows as a Perl
561							data structure. In each element of this list, the path is now represented as
562							an array reference rather than a string.
563
564							=item * Arguments
565
566							$data_records_path_components = $self->data_records_path_components;
567
568							None.
569
570							=item * Return Value
571
572							Reference to array of array references. The array will hold the data records
573							found in the incoming taxonomy file in their order in that file.
574
575							=item * Comment
576
577							Does not contain any information about the fields in the taxonomy, so you may
578							wish to use this method either (a) use in conjunction with C method
579							above; or (b) use C.
580
581							=back
582
583							=cut
584
585							sub data_records_path_components {
586	13			13	1	1316	my $self = shift;
587	13					17	my @all_rows = ();
588	13					10	for my $row (@{$self->{data_records}}) {
	13					28
589	182					160	my $path_col = $row->[$self->{path_col_idx}];
590	182					682	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
591	182					136	my @rewritten = ();
592	182					124	for (my $i = 0; $i <= $#{$row}; $i++) {
	1062					1300
593	880	100				795	if ($i != $self->{path_col_idx}) {
594	698					702	push @rewritten, $row->[$i];
595							}
596							else {
597	182					206	push @rewritten, \@path_components;
598							}
599							}
600	182					233	push @all_rows, \@rewritten;
601							}
602	13					43	return \@all_rows;
603							}
604
605							=head2 C
606
607							=over 4
608
609							=item * Purpose
610
611							Once the taxonomy has been validated, get a list of its data rows as a Perl
612							data structure. The first element in this list is an array reference holding
613							the header row. In each data element of this list, the path is now represented as
614							an array reference rather than a string.
615
616							=item * Arguments
617
618							$fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
619
620							None.
621
622							=item * Return Value
623
624							Reference to array of array references. The array will hold the data records
625							found in the incoming taxonomy file in their order in that file.
626
627							=back
628
629							=cut
630
631							sub fields_and_data_records_path_components {
632	6			6	1	3508	my $self = shift;
633	6					41	my @all_rows = $self->fields;
634	6					6	for my $row (@{$self->{data_records}}) {
	6					15
635	78					68	my $path_col = $row->[$self->{path_col_idx}];
636	78					191	my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
637	78					55	my @rewritten = ();
638	78					59	for (my $i = 0; $i <= $#{$row}; $i++) {
	546					641
639	468	100				413	if ($i != $self->{path_col_idx}) {
640	390					367	push @rewritten, $row->[$i];
641							}
642							else {
643	78					86	push @rewritten, \@path_components;
644							}
645							}
646	78					89	push @all_rows, \@rewritten;
647							}
648	6					14	return \@all_rows;
649							}
650
651							=head2 C
652
653							=over 4
654
655							=item * Purpose
656
657							Identify the index position of a given field within the header row.
658
659							=item * Arguments
660
661							$index = $self->get_field_position('income');
662
663							Takes a single string holding the name of one of the fields (column names).
664
665							=item * Return Value
666
667							Integer representing the index position (counting from C<0>) of the field
668							provided as argument. Throws exception if the argument is not actually a
669							field.
670
671							=back
672
673							=cut
674
675							=head2 C
676
677							=over 4
678
679							=item * Purpose
680
681							Display the number of descendant (multi-generational) nodes each node in the
682							taxonomy has.
683
684							=item * Arguments
685
686							$descendant_counts = $self->descendant_counts();
687
688							$descendant_counts = $self->descendant_counts( { generations => 1 } );
689
690							None required; one optional hash reference. Currently, the only element
691							honored in that hashref is C, whose value must be a non-negative
692							integer. If, instead of getting the count of all descendants of a node, you
693							only want the count of its first generation, i.e., its immediate children, you
694							provide a value of C<1>. Want the count of only the first and second
695							generations? Provide a value of C<2> -- and so on.
696
697							=item * Return Value
698
699							Reference to hash in which each element is keyed on the value of the path
700							column in the incoming taxonomy file.
701
702							=back
703
704							=cut
705
706							sub descendant_counts {
707	9			9	1	3449	my ($self, $args) = @_;
708	9	100				21	if (defined $args) {
709	5	100	100			228	croak "Argument to 'descendant_counts()' must be hashref"
710							unless (ref($args) and reftype($args) eq 'HASH');
711							croak "Value for 'generations' element passed to descendant_counts() must be integer > 0"
712	3	100	100			166	unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
713							}
714	5					8	my %descendant_counts = ();
715	5					11	my $hashified = $self->hashify();
716	5					6	for my $p (keys %{$hashified}) {
	5					15
717	59					53	$descendant_counts{$p} = 0;
718	59					36	for my $q (
719	707					661	grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
720	59					97	keys %{$hashified}
721							) {
722	230	100				733	if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
723	61	100				61	if (! $args->{generations}) {
724	48					52	$descendant_counts{$p}++;
725							}
726							else {
727	13					29	my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
728	13					24	my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
729							$descendant_counts{$p}++
730	13	100				34	if (scalar(@d) - scalar(@c) <= $args->{generations});
731							}
732							}
733							}
734							}
735	5					10	$self->{descendant_counts} = \%descendant_counts;
736	5					35	return $self->{descendant_counts};
737							}
738
739							=head2 C
740
741							=over 4
742
743							=item * Purpose
744
745							Get the total number of descendant nodes for one specific node in a validated
746							taxonomy.
747
748							=item * Arguments
749
750							$descendant_count = $self->get_descendant_count('\|Path\|To\|Node');
751
752							$descendant_counts = $self->get_descendant_count('\|Path\|To\|Node', { generations => 1 } );
753
754							One required: string containing node's path as spelled in the taxonomy.
755
756							One optional hash reference. Currently, the only element honored in that
757							hashref is C, whose value must be a non-negative integer. If,
758							instead of getting the count of all descendants of a node, you only want the
759							count of its first generation, i.e., its immediate children, you provide a
760							value of C<1>. Want the count of only first and second generations? Provide
761							a value of C<2> -- and so on.
762
763							=item * Return Value
764
765							Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
766							leaf node.
767
768							=item * Comment
769
770							Will throw an exception if the node does not exist or is misspelled.
771
772							If C is called with no second (hashref) argument
773							following an invocation of C, it will return a value from
774							an internal cache created during that earlier method call. Otherwise, it will
775							re-create the cache from scratch. (This, of course, assumes that you have not
776							manipulated the object's internal data subsequent to its creation.)
777
778							=back
779
780							=cut
781
782							sub get_descendant_count {
783	26			26	1	8015	my ($self, $node, $args) = @_;
784	26	100				54	if (defined $args) {
785	15	100	100			220	croak "Second argument to 'get_descendant_count()' must be hashref"
786							unless (ref($args) and reftype($args) eq 'HASH');
787							croak "Value for 'generations' element passed to second argument to get_descendant_count() must be integer > 0"
788	13	100	100			237	unless ($args->{generations} and $args->{generations} =~ m/^[0-9]+$/);
789							}
790	22	100				35	if (exists $self->{descendant_counts}) {
791	6					7	my $descendant_counts = $self->{descendant_counts};
792	6	100				193	croak "Node '$node' not found" unless exists $descendant_counts->{$node};
793	4					7	return $descendant_counts->{$node};
794							}
795							else {
796	16					21	my %descendant_counts = ();
797	16					24	my $hashified = $self->hashify();
798	16	100				106	croak "Node '$node' not found" unless exists $hashified->{$node};
799	15					15	for my $p ($node) {
800	15					16	$descendant_counts{$p} = 0;
801	15					9	for my $q (
802	231					235	grep { $self->{row_analysis}->{$_} > $self->{row_analysis}->{$p} }
803	15					44	keys %{$hashified}
804							) {
805	110	100				395	if ($q =~ m/^\Q$p$self->{path_col_sep}\E/) {
806	36	100				44	if (! $args->{generations}) {
807	10					14	$descendant_counts{$p}++;
808							}
809							else {
810	26					63	my @c = $p =~ m/\Q$self->{path_col_sep}\E/g;
811	26					67	my @d = $q =~ m/\Q$self->{path_col_sep}\E/g;
812							$descendant_counts{$p}++
813	26	100				112	if (scalar(@d) - scalar(@c) <= $args->{generations});
814							}
815							}
816							}
817							}
818	15					153	return $descendant_counts{$node};
819							}
820							}
821
822							=head2 C
823
824							=over 4
825
826							=item * Purpose
827
828							Turn a validated taxonomy into a Perl hash keyed on the column designated as
829							the path column.
830
831							=item * Arguments
832
833							$hashref = $self->hashify();
834
835							Takes an optional hashref holding a list of any of the following elements:
836
837							=over 4
838
839							=item * C
840
841							Boolean, defaulting to C<0>. By default, C will spell the
842							key of the hash exactly as the value of the path column is spelled in the
843							taxonomy -- which in turn is the way it was spelled in the incoming file.
844							That is, a path in the taxonomy spelled C<\|Alpha\|Beta\|Gamma> will be spelled
845							as a key in exactly the same way.
846
847							However, since in many cases (including the example above) the root node of
848							the taxonomy will be empty, the user may wish to remove the first instance of
849							C. The user would do so by setting
850							C to a true value.
851
852							$hashref = $self->hashify( {
853							remove_leading_path_col_sep => 1,
854							} );
855
856							In that case they key would now be spelled: C.
857
858							Note further that if the C switch is set to a true value, any
859							setting to C will be ignored.
860
861							=item * C
862
863							A string which will be used in composing the key of the hashref returned by
864							this method. The user may select this key if she does not want to use the
865							value found in the incoming CSV file (which by default will be the pipe
866							character (C<\|>) and which may be overridden with the C argument
867							to C.
868
869							$hashref = $self->hashify( {
870							key_delim => q{ - },
871							} );
872
873							In the above variant, a path that in the incoming taxonomy file was
874							represented by C<\|Alpha\|Beta\|Gamma> will in C<$hashref> be represented by
875							C< - Alpha - Beta - Gamma>.
876
877							=item * C
878
879							A string which will be used in composing the key of the hashref returned by
880							this method. The user will set this switch if she wishes to have the root
881							note explicitly represented. Using this switch will automatically cause
882							C to be ignored.
883
884							Suppose the user wished to have C be the text for the root
885							node. Suppose further that the user wanted to use the string C< - > as the
886							delimiter within the key.
887
888							$hashref = $self->hashify( {
889							root_str => q{All Suppliers},
890							key_delim => q{ - },
891							} );
892
893							Then incoming path C<\|Alpha\|Beta\|Gamma> would be keyed as:
894
895							All Suppliers - Alpha - Beta - Gamma
896
897							=back
898
899							=item * Return Value
900
901							Hash reference. The number of elements in this hash should be equal to the
902							number of non-header records in the taxonomy.
903
904							=back
905
906							=cut
907
908							sub hashify {
909	33			33	1	12152	my ($self, $args) = @_;
910	33	100				58	if (defined $args) {
911	8	100	100			242	croak "Argument to 'hashify()' must be hashref"
912							unless (ref($args) and reftype($args) eq 'HASH');
913							}
914	31					33	my %hashified = ();
915	31					39	my $fields = $self->{fields};
916	31					29	my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
	170					263
	31					48
917	31					46	for my $rec (@{$self->{data_records}}) {
	31					66
918	427					225	my $rowkey;
919	427	100				431	if ($args->{root_str}) {
920	39					48	$rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
921							}
922							else {
923	388	100				346	if ($args->{remove_leading_path_col_sep}) {
924	26					121	($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
925							}
926							else {
927	362					296	$rowkey = $rec->[$self->{path_col_idx}];
928							}
929							}
930	427	100				506	if ($args->{key_delim}) {
931	52					147	$rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
932							}
933	427					257	my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
	2402					2997
	427					399
934	427					776	$hashified{$rowkey} = $rowdata;
935							}
936	31					80	return \%hashified;
937							}
938
939							=head2 C
940
941							=over 4
942
943							=item * Purpose
944
945							Transform a taxonomy-by-materialized-path into a taxonomy-by-adjacent-list.
946
947							=item * Arguments
948
949							$adjacentified = $self->adjacentify();
950
951							$adjacentified = $self->adjacentify( { serial => 500 } );
952							$adjacentified = $self->adjacentify( { floor => 500 } ); # same as serial
953
954							Optional single hash reference.
955
956							For that hashref, C supports the key C, which defaults
957							to C<0>. C must be a non-negative integer and sets the "floor" above
958							which new unique IDs will be assigned to the C column. Hence, if
959							C is set to C<500>, the value assigned to the C column of the
960							first record to be processed will be C<501>.
961
962							Starting with version .19, C will serve as an alternative way of
963							providing the same information to C. If, however, by mistake
964							you provide B C and C elements in the hash, C
965							will take precedence.
966
967							=item * Return Value
968
969							Reference to an array of hash references. Each element represents one node in
970							the taxonomy. Each element will have key-value pairs for C, C
971							and C which will hold the adjacentification of the materialized path in the
972							original taxonomy-by-materialized-path. Each element will, as well, have KVPs for the
973							non-materialized-path fields in the records in the original taxonomy-by-materialized-path.
974
975							=item * Comment
976
977							See documentation for C for example.
978
979							Note that the order in which C will assign C and
980							C values to records in the taxonomy-by-adjacent-list will almost
981							certainly B match the order in which elements appear in a CSV file or in
982							the data structure returned by a method such as C.
983
984							=back
985
986							=cut
987
988							sub adjacentify {
989	15			15	1	5750	my ($self, $args) = @_;
990	15					17	my $serial = 0;
991	15	100				28	if (defined $args) {
992	9	100	100			247	croak "Argument to 'adjacentify()' must be hashref"
993							unless (ref($args) and reftype($args) eq 'HASH');
994	7					11	for my $w ('serial', 'floor') {
995	13	100				25	if (exists $args->{$w}) {
996							croak "Element '$w' in argument to 'adjacentify()' must be integer"
997	8	100				186	unless ($args->{$w} =~ m/^\d+$/);
998							}
999							}
1000	5		50			22	$serial = $args->{serial} \|\| $args->{floor} \|\| 0;
1001							}
1002
1003
1004	11					38	my $fields = $self->fields();
1005	11					21	my $drpc = $self->data_records_path_components();
1006
1007	11					19	my $path_col_idx = $self->path_col_idx();
1008	41					62	my %non_path_col2idx = map { $fields->[$_] => $_ }
1009	52					54	grep { $_ != $path_col_idx }
1010	11					13	(0..$#{$fields});
	11					14
1011
1012							my @components_by_row =
1013	11					14	map { my $f = $_->[$path_col_idx]; my $c = $#{$f}; [ @{$f}[1..$c] ] } @{$drpc};
	156					103
	156					80
	156					101
	156					92
	156					237
	11					10
1014	11					16	my $max_components = max( map { scalar(@{$_}) } @components_by_row);
	156					78
	156					140
1015	11					14	my @adjacentified = ();
1016	11					10	my %paths_to_id;
1017	11					22	for my $depth (1..$max_components) {
1018	34					54	for (my $r = 0; $r <= $#components_by_row; $r++) {
1019	498	100				290	if (scalar(@{$components_by_row[$r]}) == $depth) {
	498					880
1020	156					190	my %rowdata = map { $_ => $drpc->[$r]->[$non_path_col2idx{$_}] }
	568					734
1021							keys %non_path_col2idx;
1022	156					147	my @path_components = @{$drpc->[$r]->[$path_col_idx]};
	156					248
1023	156					125	my $name = $path_components[-1];
1024	156					228	my $parent_of_name = join('\|' =>
1025							@path_components[1 .. ($#path_components -1)]);
1026
1027	156	100				243	my $candidate_for_path = (length($parent_of_name))
1028							? join('\|' => $parent_of_name, $name)
1029							: $name;
1030
1031							my %rowhash = (
1032							id => ++$serial,
1033							parent_id => $paths_to_id{$parent_of_name}{id},
1034	156					524	name => $name,
1035							%rowdata,
1036							);
1037	156					316	$paths_to_id{$candidate_for_path}{id} = $rowhash{id};
1038	156	100				272	$paths_to_id{$candidate_for_path}{parent_path} = $parent_of_name
1039							if (length($parent_of_name));
1040	156					503	push @adjacentified, \%rowhash;
1041							}
1042							}
1043							}
1044	11					166	return \@adjacentified;
1045							}
1046
1047							=head2 C
1048
1049							=over 4
1050
1051							=item * Purpose
1052
1053							Create a CSV-formatted file holding the data returned by C.
1054
1055							=item * Arguments
1056
1057							$csv_file = $self->write_adjacentified_to_csv( {
1058							adjacentified => $adjacentified, # output of adjacentify()
1059							csvfile => './t/data/taxonomy_out3.csv',
1060							} );
1061
1062							Single hash reference. That hash is keyed on:
1063
1064							=over 4
1065
1066							=item * C
1067
1068							B Its value must be the arrayref of hash references returned by
1069							the C method.
1070
1071							=item * C
1072
1073							Optional. Path to location where a CSV-formatted text file holding the
1074							taxonomy-by-adjacent-list will be written. Defaults to a file called
1075							F in the current working directory.
1076
1077							=item * Text::CSV_XS options
1078
1079							You can also pass through any key-value pairs normally accepted by
1080							F.
1081
1082							=back
1083
1084							=item * Return Value
1085
1086							Returns path to CSV-formatted text file just created.
1087
1088							=item * Example
1089
1090							Suppose we have a CSV-formatted file holding the following taxonomy-by-materialized-path:
1091
1092							"path","is_actionable"
1093							"\|Alpha","0"
1094							"\|Beta","0"
1095							"\|Alpha\|Epsilon","0"
1096							"\|Alpha\|Epsilon\|Kappa","1"
1097							"\|Alpha\|Zeta","0"
1098							"\|Alpha\|Zeta\|Lambda","1"
1099							"\|Alpha\|Zeta\|Mu","0"
1100							"\|Beta\|Eta","1"
1101							"\|Beta\|Theta","1"
1102
1103							After running this file through C, C and
1104							C we will have a new CSV-formatted file holding
1105							this taxonomy-by-adjacent-list:
1106
1107							id,parent_id,name,is_actionable
1108							1,,Alpha,0
1109							2,,Beta,0
1110							3,1,Epsilon,0
1111							4,1,Zeta,0
1112							5,2,Eta,1
1113							6,2,Theta,1
1114							7,3,Kappa,1
1115							8,4,Lambda,1
1116							9,4,Mu,0
1117
1118							Note that the C column has been replaced by the C, C and
1119							C columns.
1120
1121							=back
1122
1123							=cut
1124
1125							sub write_adjacentified_to_csv {
1126	14			14	1	6632	my ($self, $args) = @_;
1127	14	100				25	if (defined $args) {
1128	13	100	100			224	croak "Argument to 'adjacentify()' must be hashref"
1129							unless (ref($args) and reftype($args) eq 'HASH');
1130							croak "Argument to 'adjacentify()' must have 'adjacentified' element"
1131	11	100				90	unless exists $args->{adjacentified};
1132							croak "Argument 'adjacentified' must be array reference"
1133							unless (ref($args->{adjacentified}) and
1134	10	100	100			194	reftype($args->{adjacentified}) eq 'ARRAY');
1135							}
1136							else {
1137	1					74	croak "write_adjacentified_to_csv() must be supplied with hashref"
1138							}
1139	8					8	my $adjacentified = $args->{adjacentified};
1140	8					11	delete $args->{adjacentified};
1141
1142	8					17	my $columns_in = $self->fields;
1143							my @non_path_columns_in =
1144	26					34	map { $columns_in->[$_] }
1145	34					40	grep { $_ != $self->{path_col_idx} }
1146	8					11	(0..$#{$columns_in});
	8					12
1147	8					12	my @columns_out = (qw\| id parent_id name \|);
1148	8					10	push @columns_out, @non_path_columns_in;
1149
1150	8					11283	my $cwd = cwd();
1151							my $csvfile = defined($args->{csvfile})
1152							? $args->{csvfile}
1153	8	100				49	: "$cwd/taxonomy_out.csv";
1154	8					23	delete $args->{csvfile};
1155
1156							# By this point, we should have processed all args other than those
1157							# intended for Text::CSV_XS and assigned their contents to variables as
1158							# needed.
1159
1160	8					33	my $csv_args = { binary => 1 };
1161	8					10	while (my ($k,$v) = each %{$args}) {
	11					45
1162	3					11	$csv_args->{$k} = $v;
1163							}
1164	8					69	my $csv = Text::CSV_XS->new($csv_args);
1165	8	50		1		1456	open my $OUT, ">:encoding(utf8)", $csvfile
	1					7
	1					1
	1					8
1166							or croak "Unable to open $csvfile for writing";
1167	8	100				8616	$csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
1168	8					255	$csv->print($OUT, [@columns_out]);
1169	8					62	for my $rec (@{$adjacentified}) {
	8					21
1170							$csv->print(
1171							$OUT,
1172	117					532	[ map { $rec->{$columns_out[$_]} } (0..$#columns_out) ]
	724					1035
1173							);
1174							}
1175	8	50				339	close $OUT or croak "Unable to close $csvfile after writing";
1176
1177	8					105	return $csvfile;
1178							}
1179
1180							1;
1181
1182							# vim: formatoptions=crqot