File Coverage

lib/Parse/Taxonomy/AdjacentList.pm

Criterion	Covered	Total	%
statement	258	258	100.0
branch	103	108	95.3
condition	36	36	100.0
subroutine	21	21	100.0
pod	6	9	66.6
total	424	432	98.1

line	stmt	bran	cond	sub	pod	time	code
1							package Parse::Taxonomy::AdjacentList;
2	4			4		9549	use strict;
	4					7
	4					107
3	4			4		1902	use parent qw( Parse::Taxonomy );
	4					860
	4					23
4	4			4		175	use Carp;
	4					8
	4					208
5	4			4		3565	use Text::CSV_XS;
	4					30887
	4					217
6	4			4		31	use Scalar::Util qw( reftype );
	4					8
	4					187
7	4			4		19	use Cwd;
	4					33
	4					310
8							our $VERSION = '0.22';
9	4					11877	use Parse::Taxonomy::Auxiliary qw(
10							path_check_fields
11							components_check_fields
12	4			4		1320	);
	4					10
13
14							=head1 NAME
15
16							Parse::Taxonomy::AdjacentList - Extract a taxonomy from a hierarchy inside a CSV file
17
18							=head1 SYNOPSIS
19
20							use Parse::Taxonomy::AdjacentList;
21
22							$source = "./t/data/alpha.csv";
23							$self = Parse::Taxonomy::AdjacentList->new( {
24							file => $source,
25							} );
26
27							=cut
28
29							=head1 METHODS
30
31							=head2 C
32
33							=over 4
34
35							=item * Purpose
36
37							Parse::Taxonomy::AdjacentList constructor.
38
39							=item * Arguments
40
41							Single hash reference. There are two possible interfaces: C and C.
42
43							=over 4
44
45							=item 1 C interface
46
47							$source = "./t/data/delta.csv";
48							$self = Parse::Taxonomy::AdjacentList->new( {
49							file => $source,
50							} );
51
52							Elements in the hash reference are keyed on:
53
54							=over 4
55
56							=item * C
57
58							Absolute or relative path to the incoming taxonomy file.
59							B for this interface.
60
61							=item * C
62
63							The name of the column in the header row under which each data record's unique
64							ID can be found. Defaults to C.
65
66							=item * C
67
68							The name of the column in the header row under which each data record's parent
69							ID can be found. (Will be empty in the case of top-level nodes, as they have
70							no parent.) Defaults to C.
71
72							=item * C
73
74							The name of the column in the header row under which, in each data record, there
75							is a found a string which differentiates that record from all other records
76							with the same parent ID. Defaults to C.
77
78							=item * Text::CSV_XS options
79
80							Any other options which could normally be passed to Cnew()> will
81							be passed through to that module's constructor. On the recommendation of the
82							Text::CSV documentation, C is always set to a true value.
83
84							=back
85
86							=item 2 C interface
87
88							$self = Parse::Taxonomy::AdjacentList->new( {
89							components => {
90							fields => $fields,
91							data_records => $data_records,
92							}
93							} );
94
95							Elements in this hash are keyed on:
96
97							=over 4
98
99							=item * C
100
101							This element is B for the C interface. The value of this
102							element is a hash reference with two keys, C and C.
103							C is a reference to an array holding the field or column names for the
104							data set. C is a reference to an array of array references,
105							each of the latter arrayrefs holding one record or row from the data set.
106
107							=back
108
109							=back
110
111							=item * Return Value
112
113							Parse::Taxonomy::AdjacentList object.
114
115							=item * Exceptions
116
117							C will throw an exception under any of the following conditions:
118
119							=over 4
120
121							=item * Argument to C is not a reference.
122
123							=item * Argument to C is not a hash reference.
124
125							=item * Argument to C must have either 'file' or 'components' element but not both.
126
127							=item * Lack columns in header row to match requirements.
128
129							=item * Non-numeric entry in C or C column.
130
131							=item * Duplicate entries in C column.
132
133							=item * Number of fields in a data record does not match number in header row.
134
135							=item * Empty string in a C column of a record.
136
137							=item * Unable to locate a record whose C is the C of a different record.
138
139							=item * No records with same C may share value of C column.
140
141							=item * C interface
142
143							=over 4
144
145							=item * In the C interface, unable to locate the file which is the value of the C element.
146
147							=item * The same field is found more than once in the header row of the
148							incoming taxonomy file.
149
150							=item * Unable to open or close the incoming taxonomy file for reading.
151
152							=back
153
154							=item * C interface
155
156							=over 4
157
158							=item * In the C interface, C element must be a hash reference with C and C elements.
159
160							=item * C element must be array reference.
161
162							=item * C element must be reference to array of array references.
163
164							=item * No duplicate fields in C element's array reference.
165
166							=back
167
168							=back
169
170							=back
171
172							=cut
173
174							sub new {
175	51			51	1	53107	my ($class, $args) = @_;
176	51					71	my $data;
177
178	51	100	100			793	croak "Argument to 'new()' must be hashref"
179							unless (ref($args) and reftype($args) eq 'HASH');
180	49					69	my $argscount = 0;
181	49	100				139	$argscount++ if $args->{file};
182	49	100				133	$argscount++ if $args->{components};
183	49	100				223	croak "Argument to 'new()' must have either 'file' or 'components' element"
184							if ($argscount == 0);
185	48	100				221	croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
186							if ($argscount == 2);
187
188							$data->{id_col} = $args->{id_col}
189							? delete $args->{id_col}
190	47	100				170	: 'id';
191							$data->{parent_id_col} = $args->{parent_id_col}
192							? delete $args->{parent_id_col}
193	47	100				131	: 'parent_id';
194							$data->{leaf_col} = $args->{leaf_col}
195							? delete $args->{leaf_col}
196	47	100				108	: 'name';
197
198	47	100				97	if ($args->{components}) {
199							croak "Value of 'components' element must be hashref"
200	22	100	100			367	unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
201	20					34	for my $k ( qw\| fields data_records \| ) {
202							croak "Value of 'components' element must have '$k' key-value pair"
203	37	100				320	unless exists $args->{components}->{$k};
204							croak "Value of '$k' element must be arrayref"
205							unless (ref($args->{components}->{$k}) and
206	35	100	100			542	reftype($args->{components}->{$k}) eq 'ARRAY');
207							}
208	15					21	for my $row (@{$args->{components}->{data_records}}) {
	15					36
209	173	100	100			993	croak "Each element in 'data_records' array must be arrayref"
210							unless (ref($row) and reftype($row) eq 'ARRAY');
211							}
212	13					37	_prepare_fields($data, $args->{components}->{fields}, 1);
213	9					16	my $these_data_records = $args->{components}->{data_records};
214	9					15	delete $args->{components};
215	9					21	_prepare_data_records($data, $these_data_records, $args);
216							}
217							else {
218							croak "Cannot locate file '$args->{file}'"
219	25	100				680	unless (-f $args->{file});
220	24					103	$data->{file} = delete $args->{file};
221	24					49	$args->{binary} = 1;
222	24	50				137	my $csv = Text::CSV_XS->new ( $args )
223							or croak "Cannot use CSV: ".Text::CSV_XS->error_diag ();
224							open my $IN, "<", $data->{file}
225	24	50				2914	or croak "Unable to open '$data->{file}' for reading";
226	24			4		1133	my $header_ref = $csv->getline($IN);
	4					2621
	4					20925
	4					124
227	24					1283	_prepare_fields($data, $header_ref);
228
229	20					513	my $data_records = $csv->getline_all($IN);
230	20	50				7880	close $IN or croak "Unable to close after reading";
231	20					63	_prepare_data_records($data, $data_records, $args);
232							}
233
234	18					37	while (my ($k,$v) = each %{$args}) {
	31					109
235	13					28	$data->{$k} = $v;
236							}
237	18					91	return bless $data, $class;
238							}
239
240							sub _prepare_fields {
241	37			37		71	my ($data, $fields_ref, $components) = @_;
242	37	100				92	if (! $components) {
243	24					86	path_check_fields($data, $fields_ref);
244	23					115	_check_required_columns($data, $fields_ref);
245							}
246							else { # 'components' interface
247	13					105	components_check_fields($data, $fields_ref);
248	12					26	_check_required_columns($data, $fields_ref);
249							}
250	29					50	$data->{fields} = $fields_ref;
251	29					45	return $data;
252							}
253
254							sub _check_required_columns {
255	35			35		58	my ($data, $fields_ref) = @_;
256	35					53	my %col2idx = map { $fields_ref->[$_] => $_ } (0 .. $#{$fields_ref});
	255					578
	35					79
257	35					109	my %missing_columns = ();
258	35					56	my %main_columns = map { $_ => 1 } ( qw\| id_col parent_id_col leaf_col \| );
	105					252
259	35					110	for my $c ( keys %main_columns ) {
260	105	100				281	if (! exists $col2idx{$data->{$c}}) {
261	6					14	$missing_columns{$c} = $data->{$c};
262							}
263							}
264	35					73	my $error_msg = "Could not locate columns in header to match required arguments:";
265	35					100	for my $c (sort keys %missing_columns) {
266	6					17	$error_msg .= "\n $c: $missing_columns{$c}";
267							}
268	35	100				909	croak $error_msg if scalar keys %missing_columns;
269	29					72	$data->{fields} = $fields_ref;
270	29					60	for my $c (keys %main_columns) {
271	87					260	$data->{$c.'_idx'} = $col2idx{$data->{$c}};
272							}
273	29					116	return $data;
274							}
275
276							sub _prepare_data_records {
277	29			29		52	my ($data, $data_records, $args) = @_;
278							# Confirm no duplicate entries in 'id_col'. DONE
279							# Confirm all rows have same number of columns as header. DONE
280	29					44	my $error_msg = '';
281	29					37	my $field_count = scalar(@{$data->{fields}});
	29					61
282	29					50	my @non_numeric_id_records = ();
283	29					56	my %ids_seen = ();
284	29					37	my @bad_count_records = ();
285	29					45	my @nameless_component_records = ();
286	29					32	for my $rec (@{$data_records}) {
	29					65
287	373	100				1199	if ($rec->[$data->{id_col_idx}] !~ m/^\d+$/) {
288	2					7	push @non_numeric_id_records, [ $rec->[$data->{id_col_idx}], '' ];
289							}
290	373	100	100			1751	if (length($rec->[$data->{parent_id_col_idx}]) and
291							$rec->[$data->{parent_id_col_idx}] !~ m/^\d+$/
292							) {
293							push @non_numeric_id_records, [
294							$rec->[$data->{id_col_idx}],
295	2					6	$rec->[$data->{parent_id_col_idx}]
296							];
297							}
298	373					766	$ids_seen{$rec->[$data->{id_col_idx}]}++;
299	373					383	my $this_row_count = scalar(@{$rec});
	373					515
300	373	100				717	if ($this_row_count != $field_count) {
301							push @bad_count_records,
302	6					14	[ $rec->[$data->{id_col_idx}], $this_row_count ];
303							}
304	373	100				938	if (! length($rec->[$data->{leaf_col_idx}])) {
305	3					7	push @nameless_component_records, $rec->[$data->{id_col_idx}];
306							}
307							}
308	29					98	$error_msg = <
309							Non-numeric entries are not permitted in the '$data->{id_col}' or '$data->{parent_id_col}' columns.
310							The following records each violate this restriction one or two times:
311							NON_NUMERIC_IDS
312	29					57	for my $rec (@non_numeric_id_records) {
313	4					14	$error_msg .= " $data->{id_col}: $rec->[0]\t$data->{parent_id_col}: $rec->[1]\n";
314							}
315	29	100				203	croak $error_msg if @non_numeric_id_records;
316
317	28					51	my @dupe_ids = ();
318	28					198	for my $id (sort keys %ids_seen) {
319	356	100				744	push @dupe_ids, $id if $ids_seen{$id} > 1;
320							}
321	28					91	$error_msg = <
322							No duplicate entries are permitted in the '$data->{id_col}'column.
323							The following entries appear the number of times shown:
324							ERROR_MSG_DUPE
325	28					88	for my $id (@dupe_ids) {
326	4					19	$error_msg .= " $id:" . sprintf(" %6s\n" => $ids_seen{$id});
327							}
328	28	100				331	croak $error_msg if @dupe_ids;
329
330	26					76	$error_msg = <
331							Header row has $field_count columns. The following records
332							(identified by the value in their '$data->{id_col}' columns)
333							have different counts:
334							ERROR_MSG_WRONG_COUNT
335	26					45	for my $rec (@bad_count_records) {
336	6					15	$error_msg .= " $rec->[0]: $rec->[1]\n";
337							}
338	26	100				309	croak $error_msg if @bad_count_records;
339
340	24					57	$error_msg = <
341							Each data record must have a non-empty string in its 'leaf' column.
342							The following records (identified by the value in their '$data->{id_col}' columns)
343							lack valid leaves:
344							NAMELESS_LEAF
345	24					43	for my $rec (@nameless_component_records) {
346	3					6	$error_msg .= " id: $rec\n";
347							}
348	24	100				179	croak $error_msg if @nameless_component_records;
349
350	23					48	my %ids_missing_parents = ();
351	23					27	for my $rec (@{$data_records}) {
	23					45
352	295					415	my $parent_id = $rec->[$data->{parent_id_col_idx}];
353	295	100	100			1147	if ( (length($parent_id)) and (! $ids_seen{$parent_id}) ) {
354	4					11	$ids_missing_parents{$rec->[$data->{id_col_idx}]} = $parent_id;
355							}
356							}
357	23					79	$error_msg = <
358							For each record with a non-null value in the '$data->{parent_id_col}' column,
359							there must be a record with that value in the '$data->{id_col}' column.
360							The following records (identified by the value in their '$data->{id_col}' columns)
361							appear to to have parent IDs which do not have records of their own:
362							ERROR_MSG_MISSING_PARENT
363	23					66	for my $k (sort {$a <=> $b} keys %ids_missing_parents) {
	2					10
364	4					10	$error_msg .= " $k: $ids_missing_parents{$k}\n";
365							}
366	23	100				328	croak $error_msg if scalar keys %ids_missing_parents;
367
368	21					33	my %families = ();
369	21					27	for my $rec (@{$data_records}) {
	21					41
370	269	100				607	if (length($rec->[$data->{parent_id_col_idx}])) {
371	200					581	$families{$rec->[$data->{parent_id_col_idx}]}{$rec->[$data->{leaf_col_idx}]}++;
372							}
373							}
374	21					63	$error_msg = <
375							No record with a non-null value in the '$data->{parent_id_col}' column
376							may have two children with the same value in the '$data->{leaf_col}' column.
377							The following are violations:
378							ERROR_MSG_SIBLINGS_NAMED_SAME
379
380	21					32	my $same_names = 0;
381	21					88	for my $k (sort {$a <=> $b} keys %families) {
	212					300
382	123					142	for my $l (sort keys %{$families{$k}}) {
	123					351
383	195	100				485	if ($families{$k}{$l} > 1) {
384	5					19	$error_msg .= " $data->{parent_id_col}: $k\|$data->{leaf_col}: $l\|count of $data->{leaf_col}s: $families{$k}{$l}\n";
385	5					12	$same_names++;
386							}
387							}
388							}
389	21	100				450	croak $error_msg if $same_names;
390
391	18					47	$data->{data_records} = $data_records;
392	18					236	return $data;
393							}
394
395							=head2 C
396
397							=over 4
398
399							=item * Purpose
400
401							Identify the names of the columns in the taxonomy.
402
403							=item * Arguments
404
405							my $fields = $self->fields();
406
407							No arguments; the information is already inside the object.
408
409							=item * Return Value
410
411							Reference to an array holding a list of the columns as they appear in the
412							header row of the incoming taxonomy file.
413
414							=item * Comment
415
416							Read-only.
417
418							=back
419
420							=head2 C
421
422							=over 4
423
424							=item * Purpose
425
426							Once the taxonomy has been validated, get a list of its data rows as a Perl
427							data structure.
428
429							=item * Arguments
430
431							$data_records = $self->data_records;
432
433							None.
434
435							=item * Return Value
436
437							Reference to array of array references. The array will hold the data records
438							found in the incoming taxonomy file in their order in that file.
439
440							=item * Comment
441
442							Does not contain any information about the fields in the taxonomy, so you
443							should probably either (a) use in conjunction with C method above;
444							or (b) use C.
445
446							=back
447
448							=cut
449
450							=head2 C
451
452							=over 4
453
454							=item * Purpose
455
456							Identify the index position of a given field within the header row.
457
458							=item * Arguments
459
460							$index = $self->get_field_position('income');
461
462							Takes a single string holding the name of one of the fields (column names).
463
464							=item * Return Value
465
466							Integer representing the index position (counting from C<0>) of the field
467							provided as argument. Throws exception if the argument is not actually a
468							field.
469
470							=back
471
472							=cut
473
474							=head2 Accessors
475
476							The following methods provide information about key columns in a
477							Parse::Taxonomy::MaterializedPath object. The key columns are those which hold the
478							ID, parent ID and component information. They take no arguments. The methods
479							whose names end in C<_idx> return integers, as they return the index position
480							of the column in the header row. The other methods return strings.
481
482							$index_of_id_column = $self->id_col_idx;
483
484							$name_of_id_column = $self->id_col;
485
486							$index_of_parent_id_column = $self->parent_id_col_idx;
487
488							$name_of_parent_id_column = $self->parent_id_col;
489
490							$index_of_leaf_column = $self->leaf_col_idx;
491
492							$name_of_leaf_column = $self->leaf_col;
493
494							=cut
495
496							sub id_col_idx {
497	403			403	0	1090	my $self = shift;
498	403					825	return $self->{id_col_idx};
499							}
500
501							sub id_col {
502	101			101	1	126	my $self = shift;
503	101					360	return $self->{id_col};
504							}
505
506							sub parent_id_col_idx {
507	205			205	0	231	my $self = shift;
508	205					476	return $self->{parent_id_col_idx};
509							}
510
511							sub parent_id_col {
512	85			85	1	104	my $self = shift;
513	85					374	return $self->{parent_id_col};
514							}
515
516							sub leaf_col_idx {
517	205			205	0	252	my $self = shift;
518	205					796	return $self->{leaf_col_idx};
519							}
520
521							sub leaf_col {
522	69			69	1	86	my $self = shift;
523	69					250	return $self->{leaf_col};
524							}
525
526							=head2 C
527
528							=over 4
529
530							=item * Purpose
531
532							Generate a new Perl data structure which holds the same information as a
533							Parse::Taxonomy::AdjacentList object but which expresses the route from the
534							root node to a given branch or leaf node as either a separator-delimited
535							string (as in the C column of a Parse::Taxonomy::MaterializedPath object) or
536							as an array reference holding the list of names which delineate that route.
537
538							Another way of expressing this: Transform a taxonomy-by-adjacent-list to a
539							taxonomy-by-materialized-path.
540
541							Example: Suppose we have a CSV file which serves as a taxonomy-by-adjacent-list for
542							this data:
543
544							"id","parent_id","name","is_actionable"
545							"1","","Alpha","0"
546							"2","","Beta","0"
547							"3","1","Epsilon","0"
548							"4","3","Kappa","1"
549							"5","1","Zeta","0"
550							"6","5","Lambda","1"
551							"7","5","Mu","0"
552							"8","2","Eta","1"
553							"9","2","Theta","1"
554
555							Instead of having the route from the root node to a given node be represented
556							B by following Cs up the tree, suppose we want that
557							route to be represented by a string. Assuming that we work with default
558							column names, that would mean representing the information currently spread
559							out among the C, C and C columns in a single C
560							column which, by default, would hold an array reference.
561
562							$source = "./t/data/theta.csv";
563							$self = Parse::Taxonomy::AdjacentList->new( {
564							file => $source,
565							} );
566
567							$taxonomy_with_path_as_array = $self->pathify;
568
569							Yielding:
570
571							[
572							["path", "is_actionable"],
573							[["", "Alpha"], 0],
574							[["", "Beta"], 0],
575							[["", "Alpha", "Epsilon"], 0],
576							[["", "Alpha", "Epsilon", "Kappa"], 1],
577							[["", "Alpha", "Zeta"], 0],
578							[["", "Alpha", "Zeta", "Lambda"], 1],
579							[["", "Alpha", "Zeta", "Mu"], 0],
580							[["", "Beta", "Eta"], 1],
581							[["", "Beta", "Theta"], 1],
582							]
583
584							If we wanted the path information represented as a string rather than an array
585							reference, we would say:
586
587							$taxonomy_with_path_as_string = $self->pathify( { as_string => 1 } );
588
589							Yielding:
590
591							[
592							["path", "is_actionable"],
593							["\|Alpha", 0],
594							["\|Beta", 0],
595							["\|Alpha\|Epsilon", 0],
596							["\|Alpha\|Epsilon\|Kappa", 1],
597							["\|Alpha\|Zeta", 0],
598							["\|Alpha\|Zeta\|Lambda", 1],
599							["\|Alpha\|Zeta\|Mu", 0],
600							["\|Beta\|Eta", 1],
601							["\|Beta\|Theta", 1],
602							]
603
604							If we are providing a true value to the C key, we also get to
605							choose what character to use as the separator in the C column.
606
607							$taxonomy_with_path_as_string_different_path_col_sep =
608							$self->pathify( {
609							as_string => 1,
610							path_col_sep => '~~',
611							} );
612
613							Yields:
614
615							[
616							["path", "is_actionable"],
617							["~~Alpha", 0],
618							["~~Beta", 0],
619							["~~Alpha~~Epsilon", 0],
620							["~~Alpha~~Epsilon~~Kappa", 1],
621							["~~Alpha~~Zeta", 0],
622							["~~Alpha~~Zeta~~Lambda", 1],
623							["~~Alpha~~Zeta~~Mu", 0],
624							["~~Beta~~Eta", 1],
625							["~~Beta~~Theta", 1],
626							]
627
628							Finally, should we want the C column in the returned arrayref to be
629							named something other than I, we can provide a value to the C
630							key.
631
632							[
633							["foo", "is_actionable"],
634							[["", "Alpha"], 0],
635							[["", "Beta"], 0],
636							[["", "Alpha", "Epsilon"], 0],
637							[["", "Alpha", "Epsilon", "Kappa"], 1],
638							[["", "Alpha", "Zeta"], 0],
639							[["", "Alpha", "Zeta", "Lambda"], 1],
640							[["", "Alpha", "Zeta", "Mu"], 0],
641							[["", "Beta", "Eta"], 1],
642							[["", "Beta", "Theta"], 1],
643							]
644
645							item * Arguments
646
647							Optional single hash reference. If provided, the following keys may be used:
648
649							=over 4
650
651							=item * C
652
653							User-supplied name for column holding path information in the returned array
654							reference. Defaults to C.
655
656							=item * C
657
658							Boolean. If supplied with a true value, path information will be represented
659							as a separator-delimited string rather than an array reference.
660
661							=item * C
662
663							User-supplied string to be used to separate the parts of the route when
664							C is called with a true value. Not meaningful unless C
665							is true.
666
667							=back
668
669							=item * Return Value
670
671							Reference to an array of array references. The first element in the array
672							will be a reference to an array of field names. Each succeeding element will
673							be a reference to an array holding data for one record in the original
674							taxonomy. The path data will be represented, by default, as an array
675							reference built up from the component (C) column in the original
676							taxonomy, but if C is selected, the path data in all non-header
677							elements will be a separator-delimited string.
678
679							=back
680
681							=cut
682
683							sub pathify {
684	20			20	1	20056	my ($self, $args) = @_;
685	20	100				96	if (defined $args) {
686	12	100	100			88	unless (ref($args) and (reftype($args) eq 'HASH')) {
687	2					269	croak "Argument to pathify() must be hash ref";
688							}
689	10					17	my %permissible_args = map { $_ => 1 } ( qw\| path_col as_string path_col_sep \| );
	30					77
690	10					16	for my $k (keys %{$args}) {
	10					56
691							croak "'$k' is not a recognized key for pathify() argument hashref"
692	14	100				136	unless $permissible_args{$k};
693							}
694	9	100	100			45	if ($args->{path_col_sep} and not $args->{as_string}) {
695	1					96	croak "Supplying a value for key 'path_col_step' is only valid when also supplying true value for 'as_string'";
696							}
697							}
698	16	100				58	$args->{path_col} = defined($args->{path_col}) ? $args->{path_col} : 'path';
699	16	100				41	if ($args->{as_string}) {
700	6	100				19	$args->{path_col_sep} = defined($args->{path_col_sep}) ? $args->{path_col_sep} : '\|';
701							}
702
703	16					29	my @rewritten = ();
704	16					26	my @fields_in = @{$self->fields};
	16					70
705	16					37	my @fields_out = ( $args->{path_col} );
706	16					32	for my $f (@fields_in) {
707	94	100	100			184	unless (
			100
708							($f eq $self->id_col) or
709							($f eq $self->parent_id_col) or
710							($f eq $self->leaf_col)
711							) {
712	46					104	push @fields_out, $f;
713							}
714							}
715	16					30	push @rewritten, \@fields_out;
716
717	16					39	my %colsin2idx = map { $fields_in[$_] => $_ } (0 .. $#fields_in);
	94					218
718
719	198					402	my %hashed_data = map { $_->[$self->id_col_idx] => {
720							parent_id => $_->[$self->parent_id_col_idx],
721							leaf => $_->[$self->leaf_col_idx],
722	16					36	} } @{$self->data_records};
	16					58
723
724	16					48	my @this_path = ();
725	16					19	my $code;
726							$code = sub {
727	447			447		609	my $id = shift;
728	447					811	push @this_path, $hashed_data{$id}{leaf};
729	447					662	my $parent_id = $hashed_data{$id}{parent_id};
730	447	100				711	if (length($parent_id)) {
731	249					278	&{$code}($parent_id);
	249					502
732							}
733							else {
734	198					306	push @this_path, '';
735							}
736	447					650	return;
737	16					63	};
738	16					25	for my $rec (@{$self->data_records}) {
	16					42
739	198					213	my @new_record;
740	198					394	&{$code}($rec->[$self->id_col_idx]);
	198					372
741	198					485	my $path_as_array_ref = [ reverse @this_path ];
742	198	100				414	if ($args->{as_string}) {
743							push @new_record,
744	62					84	join($args->{path_col_sep} => @{$path_as_array_ref});
	62					155
745							}
746							else {
747	136					221	push @new_record, $path_as_array_ref;
748							}
749	198					363	for my $f (grep { $_ ne $args->{path_col} } @fields_out) {
	804					1514
750	606					1258	push @new_record, $rec->[$colsin2idx{$f}];
751							}
752	198					317	push @rewritten, \@new_record;
753	198					457	@this_path = ();
754							}
755	16					83	return \@rewritten;
756							}
757
758							=head2 C
759
760							=over 4
761
762							=item * Purpose
763
764							Create a CSV-formatted file holding the data returned by C.
765
766							=item * Arguments
767
768							$csv_file = $self->write_pathified_to_csv( {
769							pathified => $pathified, # output of pathify()
770							csvfile => './t/data/taxonomy_out5.csv',
771							} );
772
773							Single hash reference. That hash is keyed on:
774
775							=over 4
776
777							=item * C
778
779							B Its value must be the arrayref of hash references returned by
780							the C method.
781
782							=item * C
783
784							Optional. Path to location where a CSV-formatted text file holding the
785							taxonomy-by-adjacent-list will be written. Defaults to a file called
786							F in the current working directory.
787
788							=item * Text::CSV_XS options
789
790							You can also pass through any key-value pairs normally accepted by
791							F.
792
793							=back
794
795							=item * Return Value
796
797							Returns path to CSV-formatted text file just created.
798
799							=item * Example
800
801							Suppose we have a CSV-formatted file holding the following taxonomy-by-adjacent-list:
802
803							"id","parent_id","name","is_actionable"
804							"1","","Alpha","0"
805							"2","","Beta","0"
806							"3","1","Epsilon","0"
807							"4","3","Kappa","1"
808							"5","1","Zeta","0"
809							"6","5","Lambda","1"
810							"7","5","Mu","0"
811							"8","2","Eta","1"
812							"9","2","Theta","1"
813
814							After running this file through C, C and
815							C we will have a new CSV-formatted file holding
816							this taxonomy-by-materialized-path:
817
818							path,is_actionable
819							\|Alpha,0
820							\|Beta,0
821							\|Alpha\|Epsilon,0
822							\|Alpha\|Epsilon\|Kappa,1
823							\|Alpha\|Zeta,0
824							\|Alpha\|Zeta\|Lambda,1
825							\|Alpha\|Zeta\|Mu,0
826							\|Beta\|Eta,1
827							\|Beta\|Theta,1
828
829							Note that the C, C and C columns have been replaced by the column.
830
831							=back
832
833							=cut
834
835							sub write_pathified_to_csv {
836	8			8	1	3175	my ($self, $args) = @_;
837	8	100				17	if (defined $args) {
838	7	100	100			241	croak "Argument to 'pathify()' must be hashref"
839							unless (ref($args) and reftype($args) eq 'HASH');
840							croak "Argument to 'pathify()' must have 'pathified' element"
841	5	100				133	unless exists $args->{pathified};
842							croak "Argument 'pathified' must be array reference"
843							unless (ref($args->{pathified}) and
844	4	100	100			222	reftype($args->{pathified}) eq 'ARRAY');
845							}
846							else {
847	1					155	croak "write_pathified_to_csv() must be supplied with hashref"
848							}
849	2					4	my $pathified = $args->{pathified};
850	2					5	delete $args->{pathified};
851
852							# Test whether we're working with first element array ref or first element
853							# string
854	2	100				7	my $path_as_string = (! ref($pathified->[1]->[0])) ? 1 : 0;
855
856	2					9	my $columns_in = $self->fields;
857	6					17	my %path_columns = map {$_ => 1} (
858							$self->{id_col},
859							$self->{parent_id_col},
860							$self->{leaf_col},
861	2					7	);
862							my @non_path_columns_in =
863	2					6	map { $columns_in->[$_] }
864	8					22	grep { ! $path_columns{$columns_in->[$_]} }
865	2					6	(0..$#{$columns_in});
	2					6
866	2					7	my @columns_out = (qw\| path \|);
867	2					4	push @columns_out, @non_path_columns_in;
868
869	2					8954	my $cwd = cwd();
870							my $csvfile = defined($args->{csvfile})
871							? $args->{csvfile}
872	2	100				34	: "$cwd/taxonomy_out.csv";
873	2					15	delete $args->{csvfile};
874
875							# By this point, we should have processed all args other than those
876							# intended for Text::CSV_XS and assigned their contents to variables as
877							# needed.
878
879	2					23	my $csv_args = { binary => 1 };
880	2					8	while (my ($k,$v) = each %{$args}) {
	4					32
881	2					13	$csv_args->{$k} = $v;
882							}
883	2					56	my $csv = Text::CSV_XS->new($csv_args);
884	2	50				665	open my $OUT, ">:encoding(utf8)", $csvfile
885							or croak "Unable to open $csvfile for writing";
886	2	100				13444	$csv->eol(defined($csv_args->{eol}) ? $csv_args->{eol} : "\n");
887	2					118	$csv->print($OUT, [@columns_out]);
888	2					28	for my $rec (@{$pathified}[1..$#{$pathified}]) {
	2					12
	2					8
889							$csv->print(
890							$OUT,
891							$path_as_string
892							? $rec
893							: [
894	9					32	join('\|' => @{$rec->[0]}),
895	18	100				228	@{$rec}[1..$#columns_out]
	9					52
896							]
897							);
898							}
899	2	50				138	close $OUT or croak "Unable to close $csvfile after writing";
900
901	2					58	return $csvfile;
902							}
903
904							1;
905
906							# vim: formatoptions=crqot