File Coverage

lib/Parse/File/Taxonomy/Path.pm

Criterion	Covered	Total	%
statement	19	21	90.4
branch			n/a
condition			n/a
subroutine	7	7	100.0
pod			n/a
total	26	28	92.8

line	stmt	sub	time	code
1				package Parse::File::Taxonomy::Path;
2	5	5	8787	use strict;
	5		8
	5		177
3	5	5	1755	use parent qw( Parse::File::Taxonomy );
	5		1022
	5		20
4	5	5	232	use Carp;
	5		6
	5		243
5	5	5	19	use Text::CSV;
	5		6
	5		22
6	5	5	87	use Scalar::Util qw( reftype );
	5		32
	5		242
7				our $VERSION = '0.03';
8	5		264	use Parse::File::Taxonomy::Auxiliary qw(
9				path_check_fields
10				components_check_fields
11	5	5	1147	);
	5		14
12	5	5	4560	use Data::Dump;
	0
	0
13
14				=head1 NAME
15
16				Parse::File::Taxonomy::Path - Validate a file for use as a path-based taxonomy
17
18				=head1 SYNOPSIS
19
20				use Parse::File::Taxonomy::Path;
21
22				# 'file' interface: reads a CSV file for you
23
24				$source = "./t/data/alpha.csv";
25				$obj = Parse::File::Taxonomy::Path->new( {
26				file => $source,
27				} );
28
29				# 'components' interface: as if you've already read a
30				# CSV file and now have Perl array references to header and data rows
31
32				$obj = Parse::File::Taxonomy::Path->new( {
33				components => {
34				fields => $fields,
35				data_records => $data_records,
36				}
37				} );
38
39				=head1 METHODS
40
41				=head2 C
42
43				=over 4
44
45				=item * Purpose
46
47				Parse::File::Taxonomy::Path constructor.
48
49				=item * Arguments
50
51				Single hash reference. There are two possible interfaces: C and C.
52
53				=over 4
54
55				=item 1 C interface
56
57				$source = "./t/data/alpha.csv";
58				$obj = Parse::File::Taxonomy::Path->new( {
59				file => $source,
60				path_col_idx => 0,
61				path_col_sep => '\|',
62				%TextCSVoptions,
63				} );
64
65				Elements in the hash reference are keyed on:
66
67				=over 4
68
69				=item * C
70
71				Absolute or relative path to the incoming taxonomy file.
72				B for this interface.
73
74				=item * C
75
76				If the column to be used as the "path" column in the incoming taxonomy file is
77				B the first column, this option must be set to the integer representing
78				the "path" column's index position (count starts at 0). Optional; defaults to C<0>.
79
80				=item * C
81
82				If the string used to distinguish components of the path in the path column in
83				the incoming taxonomy file is not a pipe (C<\|>), this option must be set.
84				Optional; defaults to C<\|>.
85
86				=item * Text::CSV options
87
88				Any other options which could normally be passed to Cnew()> will
89				be passed through to that module's constructor. On the recommendation of the
90				Text::CSV documentation, C is always set to a true value.
91
92				=back
93
94				=item 2 C interface
95
96				$obj = Parse::File::Taxonomy::Path->new( {
97				components => {
98				fields => $fields,
99				data_records => $data_records,
100				}
101				} );
102
103				Elements in this hash are keyed on:
104
105				=over 4
106
107				=item * C
108
109				This element is B for the
110				C interface. The value of this element is a hash reference with two keys, C and
111				C. C is a reference to an array holding the field or
112				column names for the data set. C is a reference to an array of
113				array references, each of the latter arrayrefs holding one record or row from
114				the data set.
115
116				=item * C
117
118				Same as in C interface above.
119
120				=item * C
121
122				Same as in C interface above.
123
124				=back
125
126				=back
127
128				=item * Return Value
129
130				Parse::File::Taxonomy::Path object.
131
132				=item * Comment
133
134				C will throw an exception under any of the following conditions:
135
136				=over 4
137
138				=item * Argument to C is not a reference.
139
140				=item * Argument to C is not a hash reference.
141
142				=item * In the C interface, unable to locate the file which is the value of the C element.
143
144				=item * Argument to C element is not an integer.
145
146				=item * Argument to C is greater than the index number of the
147				last element in the header row of the incoming taxonomy file, I the
148				C is wrong.
149
150				=item * The same field is found more than once in the header row of the
151				incoming taxonomy file.
152
153				=item * Unable to open or close the incoming taxonomy file for reading.
154
155				=item * In the column designated as the "path" column, the same value is
156				observed more than once.
157
158				=item * A non-parent node's parent node cannot be located in the incoming taxonomy file.
159
160				=item * A data row has a number of fields different from the number of fields
161				in the header row.
162
163				=back
164
165				=back
166
167				=cut
168
169				sub new {
170				my ($class, $args) = @_;
171				my $data;
172
173				croak "Argument to 'new()' must be hashref"
174				unless (ref($args) and reftype($args) eq 'HASH');
175				croak "Argument to 'new()' must have either 'file' or 'components' element"
176				unless ($args->{file} or $args->{components});
177				croak "Argument to 'new()' must have either 'file' or 'components' element but not both"
178				if ($args->{file} and $args->{components});
179
180				if (exists $args->{path_col_idx}) {
181				croak "Argument to 'path_col_idx' must be integer"
182				unless $args->{path_col_idx} =~ m/^\d+$/;
183				}
184				$data->{path_col_idx} = delete $args->{path_col_idx} \|\| 0;
185				$data->{path_col_sep} = exists $args->{path_col_sep}
186				? $args->{path_col_sep}
187				: '\|';
188				if (exists $args->{path_col_sep}) {
189				$data->{path_col_sep} = $args->{path_col_sep};
190				delete $args->{path_col_sep};
191				}
192				else {
193				$data->{path_col_sep} = '\|';
194				}
195
196				if ($args->{components}) {
197				croak "Value of 'components' element must be hashref"
198				unless (ref($args->{components}) and reftype($args->{components}) eq 'HASH');
199				for my $k ( qw\| fields data_records \| ) {
200				croak "Value of 'components' element must have '$k' key-value pair"
201				unless exists $args->{components}->{$k};
202				croak "Value of '$k' element must be arrayref"
203				unless (ref($args->{components}->{$k}) and
204				reftype($args->{components}->{$k}) eq 'ARRAY');
205				}
206				for my $row (@{$args->{components}->{data_records}}) {
207				croak "Each element in 'data_records' array must be arrayref"
208				unless (ref($row) and reftype($row) eq 'ARRAY');
209				}
210				# We don't want to stick $args->{components} into the object as is.
211				# Rather, we want to insert 'fields' and 'data_records' for
212				# consistency with the 'file' interface. But to do that we first need
213				# to impose the same validations that we do for the 'file' interface.
214				# We also need to populate 'path_col'.
215				_prepare_fields($data, $args->{components}->{fields}, 1);
216				my $these_data_records = $args->{components}->{data_records};
217				delete $args->{components};
218				_prepare_data_records($data, $these_data_records, $args);
219				}
220				else {
221				croak "Cannot locate file '$args->{file}'"
222				unless (-f $args->{file});
223				$data->{file} = delete $args->{file};
224
225				# We've now handled all the Parse::File::Taxonomy::Path-specific options.
226				# Any remaining options are assumed to be intended for Text::CSV::new().
227
228				$args->{binary} = 1;
229				my $csv = Text::CSV->new ( $args )
230				or croak "Cannot use CSV: ".Text::CSV->error_diag ();
231				open my $IN, "<", $data->{file}
232				or croak "Unable to open '$data->{file}' for reading";
233				my $header_ref = $csv->getline($IN);
234
235				_prepare_fields($data, $header_ref);
236				my $data_records = $csv->getline_all($IN);
237				close $IN or croak "Unable to close after reading";
238				_prepare_data_records($data, $data_records, $args);
239				}
240
241				while (my ($k,$v) = each %{$args}) {
242				$data->{$k} = $v;
243				}
244				return bless $data, $class;
245				}
246
247				sub _prepare_fields {
248				my ($data, $fields_ref, $components) = @_;
249				if (! $components) {
250				_check_path_col_idx($data, $fields_ref, 0);
251				path_check_fields($data, $fields_ref);
252				}
253				else {
254				_check_path_col_idx($data, $fields_ref, 1);
255				components_check_fields($data, $fields_ref);
256				}
257				$data->{fields} = $fields_ref;
258				$data->{path_col} = $data->{fields}->[$data->{path_col_idx}];
259				return $data;
260				}
261
262				sub _check_path_col_idx {
263				my ($data, $fields_ref, $components) = @_;
264				my $error_msg = "Argument to 'path_col_idx' exceeds index of last field in ";
265				$error_msg .= $components
266				? "'fields' array ref"
267				: "header row in '$data->{file}'";
268
269				croak $error_msg if $data->{path_col_idx} > $#{$fields_ref};
270				}
271
272				sub _prepare_data_records {
273				# Confirm no duplicate entries in column holding path:
274				# Confirm all rows have same number of columns as header:
275				my ($data, $data_records, $args) = @_;
276				my @bad_count_records = ();
277				my %paths_seen = ();
278				my $field_count = scalar(@{$data->{fields}});
279				for my $rec (@{$data_records}) {
280				$paths_seen{$rec->[$data->{path_col_idx}]}++;
281				my $this_row_count = scalar(@{$rec});
282				if ($this_row_count != $field_count) {
283				push @bad_count_records,
284				[ $rec->[$data->{path_col_idx}], $this_row_count ];
285				}
286				}
287				my @dupe_paths = ();
288				for my $path (sort keys %paths_seen) {
289				push @dupe_paths, $path if $paths_seen{$path} > 1;
290				}
291				my $error_msg = <
292				No duplicate entries are permitted in column designated as path.
293				The following entries appear the number of times shown:
294				ERROR_MSG_DUPE
295				for my $path (@dupe_paths) {
296				$error_msg .= " $path:" . sprintf(" %6s\n" => $paths_seen{$path});
297				}
298				croak $error_msg if @dupe_paths;
299
300				$error_msg = <
301				Header row has $field_count records. The following records had different counts:
302				ERROR_MSG_WRONG_COUNT
303				for my $rec (@bad_count_records) {
304				$error_msg .= " $rec->[0]: $rec->[1]\n";
305				}
306				croak $error_msg if @bad_count_records;
307
308				# Confirm each node appears in taxonomy:
309				my $path_args = { map { $_ => $args->{$_} } keys %{$args} };
310				$path_args->{sep_char} = $data->{path_col_sep};
311				my $path_csv = Text::CSV->new ( $path_args )
312				or croak "Cannot use CSV: ".Text::CSV->error_diag ();
313				my %missing_parents = ();
314				for my $path (sort keys %paths_seen) {
315				my $status = $path_csv->parse($path);
316				my @columns = $path_csv->fields();
317				if (@columns > 2) {
318				my $parent =
319				join($path_args->{sep_char} => @columns[0 .. ($#columns - 1)]);
320				unless (exists $paths_seen{$parent}) {
321				$missing_parents{$path} = $parent;
322				}
323				}
324				}
325				$error_msg = <
326				Each node in the taxonomy must have a parent.
327				The following nodes lack the expected parent:
328				ERROR_MSG_ORPHAN
329				for my $path (sort keys %missing_parents) {
330				$error_msg .= " $path: $missing_parents{$path}\n";
331				}
332				croak $error_msg if scalar(keys %missing_parents);
333				# BBB end of validations
334				$data->{data_records} = $data_records;
335
336				return $data;
337				}
338
339				=head2 C
340
341				=over 4
342
343				=item * Purpose
344
345				Identify the names of the columns in the taxonomy.
346
347				=item * Arguments
348
349				my $fields = $self->fields();
350
351				No arguments; the information is already inside the object.
352
353				=item * Return Value
354
355				Reference to an array holding a list of the columns as they appear in the
356				header row of the incoming taxonomy file.
357
358				=item * Comment
359
360				Read-only.
361
362				=back
363
364				# Implemented in lib/Parse/File/Taxonomy.pm
365
366				=head2 C
367
368				=over 4
369
370				=item * Purpose
371
372				Identify the index position (count starts at 0) of the column in the incoming
373				taxonomy file which serves as the path column.
374
375				=item * Arguments
376
377				my $path_col_idx = $self->path_col_idx;
378
379				No arguments; the information is already inside the object.
380
381				=item * Return Value
382
383				Integer in the range from 0 to 1 less than the number of columns in the header
384				row.
385
386				=item * Comment
387
388				Read-only.
389
390				=back
391
392				=cut
393
394				sub path_col_idx {
395				my $self = shift;
396				return $self->{path_col_idx};
397				}
398
399				=head2 C
400
401				=over 4
402
403				=item * Purpose
404
405				Identify the name of the column in the incoming taxonomy which serves as the
406				path column.
407
408				=item * Arguments
409
410				my $path_col = $self->path_col;
411
412				No arguments; the information is already inside the object.
413
414				=item * Return Value
415
416				String.
417
418				=item * Comment
419
420				Read-only.
421
422				=back
423
424				=cut
425
426				sub path_col {
427				my $self = shift;
428				return $self->{path_col};
429				}
430
431				=head2 C
432
433				=over 4
434
435				=item * Purpose
436
437				Identify the string used to separate path components once the taxonomy has
438				been created. This is just a "getter" and is logically distinct from the
439				option to C which is, in effect, a "setter."
440
441				=item * Arguments
442
443				my $path_col_sep = $self->path_col_sep;
444
445				No arguments; the information is already inside the object.
446
447				=item * Return Value
448
449				String.
450
451				=item * Comment
452
453				Read-only.
454
455				=back
456
457				=cut
458
459				sub path_col_sep {
460				my $self = shift;
461				return $self->{path_col_sep};
462				}
463
464				=head2 C
465
466				=over 4
467
468				=item * Purpose
469
470				Once the taxonomy has been validated, get a list of its data rows as a Perl
471				data structure.
472
473				=item * Arguments
474
475				$data_records = $self->data_records;
476
477				None.
478
479				=item * Return Value
480
481				Reference to array of array references. The array will hold the data records
482				found in the incoming taxonomy file in their order in that file.
483
484				=item * Comment
485
486				Does not contain any information about the fields in the taxonomy, so you
487				should probably either (a) use in conjunction with C method above;
488				or (b) use C.
489
490				=back
491
492				# Implemented in lib/Parse/File/Taxonomy.pm
493
494				=head2 C
495
496				=over 4
497
498				=item * Purpose
499
500				Once the taxonomy has been validated, get a list of its header and data rows as a Perl
501				data structure.
502
503				=item * Arguments
504
505				$data_records = $self->fields_and_data_records;
506
507				None.
508
509				=item * Return Value
510
511				Reference to array of array references. The first element in the array will
512				hold the header row (same as output of C). The remaining elements
513				will hold the data records found in the incoming taxonomy file in their order
514				in that file.
515
516				=back
517
518				=cut
519
520				# Implemented in lib/Parse/File/Taxonomy.pm
521
522				=head2 C
523
524				=over 4
525
526				=item * Purpose
527
528				Once the taxonomy has been validated, get a list of its data rows as a Perl
529				data structure. In each element of this list, the path is now represented as
530				an array reference rather than a string.
531
532				=item * Arguments
533
534				$data_records_path_components = $self->data_records_path_components;
535
536				None.
537
538				=item * Return Value
539
540				Reference to array of array references. The array will hold the data records
541				found in the incoming taxonomy file in their order in that file.
542
543				=item * Comment
544
545				Does not contain any information about the fields in the taxonomy, so you may
546				wish to use this method either (a) use in conjunction with C method
547				above; or (b) use C.
548
549				=back
550
551				=cut
552
553				sub data_records_path_components {
554				my $self = shift;
555				my @all_rows = ();
556				for my $row (@{$self->{data_records}}) {
557				my $path_col = $row->[$self->{path_col_idx}];
558				my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
559				my @rewritten = ();
560				for (my $i = 0; $i <= $#{$row}; $i++) {
561				if ($i != $self->{path_col_idx}) {
562				push @rewritten, $row->[$i];
563				}
564				else {
565				push @rewritten, \@path_components;
566				}
567				}
568				push @all_rows, \@rewritten;
569				}
570				return \@all_rows;
571				}
572
573				=head2 C
574
575				=over 4
576
577				=item * Purpose
578
579				Once the taxonomy has been validated, get a list of its data rows as a Perl
580				data structure. The first element in this list is an array reference holding
581				the header row. In each data element of this list, the path is now represented as
582				an array reference rather than a string.
583
584				=item * Arguments
585
586				$fields_and_data_records_path_components = $self->fields_and_data_records_path_components;
587
588				None.
589
590				=item * Return Value
591
592				Reference to array of array references. The array will hold the data records
593				found in the incoming taxonomy file in their order in that file.
594
595				=back
596
597				=cut
598
599				sub fields_and_data_records_path_components {
600				my $self = shift;
601				my @all_rows = $self->fields;
602				for my $row (@{$self->{data_records}}) {
603				my $path_col = $row->[$self->{path_col_idx}];
604				my @path_components = split(/\Q$self->{path_col_sep}\E/, $path_col);
605				my @rewritten = ();
606				for (my $i = 0; $i <= $#{$row}; $i++) {
607				if ($i != $self->{path_col_idx}) {
608				push @rewritten, $row->[$i];
609				}
610				else {
611				push @rewritten, \@path_components;
612				}
613				}
614				push @all_rows, \@rewritten;
615				}
616				return \@all_rows;
617				}
618
619				=head2 C
620
621				=over 4
622
623				=item * Purpose
624
625				Identify the index position of a given field within the header row.
626
627				=item * Arguments
628
629				$index = $obj->get_field_position('income');
630
631				Takes a single string holding the name of one of the fields (column names).
632
633				=item * Return Value
634
635				Integer representing the index position (counting from C<0>) of the field
636				provided as argument. Throws exception if the argument is not actually a
637				field.
638
639				=back
640
641				=cut
642
643				# Implemented in lib/Parse/File/Taxonomy.pm
644
645				=head2 C
646
647				=over 4
648
649				=item * Purpose
650
651				Display the number of descendant (multi-generational) nodes each node in the
652				taxonomy has.
653
654				=item * Arguments
655
656				$child_counts = $self->child_counts();
657
658				None.
659
660				=item * Return Value
661
662				Reference to hash in which each element is keyed on the value of the path
663				column in the incoming taxonomy file.
664
665				=back
666
667				=cut
668
669				sub child_counts {
670				my $self = shift;
671				my %child_counts = map { $_->[$self->{path_col_idx}] => 0 } @{$self->{data_records}};
672				for my $node (keys %child_counts) {
673				for my $other_node ( grep { ! m/^\Q$node\E$/ } keys %child_counts) {
674				$child_counts{$node}++
675				if $other_node =~ m/^\Q$node$self->{path_col_sep}\E/;
676				}
677				}
678				return \%child_counts;
679				}
680
681				=head2 C
682
683				=over 4
684
685				=item * Purpose
686
687				Get the total number of descendant nodes for one specific node in a validated
688				taxonomy.
689
690				=item * Arguments
691
692				$child_count = $self->get_child_count('\|Path\|To\|Node');
693
694				String containing node's path as spelled in the taxonomy.
695
696				=item * Return Value
697
698				Unsigned integer >= 0. Any node whose child count is C<0> is by definition a
699				leaf node.
700
701				=item * Comment
702
703				Will throw an exception if the node does not exist or is misspelled.
704
705				=back
706
707				=cut
708
709				sub get_child_count {
710				my ($self, $node) = @_;
711				my $child_counts = $self->child_counts();
712				croak "Node '$node' not found" unless exists $child_counts->{$node};
713				return $child_counts->{$node};
714				}
715
716				=head2 C
717
718				=over 4
719
720				=item * Purpose
721
722				Turn a validated taxonomy into a Perl hash keyed on the column designated as
723				the path column.
724
725				=item * Arguments
726
727				$hashref = $self->hashify_taxonomy();
728
729				Takes an optional hashref holding a list of any of the following elements:
730
731				=over 4
732
733				=item * C
734
735				Boolean, defaulting to C<0>. By default, C will spell the
736				key of the hash exactly as the value of the path column is spelled in the
737				taxonomy -- which in turn is the way it was spelled in the incoming file.
738				That is, a path in the taxonomy spelled C<\|Alpha\|Beta\|Gamma> will be spelled
739				as a key in exactly the same way.
740
741				However, since in many cases (including the example above) the root node of
742				the taxonomy will be empty, the user may wish to remove the first instance of
743				C. The user would do so by setting
744				C to a true value.
745
746				$hashref = $self->hashify_taxonomy( {
747				remove_leading_path_col_sep => 1,
748				} );
749
750				In that case they key would now be spelled: C.
751
752				Note further that if the C switch is set to a true value, any
753				setting to C will be ignored.
754
755				=item * C
756
757				A string which will be used in composing the key of the hashref returned by
758				this method. The user may select this key if she does not want to use the
759				value found in the incoming CSV file (which by default will be the pipe
760				character (C<\|>) and which may be overridden with the C argument
761				to C.
762
763				$hashref = $self->hashify_taxonomy( {
764				key_delim => q{ - },
765				} );
766
767				In the above variant, a path that in the incoming taxonomy file was
768				represented by C<\|Alpha\|Beta\|Gamma> will in C<$hashref> be represented by
769				C< - Alpha - Beta - Gamma>.
770
771				=item * C
772
773				A string which will be used in composing the key of the hashref returned by
774				this method. The user will set this switch if she wishes to have the root
775				note explicitly represented. Using this switch will automatically cause
776				C to be ignored.
777
778				Suppose the user wished to have C be the text for the root
779				node. Suppose further that the user wanted to use the string C< - > as the
780				delimiter within the key.
781
782				$hashref = $self->hashify_taxonomy( {
783				root_str => q{All Suppliers},
784				key_delim => q{ - },
785				} );
786
787				Then incoming path C<\|Alpha\|Beta\|Gamma> would be keyed as:
788
789				All Suppliers - Alpha - Beta - Gamma
790
791				=back
792
793				=item * Return Value
794
795				Hash reference. The number of elements in this hash should be equal to the
796				number of non-header records in the taxonomy.
797
798				=back
799
800				=cut
801
802				sub hashify_taxonomy {
803				my ($self, $args) = @_;
804				if (defined $args) {
805				croak "Argument to 'new()' must be hashref"
806				unless (ref($args) and reftype($args) eq 'HASH');
807				}
808				my %hashified = ();
809				my $fields = $self->{fields};
810				my %idx2col = map { $_ => $fields->[$_] } (0 .. $#{$fields});
811				for my $rec (@{$self->{data_records}}) {
812				my $rowkey;
813				if ($args->{root_str}) {
814				$rowkey = $args->{root_str} . $rec->[$self->{path_col_idx}];
815				}
816				else {
817				if ($args->{remove_leading_path_col_sep}) {
818				($rowkey = $rec->[$self->{path_col_idx}]) =~ s/^\Q$self->{path_col_sep}\E(.*)/$1/;
819				}
820				else {
821				$rowkey = $rec->[$self->{path_col_idx}];
822				}
823				}
824				if ($args->{key_delim}) {
825				$rowkey =~ s/\Q$self->{path_col_sep}\E/$args->{key_delim}/g;
826				}
827				my $rowdata = { map { $idx2col{$_} => $rec->[$_] } (0 .. $#{$fields}) };
828				$hashified{$rowkey} = $rowdata;
829				}
830				return \%hashified;
831				}
832
833				1;
834
835				# vim: formatoptions=crqot