File Coverage

blib/lib/Text/CSV/Hashify.pm
Criterion Covered Total %
statement 113 113 100.0
branch 61 64 95.3
condition 26 26 100.0
subroutine 17 17 100.0
pod 6 7 85.7
total 223 227 98.2


line stmt bran cond sub pod time code
1             package Text::CSV::Hashify;
2 6     6   6252 use strict;
  6         49  
  6         177  
3 6     6   125 use 5.8.0;
  6         25  
4 6     6   34 use Carp;
  6         16  
  6         337  
5 6     6   3045 use IO::File;
  6         54574  
  6         736  
6 6     6   3464 use IO::Zlib;
  6         383886  
  6         42  
7 6     6   375 use Scalar::Util qw( reftype looks_like_number );
  6         16  
  6         342  
8 6     6   4327 use Text::CSV;
  6         84352  
  6         308  
9 6     6   2632 use open qw( :encoding(UTF-8) :std );
  6         6838  
  6         41  
10              
11             BEGIN {
12 6     6   110664 use Exporter ();
  6         13  
  6         375  
13 6     6   18 our ($VERSION, @ISA, @EXPORT);
14 6         28 $VERSION = '0.11';
15 6         147 @ISA = qw(Exporter);
16 6         9120 @EXPORT = qw( hashify );
17             }
18              
19             =head1 NAME
20              
21             Text::CSV::Hashify - Turn a CSV file into a Perl hash
22              
23             =head1 VERSION
24              
25             This document refers to version 0.11 of Text::CSV::Hashify. This version was
26             released May 22 2018.
27              
28             =head1 SYNOPSIS
29              
30             # Simple functional interface
31             use Text::CSV::Hashify;
32             $hash_ref = hashify('/path/to/file.csv', 'primary_key');
33              
34             # Object-oriented interface
35             use Text::CSV::Hashify;
36             $obj = Text::CSV::Hashify->new( {
37             file => '/path/to/file.csv',
38             format => 'hoh', # hash of hashes, which is default
39             key => 'id', # needed except when format is 'aoh'
40             max_rows => 20, # number of records to read; defaults to all
41             ... # other key-value pairs as appropriate from Text::CSV
42             } );
43              
44             # all records requested
45             $hash_ref = $obj->all;
46              
47             # arrayref of fields input
48             $fields_ref = $obj->fields;
49              
50             # hashref of specified record
51             $record_ref = $obj->record('value_of_key');
52              
53             # value of one field in one record
54             $datum = $obj->datum('value_of_key', 'field');
55              
56             # arrayref of all unique keys seen
57             $keys_ref = $obj->keys;
58              
59             =head1 DESCRIPTION
60              
61             The Comma-Separated-Value ('CSV') format is the most common way to store
62             spreadsheets or the output of relational database queries in plain-text
63             format. However, since commas (or other designated field-separator
64             characters) may be embedded within data entries, the parsing of delimited
65             records is non-trivial. Fortunately, in Perl this parsing is well handled by
66             CPAN distribution L<Text::CSV>. This
67             permits us to address more specific data manipulation problems by building
68             modules on top of F<Text::CSV>.
69              
70             B<Note:> In this document we will use I<CSV> as a catch-all for tab-delimited
71             files, pipe-delimited files, and so forth. Please refer to the documentation
72             for Text::CSV to learn how to handle field separator characters other than the
73             comma.
74              
75             F<Text::CSV::Hashify> is designed for the case where you simply want to turn a
76             CSV file into a Perl hash. In particular, it is designed for the case where:
77              
78             =over 4
79              
80             =item *
81              
82             the CSV file's first record is a list of fields in the
83             ancestral database table; and
84              
85             =item *
86              
87             one field (column) functions as a primary key, I<i.e.,> each record's entry in
88             that field is non-null and is distinct from every other record's entry
89             therein.
90              
91             =back
92              
93             F<Text::CSV::Hashify> turns that kind of CSV file into one big hash of hashes.
94              
95             F<Text::CSV::Hashify> can now take gzip-compressed (F<.gz>) files as input as
96             well as uncompressed files.
97              
98             =head2 Primary Case: CSV (with primary key) to Hash of Hashes
99              
100             Text::CSV::Hashify is designed for the case where you simply want to turn a
101             CSV file into a Perl hash. In particular, it is designed for the case where
102             (a) the CSV file's first record is a list of fields in the ancestral database
103             table and (b) one field (column) functions as a B<primary key>, I<i.e.,> each
104             record's entry in that field is non-null and is distinct from every other
105             record's entry therein.
106              
107             Text::CSV::Hashify turns that kind of CSV file into one big hash of hashes.
108             Elements of this hash are keyed on the entries in the designated primary key
109             field and the value for each element is a hash reference of all the data in a
110             particular database record (including the primary key field and its value).
111              
112             =head2 Secondary Case: CSV (lacking primary key) to Array of Hashes
113              
114             You may, however, encounter cases where a CSV file's header row contains the
115             list of database fields but no field is capable of serving as a primary key,
116             I<i.e.,> there is no field in which the entry for that field in any record is
117             guaranteed to be distinct from the entries in that field for all other
118             records.
119              
120             In this case, while an individual record can be turned into a hash,
121             the CSV file as a whole cannot accurately be turned into a hash of hashes. As
122             a fallback, Text::CSV::Hashify can, upon request, turn this into an array of
123             hashes. In this case, you will not be able to look up a particular record by
124             its primary key. You will instead have to know its index position within the
125             array (which is equivalent to knowing its record number in the original CSV
126             file minus C<1>).
127              
128             =head2 Interfaces
129              
130             Text::CSV::Hashify provides two interfaces: one functional, one
131             object-oriented.
132              
133             Use the functional interface when all you want is to turn a CSV file with a
134             primary key field into a hash of hashes.
135              
136             Use the object-oriented interface for any more sophisticated manipulation of
137             the CSV file. This includes:
138              
139             =over 4
140              
141             =item * Text::CSV options
142              
143             Access to any of the options available to Text::CSV, such as use of a
144             separator character other than a comma. B<Note:> Much of the time you will
145             not need any of the Text::CSV options. Text::CSV::Hashify is focused on
146             B<reading> CSV files, whereas Text::CSV is focused on both reading and
147             B<writing> CSV files. Some Text::CSV options, such as C<eol>, are unlikely to
148             be needed when using Text::CSV::Hashify. Hence, you should be very selective
149             in your use of Text::CSV options.
150              
151             =item * Limit number of records
152              
153             Selection of a limited number of records from the CSV file, rather than
154             slurping the whole file into your in-memory hash.
155              
156             =item * Array of hash references format
157              
158             Probably better than the default hash of hash references format when the CSV
159             file has no field able to serve as a primary key.
160              
161             =item * Metadata
162              
163             Access to the list of fields, the list of all primary key values, the values
164             in an individual record, or the value of an individual field in an individual
165             record.
166              
167             =back
168              
169             B<Note:> On the recommendation of the authors/maintainers of Text::CSV,
170             Text::CSV::Hashify will internally always set Text::CSV's C<binary =E<gt> 1>
171             option.
172              
173             =head1 FUNCTIONAL INTERFACE
174              
175             Text::CSV::Hashify by default exports one function: C<hashify()>.
176              
177             $hash_ref = hashify('/path/to/file.csv', 'primary_key');
178              
179             or
180              
181             $hash_ref = hashify('/path/to/file.csv.gz', 'primary_key');
182              
183             Function takes two arguments: path to CSV file; field in that file which
184             serves as primary key. If the path to the input file ends in F<.gz>, it is
185             assumed to be compressed by F<gzip>. If the file name ends in F<.psv> (or
186             F<.psv.gz>), the separator character is assumed to be a pipe (C<|>). If the
187             file name ends in F<.tsv> (or F<.tsv.gz>), the separator character is assumed
188             to be a tab (C< >). Otherwise, the separator character will be assumed to be
189             a comma (C<,>).
190              
191             Returns a reference to a hash of hash references.
192              
193             =cut
194              
195             sub hashify {
196 10 100   10 0 24996 croak "'hashify()' must have two arguments"
197             unless @_ == 2;
198 9         28 my @args = @_;
199 9         34 for (my $i=0;$i<=$#args;$i++) {
200 18 100       156 croak "'hashify()' argument at index '$i' not true" unless $args[$i];
201             }
202 8         34 my %obj_args = (
203             file => $args[0],
204             key => $args[1],
205             );
206             $obj_args{sep_char} =
207             ($obj_args{file} =~ m/\.psv(\.gz)?$/)
208             ? '|'
209 8 100       94 : ($obj_args{file} =~ m/\.tsv(\.gz)?$/)
    100          
210             ? "\t"
211             : ',';
212 8         42 my $obj = Text::CSV::Hashify->new( \%obj_args );
213 8         38 return $obj->all();
214             }
215              
216             =head1 OBJECT-ORIENTED INTERFACE
217              
218             =head2 C<new()>
219              
220             =over 4
221              
222             =item * Purpose
223              
224             Text::CSV::Hashify constructor.
225              
226             =item * Arguments
227              
228             $obj = Text::CSV::Hashify->new( {
229             file => '/path/to/file.csv',
230             format => 'hoh', # hash of hashes, which is default
231             key => 'id', # needed except when format is 'aoh'
232             max_rows => 20, # number of records to read; defaults to all
233             ... # other key-value pairs as appropriate from Text::CSV
234             } );
235              
236             Single hash reference. Required element is:
237              
238             =over 4
239              
240             =item * C<file>
241              
242             String: path to CSV file serving as input. If the path to the input file ends
243             in F<.gz>, it is assumed to be compressed by F<gzip>.
244              
245             =back
246              
247             Element usually needed:
248              
249             =over 4
250              
251             =item * C<key>
252              
253             String: name of field in CSV file serving as unique key. Needed except when
254             optional element C<format> is C<aoh>.
255              
256             =back
257              
258             Optional elements are:
259              
260             =over 4
261              
262             =item * C<format>
263              
264             String: possible values are C<hoh> and C<aoh>. Defaults to C<hoh> (hash of
265             hashes). C<new()> will fail if the same value is encountered in more than one
266             record's entry in the C<key> column. So if you know in advance that your data
267             cannot meet this condition, explicitly select C<format =E<gt> aoh>.
268              
269             =item * C<max_rows>
270              
271             Number: provide this if you do not wish to populate the hash with all data
272             records from the CSV file. (Will have no effect if the number provided is
273             greater than or equal to the number of data records in the CSV file.)
274              
275             =item * Any option available to Text::CSV
276              
277             See documentation for either Text::CSV or Text::CSV_XS, but see discussion of
278             "Text::CSV options" above.
279              
280             =back
281              
282             =item * Return Value
283              
284             Text::CSV::Hashify object.
285              
286             =item * Comment
287              
288             =back
289              
290             =cut
291              
292             sub new {
293 37     37 1 30846 my ($class, $args) = @_;
294 37         133 my %data;
295              
296 37 100 100     624 croak "Argument to 'new()' must be hashref"
297             unless (ref($args) and reftype($args) eq 'HASH');
298 35 100       206 croak "Argument to 'new()' must have 'file' element" unless $args->{file};
299             croak "Cannot locate file '$args->{file}'"
300 34 100       900 unless (-f $args->{file});
301 33         232 $data{file} = delete $args->{file};
302              
303 33 100 100     157 if ($args->{format} and ($args->{format} !~ m/^(?:h|a)oh$/i) ) {
304 1         125 croak "Entry '$args->{format}' for format is invalid'";
305             }
306 32   100     138 $data{format} = delete $args->{format} || 'hoh';
307              
308 32 100       80 if (exists $args->{key}) {
309             croak "Value for 'key' must be non-empty string"
310 29 100 100     419 unless defined $args->{key} and length($args->{key});
311             }
312              
313 30 100 100     97 if (! exists $args->{key} and $data{format} ne 'aoh') {
314 1         140 croak "Argument to 'new()' must have 'key' element unless 'format' element is 'aoh'";
315             }
316              
317 29         67 $data{key} = delete $args->{key};
318              
319 29 100       77 if (defined($args->{max_rows})) {
320 8 100       119 if ($args->{max_rows} !~ m/^[0-9]+$/) {
321 3         330 croak "'max_rows' option, if defined, must be numeric";
322             }
323             else {
324 5         20 $data{max_rows} = delete $args->{max_rows};
325             }
326             }
327             # We've now handled all the Text::CSV::Hashify::new-specific options.
328             # Any remaining options are assumed to be intended for Text::CSV::new().
329              
330 26         60 $args->{binary} = 1;
331 26 50       172 my $csv = Text::CSV->new ( $args )
332             or croak "Cannot use CSV: ".Text::CSV->error_diag ();
333 26         3836 my $IN;
334 26 100       117 if ($data{file} =~ m/\.gz$/) {
335 8         44 $IN = IO::Zlib->new($data{file}, "rb");
336             }
337             else {
338 18         106 $IN = IO::File->new($data{file}, "r");
339             }
340 26 50       30012 croak "Unable to open '$data{file}' for reading"
341             unless defined $IN;
342 26         948 my $header_ref = $csv->getline($IN);
343 26         2429 my %header_fields_seen;
344 26         53 for (@{$header_ref}) {
  26         72  
345 522 100       850 if (exists $header_fields_seen{$_}) {
346 1         132 croak "Duplicate field '$_' observed in '$data{file}'";
347             }
348             else {
349 521         912 $header_fields_seen{$_}++;
350             }
351             }
352 25 100       105 if ($data{format} eq 'hoh') {
353             croak "Key '$data{key}' not found in header row"
354 23 100       204 unless $header_fields_seen{$data{key}};
355             }
356              
357 24         62 $data{fields} = $header_ref;
358 24         38 $csv->column_names(@{$header_ref});
  24         144  
359              
360             # 'hoh format
361 24         2397 my %keys_seen;
362 24         66 my @keys_list = ();
363 24         50 my %parsed_data;
364             # 'aoh' format
365             my @parsed_data;
366              
367 24         96 PARSE_FILE: while (my $record = $csv->getline_hr($IN)) {
368 4239 100       834433 if ($data{format} eq 'hoh') {
369 4229         9338 my $kk = $record->{$data{key}};
370 4229 100       9201 if ($keys_seen{$kk}) {
371 1         128 croak "Key '$kk' already seen";
372             }
373             else {
374 4228         10070 $keys_seen{$kk}++;
375 4228         8247 push @keys_list, $kk;
376 4228         8294 $parsed_data{$kk} = $record;
377             last PARSE_FILE if (
378             defined $data{max_rows} and
379             scalar(keys %parsed_data) == $data{max_rows}
380 4228 100 100     18620 );
381             }
382             }
383             else { # format: 'aoh'
384 10         18 push @parsed_data, $record;
385             last PARSE_FILE if (
386             defined $data{max_rows} and
387             scalar(@parsed_data) == $data{max_rows}
388 10 100 100     44 );
389             }
390             }
391 23 50       2322 $IN->close or croak "Unable to close $data{file} after reading";
392 23 100       2032 $data{all} = ($data{format} eq 'aoh') ? \@parsed_data : \%parsed_data;
393 23 100       105 $data{keys} = \@keys_list if $data{format} eq 'hoh';
394 23         70 $data{csv} = $csv;
395 23         51 while (my ($k,$v) = each %{$args}) {
  60         222  
396 37         120 $data{$k} = $v;
397             }
398 23         13785 return bless \%data, $class;
399             }
400              
401             =head2 C<all()>
402              
403             =over 4
404              
405             =item * Purpose
406              
407             Get a representation of all data found in a CSV input file.
408              
409             =item * Arguments
410              
411             $hash_ref = $obj->all; # when format is default or 'hoh'
412             $array_ref = $obj->all; # when format is 'aoh'
413              
414             =item * Return Value
415              
416             Reference representing all data records in the CSV input file. In the default
417             case, or if you have specifically requested C<format =E<gt> 'hoh'>, the return
418             value is a hash reference. When you have requested C<format =E<gt> 'aoh'>, the
419             return value is an array reference.
420              
421             =item * Comment
422              
423             In the default (C<hoh>) case, the return value is equivalent to that of
424             C<hashify()>.
425              
426             =back
427              
428             =cut
429              
430             sub all {
431 12     12 1 2710 my ($self) = @_;
432 12         444 return $self->{all};
433             }
434              
435             =head2 C<fields()>
436              
437             =over 4
438              
439             =item * Purpose
440              
441             Get a list of the fields in the CSV source.
442              
443             =item * Arguments
444              
445             $fields_ref = $obj->fields;
446              
447             =item * Return Value
448              
449             Array reference.
450              
451             =item * Comment
452              
453             If any field names are duplicate, you will not get this far, as C<new()> would
454             have died.
455              
456             =back
457              
458             =cut
459              
460             sub fields {
461 3     3 1 1709 my ($self) = @_;
462 3         9 return $self->{fields};
463             }
464              
465             =head2 C<record()>
466              
467             =over 4
468              
469             =item * Purpose
470              
471             Get a hash representing one record in the CSV input file.
472              
473             =item * Arguments
474              
475             $record_ref = $obj->record('value_of_key');
476              
477             One argument. In the default case (C<format =E<gt> 'hoh'>), this argument is the value in the record in the column serving as unique key.
478              
479             In the C<format =E<gt> 'aoh'> case, this will be the index position of the data record
480             in the array. (The header row is not stored in the array, so the first data record is at index C<0>.)
481              
482             =item * Return Value
483              
484             Hash reference.
485              
486             =back
487              
488             =cut
489              
490             sub record {
491 15     15 1 10913 my ($self, $key) = @_;
492 15 100 100     756 croak "Argument to 'record()' either not defined or non-empty"
493             unless (defined $key and $key ne '');
494             ($self->{format} eq 'aoh')
495             ? return $self->{all}->[$key]
496 9 100       45 : return $self->{all}->{$key};
497             }
498              
499             =head2 C<datum()>
500              
501             =over 4
502              
503             =item * Purpose
504              
505             Get value of one field in one record.
506              
507             =item * Arguments
508              
509             $datum = $obj->datum('value_of_key', 'field');
510              
511             List of two arguments: the value in the record in the column serving as unique
512             key; the name of the field.
513              
514             =item * Return Value
515              
516             Scalar.
517              
518             =back
519              
520             =cut
521              
522             sub datum {
523 14     14 1 8195 my ($self, @args) = @_;
524 14 100       317 croak "'datum()' needs two arguments" unless @args == 2;
525 11         33 for (my $i=0;$i<=$#args;$i++) {
526 19 100 100     573 croak "Argument to 'datum()' at index '$i' either not defined or non-empty"
527             unless ((defined($args[$i])) and ($args[$i] ne ''));
528             }
529             ($self->{format} eq 'aoh')
530             ? return $self->{all}->[$args[0]]->{$args[1]}
531 5 100       29 : return $self->{all}->{$args[0]}->{$args[1]};
532             }
533              
534             =head2 C<keys()>
535              
536             =over 4
537              
538             =item * Purpose
539              
540             Get a list of all unique keys found in the input file.
541              
542             =item * Arguments
543              
544             $keys_ref = $obj->keys;
545              
546             =item * Return Value
547              
548             Array reference.
549              
550             =item * Comment
551              
552             If you have selected C<format =E<gt> 'aoh'> in the options to C<new()>, the
553             C<keys()> method is inappropriate and will cause your program to die.
554              
555             =back
556              
557             =cut
558              
559             sub keys {
560 4     4 1 3274 my ($self) = @_;
561 4 100       19 if (exists $self->{keys}) {
562 3         13 return $self->{keys};
563             }
564             else {
565 1         86 croak "'keys()' method not appropriate when 'format' is 'aoh'";
566             }
567             }
568              
569             =head1 AUTHOR
570              
571             James E Keenan
572             CPAN ID: jkeenan
573             jkeenan@cpan.org
574             http://thenceforward.net/perl/modules/Text-CSV-Hashify
575              
576             =head1 COPYRIGHT
577              
578             This program is free software; you can redistribute
579             it and/or modify it under the same terms as Perl itself.
580              
581             The full text of the license can be found in the
582             LICENSE file included with this module.
583              
584             Copyright 2012-2018, James E Keenan. All rights reserved.
585              
586             =head1 BUGS
587              
588             There are no bug reports outstanding on Text::CSV::Hashify as of the most recent
589             CPAN upload date of this distribution.
590              
591             =head1 SUPPORT
592              
593             To report any bugs or make any feature requests, please send mail to
594             C<bug-Text-CSV-Hashify@rt.cpan.org> or use the web interface at
595             L<http://rt.cpan.org>.
596              
597             =head1 ACKNOWLEDGEMENTS
598              
599             Thanks to Christine Shieh for serving as the alpha consumer of this
600             library's output.
601              
602             =head1 OTHER CPAN DISTRIBUTIONS
603              
604             =head2 Text-CSV and Text-CSV_XS
605              
606             These distributions underlie Text-CSV-Hashify and provide all of its
607             file-parsing functionality. Where possible, install both. That will enable
608             you to process a file with a single, shared interface but have access to the
609             faster processing speeds of XS where available.
610              
611             =head2 Text-CSV-Slurp
612              
613             Like Text-CSV-Hashify, Text-CSV-Slurp slurps an entire CSV file into memory,
614             but stores it as an array of hashes instead.
615              
616             =head2 Text-CSV-Auto
617              
618             This distribution inspired the C<max_rows> option to C<new()>.
619              
620             =cut
621              
622             1;
623