File Coverage

blib/lib/Text/CSV/Hashify.pm
Criterion Covered Total %
statement 111 111 100.0
branch 53 56 94.6
condition 23 23 100.0
subroutine 18 18 100.0
pod 6 7 85.7
total 211 215 98.1


line stmt bran cond sub pod time code
1             package Text::CSV::Hashify;
2 6     6   4631 use strict;
  6         33  
  6         126  
3 6     6   105 use 5.8.0;
  6         18  
4 6     6   21 use Carp;
  6         9  
  6         223  
5 6     6   2273 use IO::File;
  6         40959  
  6         559  
6 6     6   2624 use IO::Zlib;
  6         297826  
  6         36  
7 6     6   283 use Scalar::Util qw( reftype looks_like_number );
  6         10  
  6         254  
8 6     6   3427 use Text::CSV;
  6         63433  
  6         233  
9 6     6   2095 use open qw( :encoding(UTF-8) :std );
  6         5473  
  6         43  
10              
11             BEGIN {
12 6     6   86629 use Exporter ();
  6         12  
  6         107  
13 6     6   24 use vars qw($VERSION @ISA @EXPORT);
  6         21  
  6         334  
14 6     6   18 $VERSION = '0.10';
15 6         114 @ISA = qw(Exporter);
16 6         6963 @EXPORT = qw( hashify );
17             }
18              
19             =head1 NAME
20              
21             Text::CSV::Hashify - Turn a CSV file into a Perl hash
22              
23             =head1 VERSION
24              
25             This document refers to version 0.10 of Text::CSV::Hashify. This version was
26             released January 21 2018.
27              
28             =head1 SYNOPSIS
29              
30             # Simple functional interface
31             use Text::CSV::Hashify;
32             $hash_ref = hashify('/path/to/file.csv', 'primary_key');
33              
34             # Object-oriented interface
35             use Text::CSV::Hashify;
36             $obj = Text::CSV::Hashify->new( {
37             file => '/path/to/file.csv',
38             format => 'hoh', # hash of hashes, which is default
39             key => 'id', # needed except when format is 'aoh'
40             max_rows => 20, # number of records to read; defaults to all
41             ... # other key-value pairs possible for Text::CSV
42             } );
43              
44             # all records requested
45             $hash_ref = $obj->all;
46              
47             # arrayref of fields input
48             $fields_ref = $obj->fields;
49              
50             # hashref of specified record
51             $record_ref = $obj->record('value_of_key');
52              
53             # value of one field in one record
54             $datum = $obj->datum('value_of_key', 'field');
55              
56             # arrayref of all unique keys seen
57             $keys_ref = $obj->keys;
58              
59             =head1 DESCRIPTION
60              
61             The Comma-Separated-Value ('CSV') format is the most common way to store
62             spreadsheets or the output of relational database queries in plain-text
63             format. However, since commas (or other designated field-separator
64             characters) may be embedded within data entries, the parsing of delimited
65             records is non-trivial. Fortunately, in Perl this parsing is well handled by
66             CPAN distribution L<Text::CSV>. This
67             permits us to address more specific data manipulation problems by building
68             modules on top of F<Text::CSV>.
69              
70             B<Note:> In this document we will use I<CSV> as a catch-all for tab-delimited
71             files, pipe-delimited files, and so forth. Please refer to the documentation
72             for Text::CSV to learn how to handle field separator characters other than the
73             comma.
74              
75             F<Text::CSV::Hashify> is designed for the case where you simply want to turn a
76             CSV file into a Perl hash. In particular, it is designed for the case where:
77              
78             =over 4
79              
80             =item *
81              
82             the CSV file's first record is a list of fields in the
83             ancestral database table; and
84              
85             =item *
86              
87             one field (column) functions as a primary key, I<i.e.,> each record's entry in
88             that field is non-null and is distinct from every other record's entry
89             therein.
90              
91             =back
92              
93             F<Text::CSV::Hashify> turns that kind of CSV file into one big hash of hashes.
94              
95             F<Text::CSV::Hashify> can now take gzip-compressed (F<.gz>) files as input as
96             well as uncompressed files.
97              
98             =head2 Primary Case: CSV (with primary key) to Hash of Hashes
99              
100             Text::CSV::Hashify is designed for the case where you simply want to turn a
101             CSV file into a Perl hash. In particular, it is designed for the case where
102             (a) the CSV file's first record is a list of fields in the ancestral database
103             table and (b) one field (column) functions as a B<primary key>, I<i.e.,> each
104             record's entry in that field is non-null and is distinct from every other
105             record's entry therein.
106              
107             Text::CSV::Hashify turns that kind of CSV file into one big hash of hashes.
108             Elements of this hash are keyed on the entries in the designated primary key
109             field and the value for each element is a hash reference of all the data in a
110             particular database record (including the primary key field and its value).
111              
112             =head2 Secondary Case: CSV (lacking primary key) to Array of Hashes
113              
114             You may, however, encounter cases where a CSV file's header row contains the
115             list of database fields but no field is capable of serving as a primary key,
116             I<i.e.,> there is no field in which the entry for that field in any record is
117             guaranteed to be distinct from the entries in that field for all other
118             records.
119              
120             In this case, while an individual record can be turned into a hash,
121             the CSV file as a whole cannot accurately be turned into a hash of hashes. As
122             a fallback, Text::CSV::Hashify can, upon request, turn this into an array of
123             hashes. In this case, you will not be able to look up a particular record by
124             its primary key. You will instead have to know its index position within the
125             array (which is equivalent to knowing its record number in the original CSV
126             file minus C<1>).
127              
128             =head2 Interfaces
129              
130             Text::CSV::Hashify provides two interfaces: one functional, one
131             object-oriented.
132              
133             Use the functional interface when all you want is to turn a CSV file with a
134             primary key field into a hash of hashes.
135              
136             Use the object-oriented interface for any more sophisticated manipulation of
137             the CSV file. This includes:
138              
139             =over 4
140              
141             =item * Text::CSV options
142              
143             Access to any of the options available to Text::CSV, such as use of a
144             separator character other than a comma.
145              
146             =item * Limit number of records
147              
148             Selection of a limited number of records from the CSV file, rather than
149             slurping the whole file into your in-memory hash.
150              
151             =item * Array of hash references format
152              
153             Probably better than the default hash of hash references format when the CSV
154             file has no field able to serve as a primary key.
155              
156             =item * Metadata
157              
158             Access to the list of fields, the list of all primary key values, the values
159             in an individual record, or the value of an individual field in an individual
160             record.
161              
162             =back
163              
164             B<Note:> On the recommendation of the authors/maintainers of Text::CSV,
165             Text::CSV::Hashify will internally always set Text::CSV's C<< binary => 1 >>
166             option.
167              
168             =head1 FUNCTIONAL INTERFACE
169              
170             Text::CSV::Hashify by default exports one function: C<hashify()>.
171              
172             $hash_ref = hashify('/path/to/file.csv', 'primary_key');
173              
174             or
175              
176             $hash_ref = hashify('/path/to/file.csv.gz', 'primary_key');
177              
178             Function takes two arguments: path to CSV file; field in that file which
179             serves as primary key. If the path to the input file ends in F<.gz>, it is
180             assumed to be compressed by F<gzip>. If the file name ends in F<.psv> (or
181             F<.psv.gz>), the separator character is assumed to be a pipe (C<|>). If the
182             file name ends in F<.tsv> (or F<.tsv.gz>), the separator character is assumed
183             to be a tab (C< >). Otherwise, the separator character will be assumed to be
184             a comma (C<,>).
185              
186             Returns a reference to a hash of hash references.
187              
188             =cut
189              
190             sub hashify {
191 10 100   10 0 20228 croak "'hashify()' must have two arguments"
192             unless @_ == 2;
193 9         21 my @args = @_;
194 9         29 for (my $i=0;$i<=$#args;$i++) {
195 18 100       140 croak "'hashify()' argument at index '$i' not true" unless $args[$i];
196             }
197 8         29 my %obj_args = (
198             file => $args[0],
199             key => $args[1],
200             );
201             $obj_args{sep_char} =
202             ($obj_args{file} =~ m/\.psv(\.gz)?$/)
203             ? '|'
204 8 100       59 : ($obj_args{file} =~ m/\.tsv(\.gz)?$/)
    100          
205             ? "\t"
206             : ',';
207 8         29 my $obj = Text::CSV::Hashify->new( \%obj_args );
208 8         25 return $obj->all();
209             }
210              
211             =head1 OBJECT-ORIENTED INTERFACE
212              
213             =head2 C<new()>
214              
215             =over 4
216              
217             =item * Purpose
218              
219             Text::CSV::Hashify constructor.
220              
221             =item * Arguments
222              
223             $obj = Text::CSV::Hashify->new( {
224             file => '/path/to/file.csv',
225             format => 'hoh', # hash of hashes, which is default
226             key => 'id', # needed except when format is 'aoh'
227             max_rows => 20, # number of records to read; defaults to all
228             ... # other key-value pairs possible for Text::CSV
229             } );
230              
231             Single hash reference. Required element is:
232              
233             =over 4
234              
235             =item * C<file>
236              
237             String: path to CSV file serving as input. If the path to the input file ends
238             in F<.gz>, it is assumed to be compressed by F<gzip>.
239              
240             =back
241              
242             Element usually needed:
243              
244             =over 4
245              
246             =item * C<key>
247              
248             String: name of field in CSV file serving as unique key. Needed except when
249             optional element C<format> is C<aoh>.
250              
251             =back
252              
253             Optional elements are:
254              
255             =over 4
256              
257             =item * C<format>
258              
259             String: possible values are C<hoh> and C<aoh>. Defaults to C<hoh> (hash of
260             hashes). C<new()> will fail if the same value is encountered in more than one
261             record's entry in the C<key> column. So if you know in advance that your data
262             cannot meet this condition, explicitly select C<< format => aoh >>.
263              
264             =item * C<max_rows>
265              
266             Number: provide this if you do not wish to populate the hash with all data
267             records from the CSV file. (Will have no effect if the number provided is
268             greater than or equal to the number of data records in the CSV file.)
269              
270             =item * Any option available to Text::CSV
271              
272             See documentation for either Text::CSV or Text::CSV_XS.
273              
274             =back
275              
276             =item * Return Value
277              
278             Text::CSV::Hashify object.
279              
280             =item * Comment
281              
282             =back
283              
284             =cut
285              
286             sub new {
287 34     34 1 22614 my ($class, $args) = @_;
288 34         50 my %data;
289              
290 34 100 100     507 croak "Argument to 'new()' must be hashref"
291             unless (ref($args) and reftype($args) eq 'HASH');
292 32 100       311 croak "Argument to 'new()' must have 'file' element" unless $args->{file};
293             croak "Cannot locate file '$args->{file}'"
294 31 100       642 unless (-f $args->{file});
295 30         112 $data{file} = delete $args->{file};
296              
297 30 100 100     111 if ($args->{format} and ($args->{format} !~ m/^(?:h|a)oh$/i) ) {
298 1         86 croak "Entry '$args->{format}' for format is invalid'";
299             }
300 29   100     106 $data{format} = delete $args->{format} || 'hoh';
301              
302 29 100 100     86 if (! exists $args->{key} and $data{format} ne 'aoh') {
303 1         80 croak "Argument to 'new()' must have 'key' element unless 'format' element is 'aoh'";
304             }
305 28         53 $data{key} = delete $args->{key};
306              
307 28 100       60 if (defined($args->{max_rows})) {
308 8 100       100 if ($args->{max_rows} !~ m/^[0-9]+$/) {
309 3         243 croak "'max_rows' option, if defined, must be numeric";
310             }
311             else {
312 5         14 $data{max_rows} = delete $args->{max_rows};
313             }
314             }
315             # We've now handled all the Text::CSV::Hashify::new-specific options.
316             # Any remaining options are assumed to be intended for Text::CSV::new().
317              
318 25         51 $args->{binary} = 1;
319 25 50       136 my $csv = Text::CSV->new ( $args )
320             or croak "Cannot use CSV: ".Text::CSV->error_diag ();
321 25         2836 my $IN;
322 25 100       85 if ($data{file} =~ m/\.gz$/) {
323 8         36 $IN = IO::Zlib->new($data{file}, "rb");
324             }
325             else {
326 17         77 $IN = IO::File->new($data{file}, "r");
327             }
328 25 50       24066 croak "Unable to open '$data{file}' for reading"
329             unless defined $IN;
330 25         696 my $header_ref = $csv->getline($IN);
331 25         1965 my %header_fields_seen;
332 25         41 for (@{$header_ref}) {
  25         59  
333 514 100       637 if (exists $header_fields_seen{$_}) {
334 1         100 croak "Duplicate field '$_' observed in '$data{file}'";
335             }
336             else {
337 513         725 $header_fields_seen{$_}++;
338             }
339             }
340 24         51 $data{fields} = $header_ref;
341 24         33 $csv->column_names(@{$header_ref});
  24         109  
342              
343             # 'hoh format
344 24         1763 my %keys_seen;
345 24         52 my @keys_list = ();
346 24         42 my %parsed_data;
347             # 'aoh' format
348             my @parsed_data;
349              
350 24         87 PARSE_FILE: while (my $record = $csv->getline_hr($IN)) {
351 4239 100       620747 if ($data{format} eq 'hoh') {
352 4229         7523 my $kk = $record->{$data{key}};
353 4229 100       7058 if ($keys_seen{$kk}) {
354 1         87 croak "Key '$kk' already seen";
355             }
356             else {
357 4228         7191 $keys_seen{$kk}++;
358 4228         5871 push @keys_list, $kk;
359 4228         6830 $parsed_data{$kk} = $record;
360             last PARSE_FILE if (
361             defined $data{max_rows} and
362             scalar(keys %parsed_data) == $data{max_rows}
363 4228 100 100     13439 );
364             }
365             }
366             else { # format: 'aoh'
367 10         16 push @parsed_data, $record;
368             last PARSE_FILE if (
369             defined $data{max_rows} and
370             scalar(@parsed_data) == $data{max_rows}
371 10 100 100     53 );
372             }
373             }
374 23 50       1750 $IN->close or croak "Unable to close $data{file} after reading";
375 23 100       1579 $data{all} = ($data{format} eq 'aoh') ? \@parsed_data : \%parsed_data;
376 23 100       84 $data{keys} = \@keys_list if $data{format} eq 'hoh';
377 23         58 $data{csv} = $csv;
378 23         37 while (my ($k,$v) = each %{$args}) {
  60         164  
379 37         94 $data{$k} = $v;
380             }
381 23         12791 return bless \%data, $class;
382             }
383              
384             =head2 C<all()>
385              
386             =over 4
387              
388             =item * Purpose
389              
390             Get a representation of all data found in a CSV input file.
391              
392             =item * Arguments
393              
394             $hash_ref = $obj->all; # when format is default or 'hoh'
395             $array_ref = $obj->all; # when format is 'aoh'
396              
397             =item * Return Value
398              
399             Reference representing all data records in the CSV input file. In the default
400             case, or if you have specifically requested C<< format => 'hoh' >>, the return
401             value is a hash reference. When you have requested C<< format => 'aoh' >>, the
402             return value is an array reference.
403              
404             =item * Comment
405              
406             In the default (C<hoh>) case, the return value is equivalent to that of
407             C<hashify()>.
408              
409             =back
410              
411             =cut
412              
413             sub all {
414 12     12 1 2172 my ($self) = @_;
415 12         313 return $self->{all};
416             }
417              
418             =head2 C<fields()>
419              
420             =over 4
421              
422             =item * Purpose
423              
424             Get a list of the fields in the CSV source.
425              
426             =item * Arguments
427              
428             $fields_ref = $obj->fields;
429              
430             =item * Return Value
431              
432             Array reference.
433              
434             =item * Comment
435              
436             If any field names are duplicate, you will not get this far, as C<new()> would
437             have died.
438              
439             =back
440              
441             =cut
442              
443             sub fields {
444 3     3 1 1343 my ($self) = @_;
445 3         6 return $self->{fields};
446             }
447              
448             =head2 C<record()>
449              
450             =over 4
451              
452             =item * Purpose
453              
454             Get a hash representing one record in the CSV input file.
455              
456             =item * Arguments
457              
458             $record_ref = $obj->record('value_of_key');
459              
460             One argument. In the default case (C<< format => 'hoh' >>), this argument is the value in the record in the column serving as unique key.
461              
462             In the C<< format => 'aoh' >> case, this will be the index position of the data record
463             in the array. (The header row will be at index C<0>.)
464              
465             =item * Return Value
466              
467             Hash reference.
468              
469             =back
470              
471             =cut
472              
473             sub record {
474 15     15 1 8358 my ($self, $key) = @_;
475 15 100 100     664 croak "Argument to 'record()' either not defined or non-empty"
476             unless (defined $key and $key ne '');
477             ($self->{format} eq 'aoh')
478             ? return $self->{all}->[$key]
479 9 100       33 : return $self->{all}->{$key};
480             }
481              
482             =head2 C<datum()>
483              
484             =over 4
485              
486             =item * Purpose
487              
488             Get value of one field in one record.
489              
490             =item * Arguments
491              
492             $datum = $obj->datum('value_of_key', 'field');
493              
494             List of two arguments: the value in the record in the column serving as unique
495             key; the name of the field.
496              
497             =item * Return Value
498              
499             Scalar.
500              
501             =back
502              
503             =cut
504              
505             sub datum {
506 14     14 1 6672 my ($self, @args) = @_;
507 14 100       231 croak "'datum()' needs two arguments" unless @args == 2;
508 11         32 for (my $i=0;$i<=$#args;$i++) {
509 19 100 100     465 croak "Argument to 'datum()' at index '$i' either not defined or non-empty"
510             unless ((defined($args[$i])) and ($args[$i] ne ''));
511             }
512             ($self->{format} eq 'aoh')
513             ? return $self->{all}->[$args[0]]->{$args[1]}
514 5 100       27 : return $self->{all}->{$args[0]}->{$args[1]};
515             }
516              
517             =head2 C<keys()>
518              
519             =over 4
520              
521             =item * Purpose
522              
523             Get a list of all unique keys found in the input file.
524              
525             =item * Arguments
526              
527             $keys_ref = $obj->keys;
528              
529             =item * Return Value
530              
531             Array reference.
532              
533             =item * Comment
534              
535             If you have selected C<< format => 'aoh' >> in the options to C<new()>, the
536             C<keys()> method is inappropriate and will cause your program to die.
537              
538             =back
539              
540             =cut
541              
542             sub keys {
543 4     4 1 2555 my ($self) = @_;
544 4 100       16 if (exists $self->{keys}) {
545 3         9 return $self->{keys};
546             }
547             else {
548 1         68 croak "'keys()' method not appropriate when 'format' is 'aoh'";
549             }
550             }
551              
552             =head1 AUTHOR
553              
554             James E Keenan
555             CPAN ID: jkeenan
556             jkeenan@cpan.org
557             http://thenceforward.net/perl/modules/Text-CSV-Hashify
558              
559             =head1 COPYRIGHT
560              
561             This program is free software; you can redistribute
562             it and/or modify it under the same terms as Perl itself.
563              
564             The full text of the license can be found in the
565             LICENSE file included with this module.
566              
567             Copyright 2012-2018, James E Keenan. All rights reserved.
568              
569             =head1 BUGS
570              
571             There are no bug reports outstanding on Text::CSV::Hashify as of the most recent
572             CPAN upload date of this distribution.
573              
574             =head1 SUPPORT
575              
576             To report any bugs or make any feature requests, please send mail to
577             C<bug-Text-CSV-Hashify@rt.cpan.org> or use the web interface at
578             L<http://rt.cpan.org>.
579              
580             =head1 ACKNOWLEDGEMENTS
581              
582             Thanks to Christine Shieh for serving as the alpha consumer of this
583             library's output.
584              
585             =head1 OTHER CPAN DISTRIBUTIONS
586              
587             =head2 Text-CSV and Text-CSV_XS
588              
589             These distributions underlie Text-CSV-Hashify and provide all of its
590             file-parsing functionality. Where possible, install both. That will enable
591             you to process a file with a single, shared interface but have access to the
592             faster processing speeds of XS where available.
593              
594             =head2 Text-CSV-Slurp
595              
596             Like Text-CSV-Hashify, Text-CSV-Slurp slurps an entire CSV file into memory,
597             but stores it as an array of hashes instead.
598              
599             =head2 Text-CSV-Auto
600              
601             This distribution inspired the C<max_rows> option to C<new()>.
602              
603             =cut
604              
605             1;
606