File Coverage

blib/lib/Parse/CSV.pm
Criterion Covered Total %
statement 89 102 87.2
branch 39 52 75.0
condition 13 21 61.9
subroutine 15 16 93.7
pod 10 10 100.0
total 166 201 82.5


line stmt bran cond sub pod time code
1 3     3   169129 use strict;
  3         24  
  3         178  
2             package Parse::CSV;
3             $Parse::CSV::VERSION = '2.06';
4             =pod
5              
6             =head1 NAME
7              
8             Parse::CSV - Highly flexible CSV parser for large files
9              
10             =head1 VERSION
11              
12             version 2.06
13              
14             =head1 SYNOPSIS
15              
16             # Simple headerless comma-separated column parser
17             my $simple = Parse::CSV->new(
18             file => 'file.csv',
19             );
20              
21             while ( my $array_ref = $simple->fetch ) {
22             # Do something...
23             }
24              
25             ... or a more complex example...
26              
27             # Parse a colon-separated variables file from a handle as a hash
28             # based on headers from the first line.
29             # Then filter, so we emit objects rather than the plain hash.
30             my $objects = Parse::CSV->new(
31             handle => $io_handle,
32             sep_char => ';',
33             names => 1,
34             filter => sub { My::Object->new( $_ ) },
35             );
36              
37             while ( my $object = $objects->fetch ) {
38             $object->do_something;
39             }
40              
41             =head1 DESCRIPTION
42              
43             Surely the CPAN doesn't need yet another CSV parsing module.
44              
45             L<Text::CSV_XS> is the standard parser for CSV files. It is fast as hell,
46             but unfortunately it can be a bit verbose to use.
47              
48             A number of other modules have attempted to put usability wrappers around
49             this venerable module, but they have all focused on parsing the entire
50             file into memory at once.
51              
52             This method is fine unless your CSV files start to get large. Once that
53             happens, the only existing option is to fall back on the relatively slow
54             and heavyweight L module.
55              
56             L<Parse::CSV> fills this functionality gap. It provides a flexible
57             and light-weight streaming parser for large, extremely large, or
58             arbitrarily large CSV files.
59              
60             =head2 Main Features
61              
62             B - All parsing a line at a time.
63              
64             B - Parsing can be done in simple array mode, returning
65             a reference to an array if the columns are not named.
66              
67             B - Parsing can be done in hash mode, putting the data into
68             a hash and returning a reference to it.
69              
70             B - All items returned can be passed through a
71             custom filter. This filter can either modify the data on the fly,
72             or drop records you don't need.
73              
74             =head2 Writing Filters
75              
76             A L<Parse::CSV> filter is a subroutine reference that is passed the
77             original record as C<$_> (not as a function argument), and should
78             C<return> the alternative or modified record.
79              
80             A no-op filter (does not modify or drop any records) would look like the
81             following.
82              
83             sub { $_ }
84              
85             A filter that reversed the order of the columns (assuming the parser
86             is in array mode) might look like the following.
87              
88             sub { [ reverse @$_ ] }
89              
90             To drop the record, return C<undef> from the filter. The
91             parser will then keep pulling and parsing new records until one
92             passes the filter.
93              
94             # Only keep records where the 'foo' field is true
95             sub { $_->{foo} ? $_ : undef }
96              
97             To signal an error, throw an exception
98              
99             sub {
100             $_->{foo} =~ /bar/ or die "Assumption failed";
101             return $_;
102             }
103              
104             Feel free to modify C<$_> as a side-effect of your filter routine -
105             this will have no effect on anything.
106              
107             =head1 METHODS
108              
109             =cut
110              
111 3     3   99 use 5.005;
  3         9  
112 3     3   15 use Carp ();
  3         6  
  3         66  
113 3     3   1625 use IO::File 1.13 ();
  3         20099  
  3         97  
114 3     3   2033 use Text::CSV_XS 0.80 ();
  3         24824  
  3         91  
115 3     3   1607 use Params::Util 1.00 ();
  3         17865  
  3         3500  
116              
117              
118             #####################################################################
119             # Constructor
120              
121             =pod
122              
123             =head2 new
124              
125             The C<new> constructor creates and initialises a new CSV parser. It
126             returns a new L<Parse::CSV> object, or throws an exception (dies) on
127             error. It accepts a number of params:
128              
129             =over 4
130              
131             =item C<file>
132              
133             =item C<handle>
134              
135             To specify the CSV data source, provide either the C<file>
136             param, which should be the name of the file to read, or the C<handle>
137             param, which should be a file handle to read instead.
138              
139             =item C<csv_attr>
140              
141             Any parameter for L<Text::CSV_XS>'s constructor can also be provided
142             to this C<new> method, and they will be passed on to it.
143             Alternatively, they can be passed as a single C<HASH> reference as the
144             C<csv_attr> param. For example:
145              
146             $parser = Parse::CSV->new(
147             file => 'file.csv',
148             csv_attr => {
149             sep_char => ';',
150             quote_char => "'",
151             },
152             );
153              
154             =item C<names>
155              
156             An optional C<names> param can be provided, which should either be an
157             array reference containing the names of the columns:
158              
159             $parser = Parse::CSV->new(
160             file => 'file.csv',
161             names => [ 'col1', 'col2', 'col3' ],
162             );
163              
164             or a true value that's not a reference, indicating that the column
165             names will be read from the first line of the input:
166              
167             $parser = Parse::CSV->new(
168             file => 'file.csv',
169             names => 1,
170             );
171              
172             If the C<names> param is provided, the parser will map each line to a
173             hash where the keys are the field names provided, and the values are the
174             values found in the CSV file.
175              
176             If the C<names> param is B<not> provided, the parser will return simple
177             array references of the columns, treating them just like all the other
178             rows in the file.
179              
180             If your CSV file has (or might have) a Byte-Order Mark (BOM),
181             you must use the C<names> functionality, because this lets us call the C<header>
182             method of C<Text::CSV_XS>, which is the only place the BOM is handled
183             in that module.
184              
185             =item C<filter>
186              
187             The optional C<filter> param will be used to filter the records if
188             provided. It should be a C<CODE> reference or any otherwise callable
189             scalar, and each value parsed (either array reference or hash reference)
190             will be available to the filter as C<$_> to be changed or converted into an object,
191             or whatever you wish. See the L</Writing Filters> section for more details.
192              
193             =back
194              
195             =cut
196              
197             sub new {
198 9     9 1 9428 my $class = shift;
199 9         48 my $self = bless {
200             @_,
201             row => 0,
202             errstr => '',
203             }, $class;
204              
205             # Do we have a file name
206 9 100       39 if ( exists $self->{file} ) {
207 7 50       28 unless ( Params::Util::_STRING($self->{file}) ) {
208 0         0 Carp::croak("Parse::CSV file param is not a string");
209             }
210 7 50 33     177 unless ( -f $self->{file} and -r _ ) {
211 0         0 Carp::croak("Parse::CSV file '$self->{file}' does not exist");
212             }
213              
214 7         52 $self->{handle} = IO::File->new();
215 7 50       263 unless ( $self->{handle}->open($self->{file}) ) {
216 0         0 Carp::croak("Parse::CSV file '$self->{file}' failed to load: $!");
217             }
218             }
219              
220             # Do we have a file handle
221 9 50       340 if ( exists $self->{handle} ) {
222 9 50       40 unless ( Params::Util::_HANDLE($self->{handle}) ) {
223 0         0 Carp::croak("Parse::CSV handle param is not an IO handle");
224             }
225             } else {
226 0         0 Carp::croak("Parse::CSV not provided a file or handle param");
227             }
228              
229             # Separate the Text::CSV attributes
230 9 100       185 unless ( Params::Util::_HASH0($self->{csv_attr}) ) {
231 7         22 $self->{csv_attr} = {binary => 1}; # Suggested by Text::CSV_XS docs to always be on
232             # XXX it would be nice to not have this list hard-coded.
233 7         18 foreach ( qw{quote_char eol escape_char sep_char binary always_quote} ) {
234 42 50       82 next unless exists $self->{$_};
235 0         0 $self->{csv_attr}->{$_} = delete $self->{$_};
236             }
237             }
238              
239             # Create the parser
240 9         51 $self->{csv_xs} = Text::CSV_XS->new( $self->{csv_attr} );
241 9 50       1086 unless ( $self->{csv_xs} ) {
242 0         0 Carp::croak("Failed to create Text::CSV_XS parser");
243             }
244              
245             # Deprecated fields usage
246 9 100 66     34 if ( $self->{fields} and not $self->{names} ) {
247 3         12 $self->{names} = $self->{fields};
248             }
249              
250             # Handle automatic field names
251 9 50 66     86 if ( Params::Util::_STRING($self->{names}) and $self->{names} ) {
252             # Grab the first line
253 5         14 $self->{names} = $self->getline(header=>1);
254             }
255              
256             # Check names
257 9 50 66     78 if ( exists $self->{names} and ! Params::Util::_ARRAY($self->{names}) ) {
258 0         0 Carp::croak("Parse::CSV names param is not an array reference of strings");
259             }
260              
261             # Check filter
262 9 50 66     39 if ( exists $self->{filter} and ! Params::Util::_CODELIKE($self->{filter}) ) {
263 0         0 Carp::croak("Parse::CSV filter param is not callable");
264             }
265              
266 9         26 $self;
267             }
268              
269              
270              
271              
272              
273             #####################################################################
274             # Main Methods
275              
276             =pod
277              
278             =head2 fetch
279              
280             Once a L<Parse::CSV> object has been created, the C<fetch> method is
281             used to parse and return the next value from the CSV file.
282              
283             Returns an C<ARRAY>, C<HASH> or the output of the filter, based on the
284             configuration of the object, or C<undef> in a variety of situations.
285              
286             Returning C<undef> means either some part of the parsing and filtering
287             process has resulted in an error, B<or> that the end of file has been
288             reached.
289              
290             On receiving C<undef>, you should check the C<errstr> method. If it is an empty
291             string you have reached the end of file. Otherwise the error message will
292             be returned. Thus, the basic usage of L<Parse::CSV> will look like the
293             following.
294              
295             my $parser = Parse::CSV->new(
296             file => 'file.csv',
297             );
298             while ( my $value = $parser->fetch ) {
299             # Do something...
300             }
301             if ( $parser->errstr ) {
302             # Handle errors...
303             }
304              
305             NOTE: currently the L</fields> and L</string> methods can be used to
306             access the most recently-read row (as an array ref or a formatted
307             string) after using C<fetch>. However, this contradicts the
308             documentation for L<Text::CSV_XS>, which says those methods should be
309             "meaningless" after calling C<getline> (which C<fetch> internally
310             uses to read the input). Keeping the current behavior also incurs a
311             speed & memory penalty. Therefore, relying on L</fields> and L</string>
312             to return the current data after C<fetch> is deprecated and will
313             (probably) be removed in a future release.
314              
315             =cut
316              
317             sub fetch {
318 24     24 1 3318 my $self = shift;
319              
320             # The filter can skip rows,
321             # iterate till we get something.
322 24         62 while ( my $row = $self->getline ) {
323             # Turn the array ref into a hash if needed
324 19         29 my $rv;
325 19 100       44 if ( $self->{names} ) {
326 9         20 $rv = {};
327 9         17 @{$rv}{@{$self->{names}}} = @$row;
  9         41  
  9         19  
328             } else {
329 10         19 $rv = $row;
330             }
331              
332             # Just return for simple uses
333 19 100       78 return $rv unless $self->{filter};
334              
335             # Filter if needed
336 4         5 $rv = eval { local $_ = $rv; $self->{filter}->() };
  4         7  
  4         12  
337 4 50       29 if ( $@ ) {
338             # Handle filter errors
339 0         0 $self->{errstr} = "Filter error: $@";
340 0         0 $self->{errstr} =~ s/^(.+)at line.+$/$1/;
341 0         0 return undef;
342             }
343              
344             # Filter returns undef to drop a record
345 4 100       10 next unless defined $rv;
346              
347             # We have a good record, return it
348 3         9 return $rv;
349             }
350              
351 6         17 return undef;
352             }
353              
354             =head2 getline
355              
356             Returns the next line of the input as an array reference, without
357             performing possible conversion to a hash, and without running any
358             filters. This is the routine that C<fetch> uses internally to read
359             its input. It may be useful if you sometimes want to do filtering and
360             sometimes don't, or sometimes want to do hash conversion and sometimes
361             don't, or maybe you don't need either of those things and you just
362             want to shave all the milliseconds off that you can (but then you
363             might be better off just using C<Text::CSV_XS> directly).
364              
365             =cut
366              
367             sub getline {
368 30     30 1 44 my $self = shift;
369 30         54 my %attrs = @_;
370 30         102 $self->{errstr} = '';
371              
372             my $row = $attrs{header}
373             ? [$self->{csv_xs}->header( $self->{handle} )]
374 30 100       791 : $self->{csv_xs}->getline( $self->{handle} );
375              
376 30 100 66     20918 if (!$row && 0+$self->{csv_xs}->error_diag) {
377 6         170 my $err = "".$self->{csv_xs}->error_diag;
378             # We need to propagate errors from Text::CSV_XS, but
379             # eof is also reported as an error. So we are going to
380             # filter out it as a special case.
381 6 100 66     200 if (!eof $self->{handle} || $err !~ /^EOF/) {
382 1         5 $self->{errstr} = $err;
383             }
384             }
385              
386 30 100       83 $self->{row}++ if defined $row;
387 30         57 $self->{savedrow} = $row;
388 30         88 return $row;
389             }
390              
391             =pod
392              
393             =head2 row
394              
395             The C<row> method returns the line number of the most-recently-read row of the CSV file.
396              
397             This is a one-based count, so when you first create the parser,
398             the value of C<row> will be zero (unless you are using
399             C<names> on automatic in which case it will be 1).
400              
401             =cut
402              
403             sub row {
404 23     23 1 6964 $_[0]->{row};
405             }
406              
407             =pod
408              
409             =head2 combine
410              
411             $status = $csv->combine(@columns);
412              
413             The C<combine> method is provided as a convenience, and is passed through
414             to the underlying L<Text::CSV_XS> object.
415              
416             =cut
417              
418             sub combine {
419 4     4 1 18 shift->{csv_xs}->combine(@_);
420             }
421              
422             =pod
423              
424             =head2 string
425              
426             $line = $csv->string;
427              
428             The C<string> method is provided as a convenience, and is passed through
429             to the underlying L<Text::CSV_XS> object.
430              
431             NOTE: relying on L</string> to return the current data after C<fetch>
432             is deprecated and will (probably) be removed in a future release.
433             Only rely on its value after C<combine>. See similar warnings in
434             L</fetch> and L</fields>.
435              
436             =cut
437              
438             sub string {
439 2     2 1 5 my $self = shift;
440 2 50       8 if ($self->{savedrow}) {
441 2         3 $self->combine(@{$self->{savedrow}});
  2         7  
442 2         33 delete $self->{savedrow};
443             }
444 2         7 $self->{csv_xs}->string;
445             }
446              
447             =pod
448              
449             =head2 print
450              
451             $status = $csv->print($io, $columns);
452              
453             The C<print> method is provided as a convenience, and is passed through
454             to the underlying L<Text::CSV_XS> object.
455              
456             =cut
457              
458             sub print {
459 0     0 1 0 shift->{csv_xs}->print(@_);
460             }
461              
462             =pod
463              
464             =head2 fields
465              
466             @fields = $csv->fields;
467              
468             The C<fields> method is provided as a convenience, and is passed through
469             to the underlying L<Text::CSV_XS> object. It shows the actual row as an array.
470              
471             NOTE: relying on L</fields> to return the current data after C<fetch>
472             is deprecated and will (probably) be removed in a future release.
473             Only rely on its value after C<combine>. See similar warnings in
474             L</fetch> and L</string>.
475              
476             =cut
477              
478             sub fields {
479 4     4 1 1084 my $self = shift;
480 4 100       13 if ($self->{savedrow}) {
481 2         4 $self->combine(@{$self->{savedrow}});
  2         9  
482 2         38 delete $self->{savedrow};
483             }
484 4         13 $self->{csv_xs}->fields;
485             }
486              
487             =pod
488              
489             =head2 names
490              
491             # Get the current column names in use
492             my @names = $csv->names;
493              
494             # Change the column names on the fly mid stream
495             $csv->names( 'fn1', 'fn2' );
496              
497             The C<names> method gets or sets the column name mapping for the parser.
498              
499             If the parser has no names or fields, returns the null list.
500              
501             =cut
502              
503             sub names {
504 6     6 1 12 my $self = shift;
505 6         10 my $names = $self->{names};
506 6 100       18 if ( $names ) {
507 5 100       20 @$names = @_ if @_;
508 5         40 return @$names;
509             }
510 1 50       3 $self->{names} = [ @_ ] if @_;
511 1         5 return @_;
512             }
513              
514             =pod
515              
516             =head2 errstr
517              
518             On error, the C<errstr> method returns the error that occurred.
519              
520             If the last action was NOT an error, returns the null string C<''>.
521              
522             =cut
523              
524             sub errstr {
525 28     28 1 2940 $_[0]->{errstr};
526             }
527              
528             1;
529              
530             =pod
531              
532             =head1 SUPPORT
533              
534             Bugs should always be reported via the CPAN bug tracker at
535              
536             L
537              
538             For other issues, or commercial enhancement or support, contact the author.
539              
540             =head1 AUTHORS
541              
542             Adam Kennedy E<lt>adamk@cpan.orgE<gt>
543              
544             =head1 CONTRIBUTORS
545              
546             Uwe Sarnowski E<lt>uwes@cpan.orgE<gt>
547              
548             Ken Williams E<lt>kwilliams@cpan.orgE<gt>
549              
550             =head1 SEE ALSO
551              
552             L, L
553              
554             =head1 COPYRIGHT
555              
556             Copyright 2006 - 2012 Adam Kennedy.
557              
558             This program is free software; you can redistribute
559             it and/or modify it under the same terms as Perl itself.
560              
561             The full text of the license can be found in the
562             LICENSE file included with this module.
563              
564             =cut