File Coverage

blib/lib/Parse/CSV.pm
Criterion Covered Total %
statement 88 101 87.1
branch 37 50 74.0
condition 13 21 61.9
subroutine 15 16 93.7
pod 10 10 100.0
total 163 198 82.3


line stmt bran cond sub pod time code
1 3     3   104013 use strict;
  3         19  
  3         126  
2             package Parse::CSV;
3             $Parse::CSV::VERSION = '2.05';
4             =pod
5              
6             =head1 NAME
7              
8             Parse::CSV - Highly flexible CSV parser for large files
9              
10             =head1 VERSION
11              
12             version 2.05
13              
14             =head1 SYNOPSIS
15              
16             # Simple headerless comma-separated column parser
17             my $simple = Parse::CSV->new(
18             file => 'file.csv',
19             );
20              
21             while ( my $array_ref = $simple->fetch ) {
22             # Do something...
23             }
24              
25             ... or a more complex example...
26              
27             # Parse a semicolon-separated variables file from a handle as a hash
28             # based on headers from the first line.
29             # Then filter, so we emit objects rather than the plain hash.
30             my $objects = Parse::CSV->new(
31             handle => $io_handle,
32             sep_char => ';',
33             names => 1,
34             filter => sub { My::Object->new( $_ ) },
35             );
36              
37             while ( my $object = $objects->fetch ) {
38             $object->do_something;
39             }
40              
41             =head1 DESCRIPTION
42              
43             Surely the CPAN doesn't need yet another CSV parsing module.
44              
45             L<Text::CSV_XS> is the standard parser for CSV files. It is fast as hell,
46             but unfortunately it can be a bit verbose to use.
47              
48             A number of other modules have attempted to put usability wrappers around
49             this venerable module, but they have all focused on parsing the entire
50             file into memory at once.
51              
52             This method is fine unless your CSV files start to get large. Once that
53             happens, the only existing option is to fall back on the relatively slow
54             and heavyweight L module.
55              
56             L<Parse::CSV> fills this functionality gap. It provides a flexible
57             and light-weight streaming parser for large, extremely large, or
58             arbitrarily large CSV files.
59              
60             =head2 Main Features
61              
62             B - All parsing a line at a time.
63              
64             B - Parsing can be done in simple array mode, returning
65             a reference to an array if the columns are not named.
66              
67             B - Parsing can be done in hash mode, putting the data into
68             a hash and returning a reference to it.
69              
70             B - All items returned can be passed through a
71             custom filter. This filter can either modify the data on the fly,
72             or drop records you don't need.
73              
74             =head2 Writing Filters
75              
76             A L<Parse::CSV> filter is a subroutine reference that is passed the
77             original record as C<$_> (not as a function argument), and should
78             C<return> the alternative or modified record.
79              
80             A no-op filter (does not modify or drop any records) would look like the
81             following.
82              
83             sub { $_ }
84              
85             A filter that reversed the order of the columns (assuming the parser
86             is in array mode) might look like the following.
87              
88             sub { [ reverse @$_ ] }
89              
90             To drop the record, return C<undef> from the filter. The
91             parser will then keep pulling and parsing new records until one
92             passes the filter.
93              
94             # Only keep records where the 'foo' field is true
95             sub { $_->{foo} ? $_ : undef }
96              
97             To signal an error, throw an exception
98              
99             sub {
100             $_->{foo} =~ /bar/ or die "Assumption failed";
101             return $_;
102             }
103              
104             Feel free to modify C<$_> as a side-effect of your filter routine -
105             this will have no effect on anything.
106              
107             =head1 METHODS
108              
109             =cut
110              
111 3     3   42 use 5.005;
  3         6  
112 3     3   10 use Carp ();
  3         5  
  3         52  
113 3     3   1083 use IO::File 1.13 ();
  3         19868  
  3         77  
114 3     3   2061 use Text::CSV_XS 0.80 ();
  3         25155  
  3         79  
115 3     3   1116 use Params::Util 1.00 ();
  3         9242  
  3         2334  
116              
117              
118             #####################################################################
119             # Constructor
120              
121             =pod
122              
123             =head2 new
124              
125             The C<new> constructor creates and initialises a new CSV parser. It
126             returns a new L<Parse::CSV> object, or throws an exception (dies) on
127             error. It accepts a number of params:
128              
129             =over 4
130              
131             =item C<file>
132              
133             =item C<handle>
134              
135             To specify the CSV data source, provide either the C<file>
136             param, which should be the name of the file to read, or the C<handle>
137             param, which should be a file handle to read instead.
138              
139             =item C<csv_attr>
140              
141             Any parameter for L<Text::CSV_XS>'s constructor can also be provided
142             to this C<new> method, and they will be passed on to it.
143             Alternatively, they can be passed as a single C<HASH> reference as the
144             C<csv_attr> param. For example:
145              
146             $parser = Parse::CSV->new(
147             file => 'file.csv',
148             csv_attr => {
149             sep_char => ';',
150             quote_char => "'",
151             },
152             );
153              
154             =item C<names>
155              
156             An optional C<names> param can be provided, which should either be an
157             array reference containing the names of the columns:
158              
159             $parser = Parse::CSV->new(
160             file => 'file.csv',
161             names => [ 'col1', 'col2', 'col3' ],
162             );
163              
164             or a true value that's not a reference, indicating that the column
165             names will be read from the first line of the input:
166              
167             $parser = Parse::CSV->new(
168             file => 'file.csv',
169             names => 1,
170             );
171              
172             If the C<names> param is provided, the parser will map each line to a
173             hash where the keys are the field names provided, and the values are the
174             values found in the CSV file.
175              
176             If the C<names> param is B<not> provided, the parser will return simple
177             array references of the columns.
178              
179             =item C<filter>
180              
181             The optional C<filter> param will be used to filter the records if
182             provided. It should be a C<CODE> reference or any otherwise callable
183             scalar, and each value parsed (either array reference or hash reference)
184             will be available to the filter as C<$_> to be changed or converted into an object,
185             or whatever you wish. See the L</Writing Filters> section for more details.
186              
187             =back
188              
189             =cut
190              
191             sub new {
192 8     8 1 5755 my $class = shift;
193 8         35 my $self = bless {
194             @_,
195             row => 0,
196             errstr => '',
197             }, $class;
198              
199             # Do we have a file name
200 8 100       47 if ( exists $self->{file} ) {
201 7 50       26 unless ( Params::Util::_STRING($self->{file}) ) {
202 0         0 Carp::croak("Parse::CSV file param is not a string");
203             }
204 7 50 33     125 unless ( -f $self->{file} and -r _ ) {
205 0         0 Carp::croak("Parse::CSV file '$self->{file}' does not exist");
206             }
207              
208 7         34 $self->{handle} = IO::File->new();
209 7 50       187 unless ( $self->{handle}->open($self->{file}) ) {
210 0         0 Carp::croak("Parse::CSV file '$self->{file}' failed to load: $!");
211             }
212             }
213              
214             # Do we have a file handle
215 8 50       254 if ( exists $self->{handle} ) {
216 8 50       184 unless ( Params::Util::_HANDLE($self->{handle}) ) {
217 0         0 Carp::croak("Parse::CSV handle param is not an IO handle");
218             }
219             } else {
220 0         0 Carp::croak("Parse::CSV not provided a file or handle param");
221             }
222              
223             # Seperate the Text::CSV attributes
224 8 100       156 unless ( Params::Util::_HASH0($self->{csv_attr}) ) {
225 7         17 $self->{csv_attr} = {binary => 1}; # Suggested by Text::CSV_XS docs to always be on
226             # XXX it would be nice to not have this list hard-coded.
227 7         15 foreach ( qw{quote_char eol escape_char sep_char binary always_quote} ) {
228 42 50       58 next unless exists $self->{$_};
229 0         0 $self->{csv_attr}->{$_} = delete $self->{$_};
230             }
231             }
232              
233             # Create the parser
234 8         35 $self->{csv_xs} = Text::CSV_XS->new( $self->{csv_attr} );
235 8 50       647 unless ( $self->{csv_xs} ) {
236 0         0 Carp::croak("Failed to create Text::CSV_XS parser");
237             }
238              
239             # Deprecated fields usage
240 8 100 66     23 if ( $self->{fields} and not $self->{names} ) {
241 3         6 $self->{names} = $self->{fields};
242             }
243              
244             # Handle automatic field names
245 8 50 66     30 if ( Params::Util::_STRING($self->{names}) and $self->{names} ) {
246             # Grab the first line
247 4         7 $self->{names} = $self->getline;
248             }
249              
250             # Check names
251 8 50 66     52 if ( exists $self->{names} and ! Params::Util::_ARRAY($self->{names}) ) {
252 0         0 Carp::croak("Parse::CSV names param is not an array reference of strings");
253             }
254              
255             # Check filter
256 8 50 66     23 if ( exists $self->{filter} and ! Params::Util::_CODELIKE($self->{filter}) ) {
257 0         0 Carp::croak("Parse::CSV filter param is not callable");
258             }
259              
260 8         18 $self;
261             }
262              
263              
264              
265              
266              
267             #####################################################################
268             # Main Methods
269              
270             =pod
271              
272             =head2 fetch
273              
274             Once a L<Parse::CSV> object has been created, the C<fetch> method is
275             used to parse and return the next value from the CSV file.
276              
277             Returns an C<ARRAY>, C<HASH> or the output of the filter, based on the
278             configuration of the object, or C<undef> in a variety of situations.
279              
280             Returning C<undef> means either some part of the parsing and filtering
281             process has resulted in an error, B<or> that the end of file has been
282             reached.
283              
284             On receiving C<undef>, you should check the C<errstr> method. If it is an empty
285             string you have reached the end of file. Otherwise the error message will
286             be returned. Thus, the basic usage of L<Parse::CSV> will look like the
287             following.
288              
289             my $parser = Parse::CSV->new(
290             file => 'file.csv',
291             );
292             while ( my $value = $parser->fetch ) {
293             # Do something...
294             }
295             if ( $parser->errstr ) {
296             # Handle errors...
297             }
298              
299             NOTE: currently the L</fields> and L</string> methods can be used to
300             access the most recently-read row (as an array ref or a formatted
301             string) after using C<fetch>. However, this contradicts the
302             documentation for L<Text::CSV_XS>, which says those methods should be
303             "meaningless" after calling C<getline> (which C<fetch> internally
304             uses to read the input). Keeping the current behavior also incurs a
305             speed & memory penalty. Therefore, relying on L</fields> and L</string>
306             to return the current data after C<fetch> is deprecated and will
307             (probably) be removed in a future release.
308              
309             =cut
310              
311             sub fetch {
312 23     23 1 2352 my $self = shift;
313              
314             # The filter can skip rows,
315             # iterate till we get something.
316 23         40 while ( my $row = $self->getline ) {
317             # Turn the array ref into a hash if needed
318 18         22 my $rv;
319 18 100       32 if ( $self->{names} ) {
320 8         14 $rv = {};
321 8         10 @{$rv}{@{$self->{names}}} = @$row;
  8         26  
  8         9  
322             } else {
323 10         13 $rv = $row;
324             }
325              
326             # Just return for simple uses
327 18 100       49 return $rv unless $self->{filter};
328              
329             # Filter if needed
330 4         4 $rv = eval { local $_ = $rv; $self->{filter}->() };
  4         5  
  4         9  
331 4 50       19 if ( $@ ) {
332             # Handle filter errors
333 0         0 $self->{errstr} = "Filter error: $@";
334 0         0 $self->{errstr} =~ s/^(.+)at line.+$/$1/;
335 0         0 return undef;
336             }
337              
338             # Filter returns undef to drop a record
339 4 100       9 next unless defined $rv;
340              
341             # We have a good record, return it
342 3         6 return $rv;
343             }
344              
345 6         12 return undef;
346             }
347              
348             =head2 getline
349              
350             Returns the next line of the input as an array reference, without
351             performing possible conversion to a hash, and without running any
352             filters. This is the routine that C<fetch> uses internally to read
353             its input. It may be useful if you sometimes want to do filtering and
354             sometimes don't, or sometimes want to do hash conversion and sometimes
355             don't, or maybe you don't need either of those things and you just
356             want to shave all the milliseconds off that you can (but then you
357             might be better off just using C<Text::CSV_XS> directly).
358              
359             =cut
360              
361             sub getline {
362 28     28 1 30 my $self = shift;
363 28         39 $self->{errstr} = '';
364              
365 28         665 my $row = $self->{csv_xs}->getline( $self->{handle} );
366              
367 28 100 66     1043 if (!$row && 0+$self->{csv_xs}->error_diag) {
368 6         136 my $err = "".$self->{csv_xs}->error_diag;
369             # We need to propagate errors from Text::CSV_XS, but
370             # eof is also reported as an error. So we are going to
371             # filter out it as a special case.
372 6 100 66     149 if (!eof $self->{handle} || $err !~ /^EOF/) {
373 1         3 $self->{errstr} = $err;
374             }
375             }
376              
377 28 100       60 $self->{row}++ if defined $row;
378 28         41 $self->{savedrow} = $row;
379 28         64 return $row;
380             }
381              
382             =pod
383              
384             =head2 row
385              
386             The C<row> method returns the line number of the most-recently-read row of the CSV file.
387              
388             This is a one-based count, so when you first create the parser,
389             the value of C<row> will be zero (unless you are using
390             C<names> on automatic in which case it will be 1).
391              
392             =cut
393              
394             sub row {
395 23     23 1 5336 $_[0]->{row};
396             }
397              
398             =pod
399              
400             =head2 combine
401              
402             $status = $csv->combine(@columns);
403              
404             The C<combine> method is provided as a convenience, and is passed through
405             to the underlying L<Text::CSV_XS> object.
406              
407             =cut
408              
409             sub combine {
410 4     4 1 16 shift->{csv_xs}->combine(@_);
411             }
412              
413             =pod
414              
415             =head2 string
416              
417             $line = $csv->string;
418              
419             The C<string> method is provided as a convenience, and is passed through
420             to the underlying L<Text::CSV_XS> object.
421              
422             NOTE: relying on L</string> to return the current data after C<fetch>
423             is deprecated and will (probably) be removed in a future release.
424             Only rely on its value after C<combine>. See similar warnings in
425             L</fetch> and L</fields>.
426              
427             =cut
428              
429             sub string {
430 2     2 1 3 my $self = shift;
431 2 50       5 if ($self->{savedrow}) {
432 2         3 $self->combine(@{$self->{savedrow}});
  2         6  
433 2         33 delete $self->{savedrow};
434             }
435 2         7 $self->{csv_xs}->string;
436             }
437              
438             =pod
439              
440             =head2 print
441              
442             $status = $csv->print($io, $columns);
443              
444             The C<print> method is provided as a convenience, and is passed through
445             to the underlying L<Text::CSV_XS> object.
446              
447             =cut
448              
449             sub print {
450 0     0 1 0 shift->{csv_xs}->print(@_);
451             }
452              
453             =pod
454              
455             =head2 fields
456              
457             @fields = $csv->fields;
458              
459             The C<fields> method is provided as a convenience, and is passed through
460             to the underlying L<Text::CSV_XS> object. It shows the actual row as an array.
461              
462             NOTE: relying on L</fields> to return the current data after C<fetch>
463             is deprecated and will (probably) be removed in a future release.
464             Only rely on its value after C<combine>. See similar warnings in
465             L</fetch> and L</string>.
466              
467             =cut
468              
469             sub fields {
470 4     4 1 844 my $self = shift;
471 4 100       13 if ($self->{savedrow}) {
472 2         2 $self->combine(@{$self->{savedrow}});
  2         5  
473 2         25 delete $self->{savedrow};
474             }
475 4         11 $self->{csv_xs}->fields;
476             }
477              
478             =pod
479              
480             =head2 names
481              
482             # Get the current column names in use
483             my @names = $csv->names;
484              
485             # Change the column names on the fly mid stream
486             $csv->names( 'fn1', 'fn2' );
487              
488             The C<names> method gets or sets the column name mapping for the parser.
489              
490             If the parser has no names or fields, returns the null list.
491              
492             =cut
493              
494             sub names {
495 6     6 1 14 my $self = shift;
496 6         9 my $names = $self->{names};
497 6 100       15 if ( $names ) {
498 5 100       13 @$names = @_ if @_;
499 5         33 return @$names;
500             }
501 1 50       3 $self->{names} = [ @_ ] if @_;
502 1         4 return @_;
503             }
504              
505             =pod
506              
507             =head2 errstr
508              
509             On error, the C<errstr> method returns the error that occurred.
510              
511             If the last action was NOT an error, returns the null string C<''>.
512              
513             =cut
514              
515             sub errstr {
516 27     27 1 1456 $_[0]->{errstr};
517             }
518              
519             1;
520              
521             =pod
522              
523             =head1 SUPPORT
524              
525             Bugs should always be reported via the CPAN bug tracker at
526              
527             L
528              
529             For other issues, or commercial enhancement or support, contact the author.
530              
531             =head1 AUTHORS
532              
533             Adam Kennedy E<lt>adamk@cpan.orgE<gt>
534              
535             =head1 CONTRIBUTORS
536              
537             Uwe Sarnowski E<lt>uwes@cpan.orgE<gt>
538              
539             Ken Williams E<lt>kwilliams@cpan.orgE<gt>
540              
541             =head1 SEE ALSO
542              
543             L, L
544              
545             =head1 COPYRIGHT
546              
547             Copyright 2006 - 2012 Adam Kennedy.
548              
549             This program is free software; you can redistribute
550             it and/or modify it under the same terms as Perl itself.
551              
552             The full text of the license can be found in the
553             LICENSE file included with this module.
554              
555             =cut