File Coverage

blib/lib/Text/LooseCSV.pm
Criterion Covered Total %
statement 108 188 57.4
branch 31 98 31.6
condition 8 27 29.6
subroutine 12 20 60.0
pod 0 15 0.0
total 159 348 45.6


line stmt bran cond sub pod time code
1             #
2             # $Id: LooseCSV.pm,v 1.6 2007/11/21 04:23:08 rsandberg Exp $
3             #
4              
5             package Text::LooseCSV;
6              
7             $VERSION = 1.6;
8              
9 1     1   26816 use strict;
  1         3  
  1         1701  
10              
11              
12             =head1 NAME
13              
14             Text::LooseCSV - Highly forgiving variable length record text parser; compare to MS Excel
15              
16              
17             =head1 SYNOPSIS
18              
19             use Text::LooseCSV;
20             use IO::File;
21              
22             $fh = new IO::File $fname;
23             $f = new Text::LooseCSV($fh);
24              
25             # Some optional settings
26             $f->word_delimiter("\t");
27             $f->line_delimiter("\n");
28             $f->no_quotes(1);
29              
30             # Parse/split a line
31             while ($rec = $f->next_record())
32             {
33             if ($rec == -1)
34             {
35             warn("corrupt rec: ", $f->cur_line);
36             next;
37             }
38              
39             # process $rec as arrayref
40             ...
41             }
42              
43              
44             # Or, (vice-versa) create a variable-length record file
45             $line = $f->form_record( [ 'Debbie Does Dallas','30.00','VHS','Classic' ] );
46              
47             =head1 DESCRIPTION
48              
49             Why another variable-length text record parser? I've had the privilege to parse some of the gnarliest data ever seen
50             and everything else I tried on CPAN choked (at the time I wrote this module). This module has
51             been munching on millions of records of the filthiest data imaginable at several production
52             sites so I thought I'd contribute.
53              
54             This module follows somewhat loose rules (compare to MS Excel) and will handle embedded newlines, etc.
55             It is capable of handling large files and processes data in line-chunks. If MAX_LINEBUF is
56             reached, however, it will mark the current record as corrupt, return -1 and start over
57             again at the very next line. This will (of course) process tab-delimited data or whatever value
58             you set for C.
59              
60             Methods are called in perl OO fashion.
61              
62              
63             WARNING this module messes with $/
64             C sets $/ and is always called during construction. Don't change $/ during
65             program execution!
66              
67              
68             =head1 METHOD DETAILS
69              
70             =over 4
71              
72              
73             =item C
74              
75             $f = new Text::LooseCSV($fh);
76              
77             Create a new Text::LooseCSV object for all your variable-length record needs with an optional
78             file handle, $fh (e.g. IO::File). Set properties using the accessor methods as needed.
79              
80             If $fh is not given, you can use input_file() or input_text().
81              
82             Returns a blessed Text::LooseCSV object.
83              
84             =cut
85             sub new
86             {
87 1     1 0 12 my ($caller,$fh) = @_;
88              
89 1   33     8 my $class = ref($caller) || $caller;
90            
91 1         10 my $self = {
92             QUOTE_ESCAPE => '"',
93             WORD_DELIMITER => ',',
94             MAX_LINEBUF => 1000,
95             RECADD => 0,
96             NO_QUOTES => 0,
97             ALWAYS_QUOTE => 0,
98             WORD_LINE_DELIMITER_ESCAPE => undef,
99             linebuf => [],
100             fh => $fh,
101             };
102 1         4 line_delimiter($self,"\r\n");
103 1         4 return bless($self,$class);
104             }
105            
106             =pod
107              
108             =item C
109              
110             $current_value = $f->line_delimiter("\n");
111              
112             Get/set LINE_DELIMITER.
113             LINE_DELIMITER defines the line boundary chunks that are read into the
114             buffer and loosely defines the record delimiter.
115              
116             For parsing, this does not strictly affect the record/field structures as fields may
117             have embedded newlines, etc. However, this DOES need to be set correctly.
118              
119             Default = "\r\n" NOTE! The default is Windows format.
120              
121             Always returns the current set value.
122              
123             WARNING! line_delimiter() also sets $/ and is always called during construction.
124             Due to buffering, don't change $/ or LINE_DELIMITER during program execution!
125              
126              
127             =cut
128             sub line_delimiter
129             {
130 1     1 0 2 my ($self,$newval) = @_;
131 1 50       4 if (defined($newval))
132             {
133 1         5 return $self->{LINE_DELIMITER} = $/ = $newval;
134             }
135 0         0 return $self->{LINE_DELIMITER};
136             }
137              
138             =pod
139              
140             =item C
141              
142             $current_value = $f->word_delimiter("\t");
143              
144             Get/set WORD_DELIMITER.
145             WORD_DELIMITER defines the field boundaries within the record.
146             WORD_DELIMITER may only be set to a single character, otherwise a warning
147             is generated and the new value is ignored.
148              
149             Default = "," NOTE! Single character only.
150              
151             Always returns the current set value.
152              
153             WARNING! Due to buffering, don't change WORD_DELIMITER during program execution!
154              
155              
156             =cut
157             sub word_delimiter
158             {
159 0     0 0 0 my ($self,$newval) = @_;
160 0 0       0 if (defined($newval))
161             {
162 0 0       0 if (length($newval) != 1)
163             {
164 0         0 warn("WORD_DELIMITER may only be a single character, ignoring set value to [$newval]");
165             }
166             else
167             {
168 0         0 $self->{WORD_DELIMITER} = $newval;
169             }
170             }
171 0         0 return $self->{WORD_DELIMITER};
172             }
173              
174             =pod
175              
176             =item C
177              
178             $current_value = $f->quote_escape("\\");
179              
180             Get/set QUOTE_ESCAPE.
181             For data that have fields enclosed in quotes, QUOTE_ESCAPE defines the escape character for '"'
182             e.g. for the default QUOTE_ESCAPE = '"', to embed a quote character in a field (MS Excel style):
183              
184             "field1 ""junk"" and more, etc"
185              
186             Default = '"'
187              
188             Always returns the current set value.
189              
190             WARNING! Due to buffering, don't change QUOTE_ESCAPE during program execution!
191              
192              
193             =cut
194             sub quote_escape
195             {
196 0     0 0 0 my ($self,$newval) = @_;
197 0 0       0 if (defined($newval))
198             {
199 0         0 return $self->{QUOTE_ESCAPE} = $newval;
200             }
201 0         0 return $self->{QUOTE_ESCAPE};
202             }
203              
204             =pod
205              
206             =item C
207              
208             $current_value = $f->word_line_delimiter_escape("\\");
209              
210             Get/set WORD_LINE_DELIMITER_ESCAPE.
211             Sometimes you'll encounter (or want to create) files where WORD_DELIMITER and/or
212             LINE_DELIMITER's are embedded in the data and the creator had the notion (courtesy?)
213             to escape those characters when they appeared within a field with say, '\'. If so,
214             you'll want to set WORD_LINE_DELIMITER_ESCAPE to that character.
215              
216             If WORD_LINE_DELIMITER_ESCAPE is specified, this character must be escaped by the
217             same character to be included in a field.
218             e.g. for a tab-delimited file where WORD_LINE_DELIMITER_ESCAPE => '\'
219             follows is a sample record with an embedded newline:
220              
221             STABEyouETABEthis is a single field that contains an escaped line terminator\
222             an escaped tab\ETABE and an actual \\ETABEthis is the next field...>
223              
224             Do not use WORD_LINE_DELIMITER_ESCAPE for data with fields that are enclosed in
225             quotes.
226              
227             WORD_LINE_DELIMITER_ESCAPE cannot be '_', will otherwise be silently ignored.
228              
229             Default = undef()
230              
231             Always returns the current set value.
232              
233             WARNING! Due to buffering, don't change WORD_LINE_DELIMITER_ESCAPE during program execution!
234              
235              
236             =cut
237             sub word_line_delimiter_escape
238             {
239 0     0 0 0 my ($self,$newval) = @_;
240 0 0 0     0 if (defined($newval) && $newval ne '_')
241             {
242 0         0 return $self->{WORD_LINE_DELIMITER_ESCAPE} = $newval;
243             }
244 0         0 return $self->{WORD_LINE_DELIMITER_ESCAPE};
245             }
246              
247             =pod
248              
249             =item C
250              
251             $current_value = $f->no_quotes($bool);
252              
253             Get/set NO_QUOTES.
254             Instruct C to strip WORD_DELIMITER and LINE_DELIMITER from fields within the record
255             and never to enclose fields in quotes.
256              
257             By default, if, during record formation a WORD_DELIMITER or LINE_DELIMITER is encountered in a field
258             value, that field will be enclosed in quotes. However, if NO_QUOTES = 1 any occurence of
259             WORD_DELIMITER or LINE_DELIMITER will be stripped from the value and no enclosing quotes will be used.
260              
261             If ALWAYS_QUOTE = 1 this attribute is ignored and quotes will always be used.
262              
263             Only affects C.
264              
265             Default = 0 (by default records created with C may have fields enclosed in quotes)
266              
267             Always returns the current set value.
268              
269              
270             =cut
271             sub no_quotes
272             {
273 0     0 0 0 my ($self,$newval) = @_;
274 0 0       0 if (defined($newval))
275             {
276 0         0 return $self->{NO_QUOTES} = $newval;
277             }
278 0         0 return $self->{NO_QUOTES};
279             }
280              
281             =pod
282              
283             =item C
284              
285             $current_value = $f->always_quote($bool);
286              
287             Get/set ALWAYS_QUOTE.
288             Always enclose fields in quotes when using C. Only affects C.
289             Takes precedence over C.
290              
291             Default = 0
292              
293             Always returns the current set value.
294              
295              
296             =cut
297             sub always_quote
298             {
299 1     1 0 1004 my ($self,$newval) = @_;
300 1 50       7 if (defined($newval))
301             {
302 1         5 return $self->{ALWAYS_QUOTE} = $newval;
303             }
304 0         0 return $self->{ALWAYS_QUOTE};
305             }
306              
307             =pod
308              
309             =item C
310              
311             $current_value = $f->max_linebuf($integer);
312              
313             Get/set MAX_LINEBUF.
314             A file is read in line chunks and because newlines are allowed to be embedded in the field
315             values, many lines may be read and buffered before the whole record is determined.
316             MAX_LINEBUF sets the maximum number of lines that are used to parse a record before
317             the first line of that block is determined junk and -1 is returned from C.
318             Processing then continues at the very next line in the file.
319              
320             Default = 1000
321              
322             Always returns the current set value.
323              
324              
325             =cut
326             sub max_linebuf
327             {
328 0     0 0 0 my ($self,$newval) = @_;
329 0 0       0 if (defined($newval))
330             {
331 0         0 return $self->{MAX_LINEBUF} = $newval;
332             }
333 0         0 return $self->{MAX_LINEBUF};
334             }
335              
336             =pod
337              
338             =item C
339              
340             $current_value = $f->recadd($bool);
341              
342             Get/set RECADD.
343             If set to true, LINE_DELIMITER (actually $/) will be added to the end of the value returned from
344             C.
345             Only affects C
346              
347             Default = 0
348              
349             Always returns the current set value.
350              
351              
352             =cut
353             sub recadd
354             {
355 0     0 0 0 my ($self,$newval) = @_;
356 0 0       0 if (defined($newval))
357             {
358 0         0 return $self->{RECADD} = $newval;
359             }
360 0         0 return $self->{RECADD};
361             }
362              
363             =pod
364              
365             =item C
366              
367             $current_value = $f->input_file($fh);
368              
369             Get/set the filehandle of the file to be parsed (e.g. IO::File object).
370             May also be set in the constructor.
371              
372             Default = undef
373              
374             Always returns the current set value.
375              
376              
377             =cut
378             sub input_file
379             {
380 0     0 0 0 my ($self,$fh) = @_;
381 0 0       0 if (defined($fh))
382             {
383 0         0 return $self->{fh} = $fh;
384             }
385 0         0 return $self->{fh};
386             }
387              
388             =pod
389              
390             =item C
391              
392             $textbuf = $f->input_text($text_blob);
393              
394             Alternative to C, feed the entire text of a file or scalar to $f at once. Accepts scalar or scalar reference.
395              
396             Returns the internal textbuf attr.
397              
398              
399             =cut
400             sub input_text
401             {
402 1     1 0 5 my $self = shift;
403 1         4 my $recdel = quotemeta($/);
404 1 50       28 $self->{textbuf} = [ split(/$recdel/,(ref($_[0]) ? ${$_[0]} : $_[0])) ];
  0         0  
405             }
406              
407             =pod
408              
409             =item C
410              
411             $rec = $f->next_record();
412              
413             Parses and returns an arrayref of the fields of the next record.
414              
415             return '' if EOF is encountered
416              
417             return -1 if the next record is corrupted (incomplete, etc) or if MAX_LINEBUF is reached
418              
419              
420             WARNING! Due to buffering, don't change $/ or LINE_DELIMITER during program execution!
421              
422              
423             =cut
424             sub next_record
425             {
426 1     1 0 7 my ($self) = @_;
427              
428 1         2 my @fields;
429 1         2 my $linebuf_pos = -1;
430              
431 1         5 my $line = $self->next_line(\$linebuf_pos);
432 1 50 33     7 return -1 if ref($line) || !defined($line);
433 1 50       3 return $line unless $line;
434              
435 1         3 my $qe = quotemeta($self->{QUOTE_ESCAPE});
436 1         2 my $wd = quotemeta($self->{WORD_DELIMITER});
437              
438 1         2 my $lde_orig = $self->{WORD_LINE_DELIMITER_ESCAPE};
439 1 50       5 my $lde = defined($lde_orig) ? quotemeta($lde_orig) : undef;
440            
441 1         2 my $val;
442 1         2 my $rec_err = 0;
443              
444             # If a word contains ["$qe$wd$line_delimiter] then it must be surrounded in quotes or preceded by WORD_LINE_DELIMITER_ESCAPE
445 1         4 while (length($line))
446             {
447 4         5 my $match;
448 4         8 my $val = '';
449 4         5 my $delim = '';
450              
451 4 50       11 if (defined($lde))
452             {
453 0         0 $match = scalar($line =~ m/^
454             ((?:$lde$wd|[^$wd])*?) # an unquoted text
455             (?:\Z|($wd)) # plus EOL, or delimiter
456             ([\000-\377]*) # the rest
457             /xs); # extended layout
458              
459 0 0       0 if ($match)
460             {
461 0         0 $val = $1;
462 0         0 $delim = $2;
463 0         0 $line = $3;
464             }
465             }
466             else
467             {
468              
469 4         176 $match = scalar($line =~ m/^(?:(?:" # a quote
470             ((?:$qe"|[^"])*) # and quoted text " is escaped by $qe
471             ") # followed by the same quote
472             | # --OR--
473             ([^$wd"]*?)) # an unquoted text
474             (?:\Z|($wd)) # plus EOL, or delimiter
475             ([\000-\377]*) # the rest
476             /xs); # extended layout
477              
478 4 50       14 if ($match)
479             {
480 4 100       30 $val = defined($1) ? $1 : $2;
481 4         7 $delim = $3;
482 4         12 $line = $4;
483             }
484             }
485              
486 4         5 my $ldedef = $lde;
487 4 50       12 $ldedef = '' unless defined($lde);
488              
489 4 50       10 if ($match)
490             {
491 4         17 $val =~ s/$qe"/"/g;
492 4         16 $val =~ s/$ldedef$wd/$self->{WORD_DELIMITER}/g;
493 4 50       9 $val =~ s/__WORDLINETERMINATORESCAPE__/$lde_orig/g if defined($lde);
494 4         8 push(@fields,$val);
495 4 50 66     35 push(@fields,'') if (defined($delim) && length($delim)) && !length($line);
      66        
496             }
497             else
498             {
499 0         0 my $rs = $self->{le};
500 0         0 my $nl = $self->next_line(\$linebuf_pos);
501 0 0       0 return -1 if ref($nl);
502 0 0       0 length("${nl}${rs}") || ($rec_err++,last);
503 0         0 $line .= $rs . $nl;
504             }
505             }
506              
507 1 50       13 if ($rec_err)
508             {
509 0         0 $linebuf_pos = 0;
510             }
511              
512 1         4 $self->{cur_line} = join('',@{$self->{linebuf}}[0..$linebuf_pos]);
  1         5  
513              
514 1 50       2 if ($#{$self->{linebuf}} >= (++$linebuf_pos))
  1         5  
515             {
516 0         0 @{$self->{linebuf}} = @{$self->{linebuf}}[$linebuf_pos..$#{$self->{linebuf}}];
  0         0  
  0         0  
  0         0  
517             }
518             else
519             {
520 1         3 $self->{linebuf} = [];
521             }
522 1 50       4 return -1 if $rec_err;
523 1         5 return \@fields;
524             }
525              
526             =pod
527              
528             =item C
529              
530             $raw = $f->cur_line();
531              
532             Returns the raw text line currently being processed (including a line terminator if originally present).
533              
534              
535             =cut
536             sub cur_line
537             {
538 0     0 0 0 my $self = shift;
539 0         0 return $self->{cur_line};
540             }
541              
542              
543              
544             sub next_line
545             {
546 1     1 0 2 my ($self,$linebuf_pos) = @_;
547              
548 1         3 $self->{le} = '';
549              
550 1 50       4 (warn("MAX_LINEBUF limit reached"),return undef) if ($$linebuf_pos+1) >= $self->{MAX_LINEBUF};
551              
552 1         2 my $fh = $self->{fh};
553              
554 1         2 my $lde_orig = $self->{WORD_LINE_DELIMITER_ESCAPE};
555 1 50       3 my $lde = defined($lde_orig) ? quotemeta($lde_orig) : undef;
556              
557 1 50       2 unless ($$linebuf_pos < $#{$self->{linebuf}})
  1         4  
558             {
559 1         2 my $l;
560 1 50       3 if ($fh)
561             {
562 1     1   10 no strict;
  1         2  
  1         54  
563 0         0 $l = <$fh>;
564 1     1   6 use strict;
  1         7  
  1         242  
565              
566 0 0       0 if (defined($lde))
567             {
568 0 0       0 (warn("Incompatible string in line: $l"),return {}) if $l =~ /__WORDLINETERMINATORESCAPE__/;
569 0         0 $l =~ s/$lde{2}/__WORDLINETERMINATORESCAPE__/g;
570 0         0 my $nl = $l;
571 0         0 $l = '';
572 0         0 while ($nl =~ s/$lde(\r?)$/$1/)
573             {
574 0         0 $l .= $nl;
575 1     1   5 no strict;
  1         3  
  1         33  
576 0         0 $nl = <$fh>;
577 1     1   5 use strict;
  1         2  
  1         863  
578 0 0       0 (warn("Incompatible string in line: $nl"),return {}) if $nl =~ /__WORDLINETERMINATORESCAPE__/;
579 0         0 $nl =~ s/$lde{2}/__WORDLINETERMINATORESCAPE__/g;
580             }
581 0         0 $l .= $nl;
582             }
583             }
584             else
585             {
586 1         1 $l = shift(@{$self->{textbuf}});
  1         3  
587 1 50       2 $l .= $/ if @{$self->{textbuf}};
  1         3  
588              
589 1 50       15 if (defined($lde))
590             {
591 0 0       0 (warn("Incompatible string in line: $l"),return {}) if $l =~ /__WORDLINETERMINATORESCAPE__/;
592 0         0 $l =~ s/$lde{2}/__WORDLINETERMINATORESCAPE__/g;
593 0         0 my $nl = $l;
594 0         0 $l = '';
595 0         0 while ($nl =~ s/$lde(\r?)$/$1/)
596             {
597 0         0 $l .= $nl;
598              
599 0         0 $nl = shift(@{$self->{textbuf}});
  0         0  
600 0 0       0 $nl .= $/ if @{$self->{textbuf}};
  0         0  
601              
602 0 0       0 (warn("Incompatible string in line: $nl"),return {}) if $nl =~ /__WORDLINETERMINATORESCAPE__/;
603 0         0 $nl =~ s/$lde{2}/__WORDLINETERMINATORESCAPE__/g;
604             }
605 0         0 $l .= $nl;
606             }
607             }
608 1 50       3 length($l) || return '';
609 1         2 push(@{$self->{linebuf}},$l);
  1         3  
610             }
611 1         3 my $l = $self->{linebuf}[++$$linebuf_pos];
612              
613             ##at these next lines would suck if defined($lde) and the last character of the last record of the file was an escaped line-terminator
614 1 50       6 $self->{le} = $/ if chomp($l);
615 1 50       5 if (substr($l,-1) eq "\r")
616             {
617 0         0 $self->{le} = chop($l) . $self->{le};
618             }
619 1 50 33     12 if (substr($l,-1) eq $self->{WORD_DELIMITER} && !defined($lde))
620             {
621 0         0 $l .= '""';
622             }
623 1         3 return $l;
624             }
625              
626             =pod
627              
628             =item C
629              
630             $line = $f->form_record($array_of_fields);
631              
632             Returns a WORD_DELIMITED joined text scalar variable-length record of $array_of_fields. Also see C.
633              
634             $array_of_fields may be an array or arrayref.
635              
636             =cut
637             sub form_record
638             {
639 1     1 0 5 my $self = shift;
640 1         3 my @rec = ();
641 1 50       4 if (ref($_[0]))
642             {
643 1         1 @rec = @{$_[0]};
  1         4  
644             }
645             else
646             {
647 0         0 @rec = @_;
648             }
649 1         2 my $ret = '';
650              
651 1         3 my $wd = quotemeta($self->{WORD_DELIMITER});
652 1         2 my $ld = quotemeta($self->{LINE_DELIMITER});
653 1         2 my $lde_orig = $self->{WORD_LINE_DELIMITER_ESCAPE};
654 1 50       3 my $lde = defined($lde_orig) ? quotemeta($lde_orig) : undef;
655              
656 1         3 foreach my $field (@rec)
657             {
658 4 50 0     15 if ($self->{ALWAYS_QUOTE} || (!defined($lde) && !$self->{NO_QUOTES} && $field =~ /(?:$wd)|(?:$ld)|\"/s))
    0 0        
    0 33        
659             {
660 4         5 $field =~ s/"/$self->{QUOTE_ESCAPE}"/gs;
661 4         9 $field = qq["$field"];
662             }
663             elsif (defined($lde))
664             {
665 0         0 $field =~ s/((?:$wd)|(?:$ld)|(?:$lde))/$lde_orig$1/gs;
666             }
667             elsif ($self->{NO_QUOTES})
668             {
669 0         0 $field =~ s/(?:$wd)|(?:$ld)//gs;
670             }
671 4         9 $ret .= "$field$self->{WORD_DELIMITER}";
672             }
673 1         2 chop($ret);
674 1 50       3 $ret .= $/ if $self->{RECADD};
675 1         5 return $ret;
676             }
677              
678             =pod
679              
680             =back
681              
682             =head1 BUGS
683              
684             None as yet. This code has been used at several production sites before publishing to the public.
685              
686              
687             =head1 AUTHORS
688              
689             Reed Sandberg, Ereed_sandberg Ӓ yahooE
690              
691              
692             =head1 COPYRIGHT
693              
694             Copyright (C) 2001-2007 Reed Sandberg
695             All rights reserved. This program is free software; you can redistribute
696             it and/or modify it under the same terms as Perl itself.
697              
698              
699             =cut
700              
701             1;