File Coverage

blib/lib/Data/TableReader/Decoder/CSV.pm
Criterion Covered Total %
statement 86 103 83.5
branch 34 60 56.6
condition 21 37 56.7
subroutine 16 17 94.1
pod 2 3 66.6
total 159 220 72.2


line stmt bran cond sub pod time code
1             package Data::TableReader::Decoder::CSV;
2 4     4   97052 use Moo 2;
  4         10251  
  4         37  
3 4     4   2267 use Try::Tiny;
  4         10  
  4         236  
4 4     4   24 use Carp;
  4         8  
  4         218  
5 4     4   1962 use IO::Handle;
  4         22782  
  4         3700  
6             extends 'Data::TableReader::Decoder';
7              
8             # ABSTRACT: Access rows of a comma-delimited text file
9             our $VERSION = '0.011'; # VERSION
10              
11             our @csv_probe_modules= ( ['Text::CSV_XS' => 1.06], ['Text::CSV' => 1.91] );
12             our $default_csv_module;
13             sub default_csv_module {
14 21   66 21 0 588 $default_csv_module ||=
15             Data::TableReader::Decoder::_first_sufficient_module('CSV parser', \@csv_probe_modules);
16             }
17              
18              
19             has _parser_args => ( is => 'ro', init_arg => 'parser' );
20              
21             has parser => ( is => 'lazy', init_arg => undef );
22             sub _build_parser {
23 16     16   130 my $self= shift;
24 16   50     85 my $args= $self->_parser_args || {};
25 16 50       114 return $args if ref($args)->can('getline');
26 16         49 return $self->default_csv_module->new({
27             binary => 1,
28             allow_loose_quotes => 1,
29             auto_diag => 2,
30             %$args
31             });
32             }
33              
34              
35             has autodetect_encoding => ( is => 'rw', default => sub { 1 } );
36              
37             sub encoding {
38 18     18 1 43 my ($self, $enc)= @_;
39 18         36 my $fh= $self->file_handle;
40 18 50       51 if (defined $enc) {
41 0         0 binmode($fh, ":encoding($enc)");
42 0         0 return $enc;
43             }
44            
45 18         114 my @layers= PerlIO::get_layers($fh);
46 18 50       60 if (($enc)= grep { /^encoding|^utf/ } @layers) {
  23         124  
47             # extract encoding name
48 0 0       0 return 'UTF-8' if $enc eq 'utf8';
49 0 0       0 return uc($1) if $enc =~ /encoding\(([^)]+)\)/;
50 0         0 return uc($enc); # could throw a parse error, but this is probably more useful behavior
51             }
52            
53             # fh_start_pos will be set if we have already checked for BOM
54 18 50 33     152 if ($self->autodetect_encoding && !defined $self->_fh_start_pos) {
55 18   50     112 $self->_fh_start_pos(tell $fh or 0);
56 18 100       51 if (($enc= $self->_autodetect_bom($fh))) {
57 1     1   7 binmode($fh, ":encoding($enc)");
  1         3  
  1         5  
  6         83  
58             # re-mark the start after the BOM
59 6   50     18425 $self->_fh_start_pos(tell $fh or 0);
60 6         19 return $enc;
61             }
62             }
63 11         33 return '';
64             }
65              
66              
67             has _fh_start_pos => ( is => 'rw' );
68             has _iterator => ( is => 'rw', weak_ref => 1 );
69             has _row_ref => ( is => 'rw' );
70             sub iterator {
71 20     20 1 7050 my $self= shift;
72 20 100       446 croak "Multiple iterators on CSV stream not supported yet" if $self->_iterator;
73 19         417 my $parser= $self->parser;
74 19         12426 my $fh= $self->file_handle;
75 19         51 my $row_ref= $self->_row_ref;
76             # Keeping this object is just an indication of whether an iterator has been used yet
77 19 100       56 if (!$row_ref) {
    50          
78 18         55 $self->_row_ref($row_ref= \(my $row= 0));
79             # trigger BOM detection if needed
80 18         64 my $enc= $self->encoding;
81 17   100     121 $self->_log->('debug', "encoding is ".($enc||'maybe utf8'));
82             # ensure _fh_start_pos is set
83 17   100     224 $self->_fh_start_pos(tell $fh or 0);
84             }
85             elsif ($$row_ref) {
86 0         0 $self->_log->('debug', 'Seeking back to start of input');
87 0 0       0 seek($fh, $self->_fh_start_pos, 0)
88             or die "Can't seek back to start of stream";
89 0         0 $$row_ref= 0;
90             }
91             my $i= Data::TableReader::Decoder::CSV::_Iter->new(
92             sub {
93 87     87   173 ++$$row_ref;
94 87 100       2402 my $r= $parser->getline($fh) or return undef;
95 71 100       2134 @$r= @{$r}[ @{$_[0]} ] if $_[0]; # optional slice argument
  26         77  
  26         47  
96 71         331 return $r;
97             },
98             {
99 18         278 row => $row_ref,
100             fh => $fh,
101             origin => $self->_fh_start_pos,
102             }
103             );
104 18         426 $self->_iterator($i);
105 18         259 return $i;
106             }
107              
108             # This design is simplified from File::BOM in that it ignores UTF-32
109             # and in any "normal" case it can read from a pipe with only one
110             # character to push back, avoiding the need to tie the file handle.
111             # It also checks for whether layers have already been enabled.
112             # It also avoids seeking to the start of the file handle, in case
113             # the user deliberately seeked to a position.
114             sub _autodetect_bom {
115 18     18   47 my ($self, $fh)= @_;
116 18         37 my $fpos= tell($fh);
117            
118 18         93 local $!;
119 18 50       150 read($fh, my $buf, 1) || return;
120 18 100 100     113 if ($buf eq "\xFF" || $buf eq "\xFE" || $buf eq "\xEF") {
      100        
121 8 50       27 if (read($fh, $buf, 1, 1)) {
122 8 100 66     43 if ($buf eq "\xFF\xFE") {
    100 66        
    100          
123 2         11 return 'UTF-16LE';
124             } elsif ($buf eq "\xFE\xFF") {
125 2         9 return 'UTF-16BE';
126             } elsif ($buf eq "\xEF\xBB" and read($fh, $buf, 1, 2) and $buf eq "\xEF\xBB\xBF") {
127 2         11 return 'UTF-8';
128             }
129             }
130             }
131            
132             # It wasn't a BOM. Try to undo our read.
133 12         75 $self->_log->('debug', 'No BOM in stream, seeking back to start');
134 12 100       203 if (length $buf == 1) {
    100          
135 10         197 $fh->ungetc(ord $buf);
136             } elsif (!seek($fh, $fpos, 0)) {
137             # Can't seek
138 1 50       22 if ($fh->can('ungets')) { # support for FileHandle::Unget
139 0         0 $fh->ungets($buf);
140             } else {
141 1         190 croak "Can't seek input handle after BOM detection; You should set an encoding manually, buffer the entire input, or use FileHandle::Unget";
142             }
143             }
144 11         63 return;
145             }
146              
147             # If you need to subclass this iterator, don't. Just implement your own.
148             # i.e. I'm not declaring this implementation stable, yet.
149 4     4   1255 use Data::TableReader::Iterator;
  4         8  
  4         142  
150 4     4   1364 BEGIN { @Data::TableReader::Decoder::CSV::_Iter::ISA= ('Data::TableReader::Iterator'); }
151              
152             sub Data::TableReader::Decoder::CSV::_Iter::position {
153 12     12   34 my $f= shift->_fields;
154 12         19 'row '.${ $f->{row} };
  12         47  
155             }
156              
157             sub Data::TableReader::Decoder::CSV::_Iter::progress {
158 0     0   0 my $f= shift->_fields;
159             # lazy-build the file size, using seek
160 0 0       0 unless (exists $f->{file_size}) {
161 0         0 my $pos= tell $f->{fh};
162 0 0 0     0 if (defined $pos and $pos >= 0 and seek($f->{fh}, 0, 2)) {
      0        
163 0         0 $f->{file_size}= tell($f->{fh});
164 0 0       0 seek($f->{fh}, $pos, 0) or die "seek: $!";
165             } else {
166 0         0 $f->{file_size}= undef;
167             }
168             }
169 0 0       0 return $f->{file_size}? (tell $f->{fh})/$f->{file_size} : undef;
170             }
171              
172             sub Data::TableReader::Decoder::CSV::_Iter::tell {
173 4     4   16 my $f= shift->_fields;
174 4         9 my $pos= tell($f->{fh});
175 4 50 33     17 return undef unless defined $pos && $pos >= 0;
176 4         5 return [ $pos, ${$f->{row}} ];
  4         13  
177             }
178              
179             sub Data::TableReader::Decoder::CSV::_Iter::seek {
180 7     7   24 my ($self, $to)= @_;
181 7         30 my $f= $self->_fields;
182 7 50       54 seek($f->{fh}, ($to? $to->[0] : $f->{origin}), 0) or croak("seek failed: $!");
    50          
183 7 50       18 ${ $f->{row} }= $to? $to->[1] : 0;
  7         17  
184 7         27 1;
185             }
186              
187             1;
188              
189             __END__