File Coverage

blib/lib/Metabrik/File/Csv.pm
Criterion Covered Total %
statement 9 289 3.1
branch 0 150 0.0
condition 0 84 0.0
subroutine 3 9 33.3
pod 2 6 33.3
total 14 538 2.6


line stmt bran cond sub pod time code
1             #
2             # $Id$
3             #
4             # file::csv Brik
5             #
6             package Metabrik::File::Csv;
7 2     2   1080 use strict;
  2         5  
  2         57  
8 2     2   13 use warnings;
  2         4  
  2         53  
9              
10 2     2   10 use base qw(Metabrik);
  2         17  
  2         6137  
11              
12             sub brik_properties {
13             return {
14 0     0 1   revision => '$Revision$',
15             tags => [ qw(unstable) ],
16             author => 'GomoR ',
17             license => 'http://opensource.org/licenses/BSD-3-Clause',
18             attributes => {
19             input => [ qw(file) ],
20             output => [ qw(file) ],
21             first_line_is_header => [ qw(0|1) ],
22             separator => [ qw(character) ],
23             escape => [ qw(character) ],
24             header => [ qw($column_header_list) ],
25             encoding => [ qw(utf8|ascii) ],
26             overwrite => [ qw(0|1) ],
27             append => [ qw(0|1) ],
28             write_header => [ qw(0|1) ],
29             use_quoting => [ qw(0|1) ],
30             use_locking => [ qw(0|1) ],
31             unbuffered => [ qw(0|1) ],
32             encoded_fields => [ qw(fields) ],
33             object_fields => [ qw(fields) ],
34             _csv => [ qw(INTERNAL) ],
35             _fd => [ qw(INTERNAL) ],
36             _sb => [ qw(INTERNAL) ],
37             _sc => [ qw(INTERNAL) ],
38             },
39             attributes_default => {
40             first_line_is_header => 1,
41             separator => ',',
42             escape => '"',
43             encoding => 'utf8',
44             overwrite => 0,
45             append => 1,
46             write_header => 1,
47             use_quoting => 0,
48             use_locking => 0,
49             unbuffered => 0,
50             },
51             commands => {
52             read => [ qw(input_file|OPTIONAL) ],
53             write => [ qw(csv_struct output_file|OPTIONAL) ],
54             get_column_values => [ qw($data column_name|column_int) ],
55             read_next => [ qw(input_file|OPTIONAL) ],
56             },
57             require_modules => {
58             'Data::Dump' => [ ],
59             'Text::CSV_XS' => [ ],
60             'Metabrik::File::Read' => [ ],
61             'Metabrik::File::Write' => [ ],
62             'Metabrik::String::Base64' => [ ],
63             'Metabrik::String::Compress' => [ ],
64             },
65             };
66             }
67              
68             sub brik_init {
69 0     0 1   my $self = shift;
70              
71 0 0         my $sb = Metabrik::String::Base64->new_from_brik_init($self) or return;
72 0 0         my $sc = Metabrik::String::Compress->new_from_brik_init($self) or return;
73 0           $self->_sb($sb);
74 0           $self->_sc($sc);
75              
76 0           return $self->SUPER::brik_init;
77             }
78              
79             sub read {
80 0     0 0   my $self = shift;
81 0           my ($input) = @_;
82              
83 0   0       $input ||= $self->input;
84 0 0         $self->brik_help_run_undef_arg('read', $input) or return;
85 0 0         $self->brik_help_run_file_not_found('read', $input) or return;
86              
87 0 0         my $csv = Text::CSV_XS->new({
88             binary => 1,
89             sep_char => $self->separator,
90             allow_loose_quotes => 1,
91             allow_loose_escapes => 1,
92             escape_char => $self->escape,
93             }) or return $self->log->error('read: Text::CSV_XS new failed');
94              
95 0 0         my $fr = Metabrik::File::Read->new_from_brik_init($self) or return;
96 0           $fr->encoding($self->encoding);
97 0 0         my $fd = $fr->open($input) or return;
98              
99             # When some content is too complex to be stored as a standard CSV cell,
100             # we should encode it as base64.
101 0           my $sb = $self->_sb;
102 0           my $sc = $self->_sc;
103 0           my $encoded_fields = $self->encoded_fields;
104 0 0         if (defined($encoded_fields)) {
105 0           my $str = join(',', @$encoded_fields);
106 0           $encoded_fields = { map { $_ => 1 } @$encoded_fields };
  0            
107 0           $self->log->debug("read: will decode field(s) [$str] in encoded format");
108             }
109 0           my $object_fields = $self->object_fields;
110 0 0         if (defined($object_fields)) {
111 0           my $str = join(',', @$object_fields);
112 0           $object_fields = { map { $_ => 1 } @$object_fields };
  0            
113 0           $self->log->debug("read: will decode field(s) [$str] in object format");
114             }
115              
116 0           my $object_re = qr/^OBJECT:(.*)$/;
117 0           my $base64_re = qr/^BASE64:(.*)$/; # Keep for backward compat.
118              
119 0           my $sep = $self->separator;
120 0           my $headers;
121             my $count;
122 0           my $first_line = 1;
123 0           my @rows = ();
124 0           while (my $row = $csv->getline($fd)) {
125             # The CSV file has a header, we output an array of hashes
126 0 0         if ($self->first_line_is_header) {
127 0 0         if ($first_line) { # This is first line
128 0           $headers = $row;
129 0           $count = scalar @$row - 1;
130 0           $first_line = 0;
131 0           $self->header($headers);
132 0           next;
133             }
134              
135 0           my $h;
136             # We have to decode some fields
137 0 0 0       if ($encoded_fields || $object_fields) {
138 0           for (0..$count) {
139 0           my $k = $headers->[$_];
140 0           my $v = $row->[$_];
141 0 0         next unless defined($v);
142             # Decode only if it has been asked and the value is not empty.
143             # Decode the encode format
144 0 0 0       if ($encoded_fields && exists($encoded_fields->{$k}) && length($v)) {
      0        
145 0           my $decoded = $sb->decode($v);
146 0 0         if (! defined($decoded)) {
147 0           $self->log->error("read: decode encoded format failed, ".
148             "skipping data with length [".length($v)."]");
149 0           next;
150             }
151 0           my $gunzipped = $sc->gunzip($decoded);
152 0 0         if (! defined($gunzipped)) {
153 0           $self->log->error("read: gunzip failed, skipping ".
154             "decoded data with length [".length($decoded)."]");
155 0           next;
156             }
157 0           $v = $$gunzipped;
158             }
159             # Decode the object format
160 0 0 0       if ($object_fields && exists($object_fields->{$k}) && length($v)
      0        
      0        
      0        
161             && ($v =~ $object_re || $v =~ $base64_re)) {
162 0           my $decoded = $sb->decode($1);
163 0 0         if (! defined($decoded)) {
164 0           $self->log->error("read: decode object format failed, ".
165             "skipping data with length [".length($v)."]");
166 0           next;
167             }
168 0           $v = eval($decoded);
169             }
170 0           $h->{$k} = $v;
171             }
172             }
173             # Or not.
174             else {
175 0           for (0..$count) {
176 0           $h->{$headers->[$_]} = $row->[$_];
177             }
178             }
179 0           push @rows, $h;
180             }
181             # The CSV has no header, we output an array of arrays
182             else {
183 0           push @rows, $row;
184             }
185             }
186              
187 0 0         if (! $csv->eof) {
188 0           my $error_str = "".$csv->error_diag();
189 0           $self->log->error("read: incomplete: error [$error_str]");
190 0           return \@rows;
191             }
192              
193 0           $fr->close;
194              
195 0           return \@rows;
196             }
197              
198             #
199             # We only handle ARRAY of HASHes format (aoh) for writing
200             #
201             sub write {
202 0     0 0   my $self = shift;
203 0           my ($csv_struct, $output) = @_;
204              
205 0   0       $output ||= $self->output;
206 0 0         $self->brik_help_run_undef_arg('write', $csv_struct) or return;
207 0 0         $self->brik_help_run_invalid_arg('write', $csv_struct, 'ARRAY') or return;
208 0 0         $self->brik_help_run_empty_array_arg('write', $csv_struct, 'ARRAY') or return;
209 0 0         $self->brik_help_run_undef_arg('write', $output) or return;
210              
211 0 0         if (ref($csv_struct->[0]) ne 'HASH') {
212 0           return $self->log->error("write: csv structure does not contain HASHes");
213             }
214              
215 0 0         my $fw = Metabrik::File::Write->new_from_brik_init($self) or return;
216 0           $fw->output($output);
217 0           $fw->encoding($self->encoding);
218 0           $fw->overwrite($self->overwrite);
219 0           $fw->append($self->append);
220 0           $fw->use_locking($self->use_locking);
221 0           $fw->unbuffered($self->unbuffered);
222              
223             # When some content is too complex to be stored as a standard CSV cell,
224             # we should encode it as base64.
225 0           my $sb = $self->_sb;
226 0           my $sc = $self->_sc;
227 0           my $encoded_fields = $self->encoded_fields;
228 0 0         if (defined($encoded_fields)) {
229 0           my $str = join(',', @$encoded_fields);
230 0           $encoded_fields = { map { $_ => 1 } @$encoded_fields };
  0            
231 0           $self->log->debug("write: will encode field(s) [$str] in encoded format");
232             }
233 0           my $object_fields = $self->object_fields;
234 0 0         if (defined($object_fields)) {
235 0           my $str = join(',', @$object_fields);
236 0           $object_fields = { map { $_ => 1 } @$object_fields };
  0            
237 0           $self->log->debug("write: will encode field(s) [$str] in object format");
238             }
239              
240             #
241             # Set header ordering
242             #
243 0           my %order = ();
244 0           my @header = ();
245             # Order headers either by using user provided one or our own default ordering.
246 0 0         if ($self->header) {
247 0           @header = @{$self->header};
  0            
248 0           my $idx = 0;
249 0           for my $k (@header) {
250 0           $order{$k} = $idx;
251 0           $idx++;
252             }
253             }
254             # If user didn't provide her own header, we use first element from struct.
255             else {
256 0           my $first = $csv_struct->[0];
257 0           @header = sort { $a cmp $b } keys %$first;
  0            
258 0           my $idx = 0;
259 0           for my $k (@header) {
260 0           $order{$k} = $idx;
261 0           $idx++;
262             }
263             }
264              
265 0           my $header_count = @header;
266              
267 0           my $is_new_file = (! -f $output);
268 0 0         my $fd = $fw->open or return;
269              
270 0           my $written = '';
271              
272             # Write header if this is a new file and user asked for it.
273 0 0 0       if ($self->write_header && ($is_new_file || $self->overwrite)) {
      0        
274 0           my $data = join($self->separator, @header)."\n";
275 0           my $r = $fw->write($data);
276 0 0         if (! defined($r)) {
277 0           return;
278             }
279 0           $written .= $data;
280             }
281              
282 0           my $separator = $self->separator;
283 0           my $escape = $self->escape;
284              
285 0           local $Data::Dump::INDENT = ""; # No indentation shorten length
286 0           local $Data::Dump::TRY_BASE64 = 0; # Never encode in base64
287              
288             # Write the structure to file.
289 0           for my $this (@$csv_struct) {
290 0           my @fields = ();
291             # We have to decode some fields
292 0 0 0       if ($encoded_fields || $object_fields) {
293 0           for my $key (keys %$this) {
294             # We may have some unwanted data in this HASH, we skip it.
295 0 0         next if (! defined($order{$key}));
296 0           my $k = $key;
297 0           my $v = $this->{$key};
298 0 0         next unless defined($v);
299             # Encode only if it has been asked and the value is not empty.
300 0 0 0       if ($encoded_fields && exists($encoded_fields->{$k}) && length($v)) {
      0        
301             # Gzip to handle UTF-like encodings, cause Base64 does not like that.
302 0           my $gzipped = $sc->gzip($v);
303 0 0         if (! defined($gzipped)) {
304 0           $self->log->error("write: gzip failed, skipping");
305 0           next;
306             }
307 0           $v = $sb->encode($$gzipped);
308 0 0         if (! defined($v)) {
309 0           $self->log->error("write: encode in encoded format failed, skipping");
310 0           next;
311             }
312             }
313             # Encode only if it has been asked and the value is not empty.
314 0 0 0       if ($object_fields && exists($object_fields->{$k}) && length($v)) {
      0        
315             # Encode ARRAYs and HASHes only if they are not empty.
316             # Do not encode simple strings.
317 0 0 0       if (ref($v) eq 'ARRAY' && @$v > 0
    0 0        
      0        
      0        
318             || ref($v) eq 'HASH' && keys %$v > 0) {
319 0           $v = Data::Dump::dump($v); $v =~ s{\n}{}g;
  0            
320 0           $v = 'OBJECT:'.$sb->encode($v);
321 0 0         if (! defined($v)) {
322 0           $self->log->error("write: encode in object format failed, skipping");
323 0           next;
324             }
325             }
326             # If this is a simple string, we do not encode at all.
327             elsif (ref($v) eq '' && length($v)) {
328             }
329             # And for empty objects, we set them to empty string.
330             else {
331 0           $v = "";
332             }
333             }
334 0           $fields[$order{$key}] = $v;
335             }
336             }
337             # Or not.
338             else {
339 0           for my $key (keys %$this) {
340             # We may have some unwanted data in this HASH, we skip it.
341 0 0         next if (! defined($order{$key}));
342 0           $fields[$order{$key}] = $this->{$key};
343             }
344             }
345              
346 0 0         @fields = map { defined($_) ? $_ : '' } @fields;
  0            
347              
348             # If this entry has less fields than the header, we add null entries.
349 0           my $field_count = @fields;
350 0 0         if ($field_count < $header_count) {
351 0           my $diff = $header_count - $field_count;
352 0           for (1..$diff) {
353 0           push @fields, '';
354             }
355             }
356              
357 0 0         if ($self->use_quoting) {
358 0           for (@fields) {
359 0           s/"/${escape}"/g;
360 0           $_ = '"'.$_.'"';
361             }
362             }
363              
364 0           my $data = join($separator, @fields)."\n";
365              
366 0           my $r = $fw->write($data);
367 0 0         if (! defined($r)) {
368 0           next;
369             }
370              
371 0           $written .= $data;
372             }
373              
374 0           $fw->close;
375              
376 0 0         if (! length($written)) {
377 0           return $self->log->error("write: nothing to write");
378             }
379              
380 0           return $written;
381             }
382              
383             sub get_column_values {
384 0     0 0   my $self = shift;
385 0           my ($data, $column) = @_;
386              
387 0 0         $self->brik_help_run_undef_arg('get_column_values', $data) or return;
388 0 0         $self->brik_help_run_invalid_arg('get_column_values', $data, 'ARRAY') or return;
389 0 0         $self->brik_help_run_undef_arg('get_column_values', $column) or return;
390              
391 0           my @results = ();
392             # CSV structure is an ARRAYREF of HASHREFs
393 0 0         if ($self->first_line_is_header) {
    0          
394 0 0         if (@{$self->header} == 0) {
  0            
395 0           return $self->log->error("get_column_values: no CSV header found");
396             }
397              
398 0           for my $row (@$data) {
399 0 0         if (ref($row) ne 'HASH') {
400 0           $self->log->warning("get_column_values: row is not a HASHREF");
401 0           next;
402             }
403 0 0         if (exists($row->{$column})) {
404 0           push @results, $row->{$column};
405             }
406             }
407             }
408             # CSV structure is an ARRAYREF of ARRAYREFs
409             elsif ($column =~ m{^\d+$}) {
410 0           for my $row (@$data) {
411 0 0         if (ref($row) ne 'ARRAY') {
412 0           $self->log->warning("get_column_values: row is not an ARRAYREF");
413 0           next;
414             }
415 0 0         if (exists($row->[$column])) {
416 0           push @results, $row->[$column];
417             }
418             }
419             }
420              
421 0           return \@results;
422             }
423              
424             sub read_next {
425 0     0 0   my $self = shift;
426 0           my ($input) = @_;
427              
428 0   0       $input ||= $self->input;
429 0 0         $self->brik_help_run_undef_arg('read_next', $input) or return;
430 0 0         $self->brik_help_run_file_not_found('read_next', $input) or return;
431              
432 0           my $csv = $self->_csv;
433 0           my $fd = $self->_fd;
434 0 0         if (! defined($csv)) {
435 0           $self->log->debug('read_next: first call, create _csv');
436 0 0         $csv = Text::CSV_XS->new({
437             binary => 1,
438             sep_char => $self->separator,
439             allow_loose_quotes => 1,
440             allow_loose_escapes => 1,
441             escape_char => $self->escape,
442             }) or return $self->log->error('read_next: Text::CSV_XS new failed');
443 0           $self->_csv($csv);
444              
445 0 0         my $fr = Metabrik::File::Read->new_from_brik_init($self) or return;
446 0           $fr->encoding($self->encoding);
447 0 0         $fd = $fr->open($input) or return;
448 0           $self->_fd($fd);
449              
450 0 0         if ($self->first_line_is_header) {
451 0           my $header = $csv->getline($fd);
452 0           $self->header($header);
453             }
454             }
455              
456             # When some content is too complex to be stored as a standard CSV cell,
457             # we should encode it as base64.
458 0           my $sb = $self->_sb;
459 0           my $sc = $self->_sc;
460 0           my $encoded_fields = $self->encoded_fields;
461 0 0         if (defined($encoded_fields)) {
462 0           my $str = join(',', @$encoded_fields);
463 0           $encoded_fields = { map { $_ => 1 } @$encoded_fields };
  0            
464 0           $self->log->debug("read_next: will decode field(s) [$str] in base64");
465             }
466 0           my $object_fields = $self->object_fields;
467 0 0         if (defined($object_fields)) {
468 0           my $str = join(',', @$object_fields);
469 0           $object_fields = { map { $_ => 1 } @$object_fields };
  0            
470 0           $self->log->debug("read_next: will decode field(s) [$str] in object format");
471             }
472              
473 0           my $object_re = qr/^OBJECT:(.*)$/;
474 0           my $base64_re = qr/^BASE64:(.*)$/; # Keep for backward compat.
475              
476 0           my $row = $csv->getline($fd);
477              
478             # If a header is given as an Attribute, we use it to return a HASH
479 0           my $header = $self->header;
480 0 0         if (defined($header)) {
481 0           my $h = {};
482 0           my $i = 0;
483             # We have to decode some fields
484 0 0 0       if ($encoded_fields || $object_fields) {
485 0           for (@$header) {
486 0           my $k = $_;
487 0           my $v = $row->[$i++];
488 0 0         next unless defined($v);
489             # Decode only if it has been asked and the value is not empty.
490             # Decode the encode format
491 0 0 0       if ($encoded_fields && exists($encoded_fields->{$k}) && length($v)) {
      0        
492 0           my $decoded = $sb->decode($v);
493 0 0         if (! defined($decoded)) {
494 0           $self->log->error("read_next: decode failed, skipping data with ".
495             "with length [".length($v)."]");
496 0           next;
497             }
498 0           my $gunzipped = $sc->gunzip($decoded);
499 0 0         if (! defined($gunzipped)) {
500 0           $self->log->error("read_next: gunzip failed, skipping ".
501             "decoded data with length [".length($decoded)."]");
502 0           next;
503             }
504 0           $v = $$gunzipped;
505             }
506             # Decode the object format
507 0 0 0       if ($object_fields && exists($object_fields->{$k}) && length($v)
      0        
      0        
      0        
508             && ($v =~ $object_re || $v =~ $base64_re)) {
509 0           my $decoded = $sb->decode($1);
510 0 0         if (! defined($decoded)) {
511 0           $self->log->error("read_next: decode object format failed, ".
512             "skipping data with length [".length($v)."]");
513 0           next;
514             }
515 0           $v = eval($decoded);
516             }
517 0           $h->{$k} = $v;
518             }
519             }
520             # Or not.
521             else {
522 0           for (@$header) {
523 0           $h->{$_} = $row->[$i++];
524             }
525             }
526 0           $row = $h;
527             }
528              
529 0 0         if ($csv->eof) {
530 0           $self->log->debug('read_next: eof reached');
531 0           $self->_fd(undef);
532 0           $self->_csv(undef);
533 0           return 0;
534             }
535              
536 0           return $row;
537             }
538              
539             1;
540              
541             __END__