File Coverage

blib/lib/Text/AutoCSV.pm
Criterion Covered Total %
statement 1455 1700 85.5
branch 672 930 72.2
condition 188 288 65.2
subroutine 109 115 94.7
pod 42 42 100.0
total 2466 3075 80.2


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # ABSTRACT: helper module to automate the use of Text::CSV
3              
4             # vim:tw=100
5              
6             # Text/AutoCSV.pm
7              
8             #
9             # Written by Sébastien Millet
10             # March, July, August, September 2016
11             # January, February 2017
12             #
13              
14             package Text::AutoCSV;
15             $Text::AutoCSV::VERSION = '1.1.8';
16             my $PKG = "Text::AutoCSV";
17              
18 19     19   952532 use strict;
  19         48  
  19         468  
19 16     16   80 use warnings;
  16         32  
  16         851  
20              
21             require Exporter;
22             our @ISA = 'Exporter';
23             our @EXPORT_OK = qw(remove_accents);
24              
25 16     16   88 use Carp;
  16         41  
  16         877  
26 16     16   7773 use Params::Validate qw(validate validate_pos :types);
  16         126107  
  16         3315  
27 16     16   7961 use List::MoreUtils qw(first_index indexes);
  16         117145  
  16         173  
28 16     16   12604 use Fcntl qw(SEEK_SET);
  16         43  
  16         828  
29 16     16   7530 use File::BOM;
  16         421354  
  16         1003  
30 16     16   10630 use Text::CSV;
  16         243550  
  16         827  
31 16     16   12023 use DateTime;
  16         7394925  
  16         869  
32             # DateTime::Format::Strptime 1.70 does not work properly with us.
33             # Actually all version as of 1.63 are fine, except 1.70.
34 16     16   10948 use DateTime::Format::Strptime 1.71;
  16         956979  
  16         145  
35 16     16   11087 use Class::Struct;
  16         25547  
  16         114  
36 16     16   10377 use Unicode::Normalize;
  16         28388  
  16         1065  
37             # lock_keys is used to prevent accessing non existing keys
38             # Credits: 3381159 on http://stackoverflow.com
39             # "make perl shout when trying to access undefined hash key"
40 16     16   10330 use Hash::Util qw(lock_keys);
  16         58558  
  16         106  
41              
42             # FIXME
43             # Not needed in release -> should be always commented unless at dev time
44             #use feature qw(say);
45             #use Data::Dumper;
46             #$Data::Dumper::Sortkeys = 1;
47              
48             # Set to 1 if you wish to turn on debug without touching caller's code
49             our $ALWAYS_DEBUG = 0;
50              
51             # Keep it set to 0 unless you know what you're doing!
52             # Note
53             # Taken into account only if debug is set.
54             my $DEBUG_DATETIME_FORMATS = 0;
55             # The below is taken into account only if $DEBUG_DATETIME_FORMATS is set.
56             # It'll resqult in even more debug output. It becomes really MASSIVE debug output.
57             my $DEBUG_DATETIME_FORMATS_EVEN_MORE = 0;
58              
59             #
60             # Uncomment to replace carp and croak with cluck and confess, respectively
61             # Also reachable with perl option:
62             # -MCarp=verbose
63             # See 'perldoc Carp'.
64             #
65             #$Carp::Verbose = 1;
66              
67              
68             # * *************** *
69             # * BEHAVIOR TUNING *
70             # * *************** *
71              
72              
73             # * **************************************************** *
74             # * ALL THE VARIABLES BELOW ARE RATHER LOW LEVEL. *
75             # * IF YOU UPDATE IT, IT WILL LIKELY BREAK THE TEST PLAN *
76             # * **************************************************** *
77              
78             my $DEF_SEARCH_CASE = 0; # Case insensitive search by default
79             my $DEF_SEARCH_TRIM = 1; # Trim values by default
80             my $DEF_SEARCH_IGNORE_ACCENTS = 1; # Ignore accents
81             my $DEF_SEARCH_IGNORE_EMPTY = 1; # Ignore empty strings in searches by default
82             my $DEF_SEARCH_VALUE_IF_NOT_FOUND = undef; # If not found, returned field value is undef
83             my $DEF_SEARCH_VALUE_IF_AMBIGUOUS = undef; # If more than one record found by search (when a
84             # unique value is expected), return undef
85             my $DEF_SEARCH_IGNORE_AMBIGUOUS = 1; # By default, ignore the fact that multiple records are
86             # found by search and return the first record found
87              
88             my $DETECT_ENCODING = 1;
89              
90             my $DEFAULT_IN_ENCODING = 'UTF-8,latin1';
91              
92             # By default, input encoding detected is used for output.
93             # -> the constant below is used if and only if:
94             # Inbound encoding is unknown
95             # No providing of out_encoding attribute (out_encoding takes precedence when provided)
96             my $DEFAULT_OUT_ENCODING = 'UTF-8';
97              
98             my $DEFAULT_ESCAPE_CHAR = '\\';
99             my $DEFAULT_QUOTE_CHAR = '"';
100              
101             #
102             # The code that workarounds $FIX_PERLMONKS_823214 (see below) makes sense only under plain
103             # Windows.
104             #
105             # "Plain" Windows?
106             # This code MUST NOT be executed under cygwin because cygwin uses unix line breaks. This is
107             # why we detect /mswin/. Would we detect /win/, we'd catch cygwin, too, and we don't want
108             # that.
109             #
110             my $OS_IS_PLAIN_WINDOWS = !! ($^O =~ /mswin/i);
111              
112             #
113             # Shall we fix the issue reported as #823214 in PerlMonks? See
114             # http://www.perlmonks.org/?node_id=823214
115             #
116             # In brief (in case the link above would be broken one day):
117             # Under Windows, output mode set to UTF-16LE produces line breaks made of octets "0d 0a 00",
118             # whereas it should be "0d 00 0a 00".
119             #
120             # The code also fixes UTF-16BE (but it was not tested).
121             #
122             my $FIX_PERLMONKS_823214 = 1;
123              
124              
125             # * **** *
126             # * CODE *
127             # * **** *
128              
129              
130             sub ERR_UNKNOWN_FIELD() { 0 }
131              
132             # Store meta-data about each column
133             struct ColData => {
134             field_name => '$',
135             header_text => '$',
136             description => '$',
137             dt_format => '$',
138             dt_locale => '$'
139             };
140              
141             #
142             # Enumeration of ef_type member below
143             # Alternative:
144             # use enum (...)
145             #
146             # But it is not also by default on my distro and installing a package for 3 constants, I find it
147             # a bit overkill!
148             #
149             my ($EF_LINK, $EF_FUNC, $EF_COPY) = 0..2;
150             struct ExtraField => {
151             ef_type => '$',
152             self_name => '$',
153             description => '$',
154              
155             check_field_existence => '$',
156              
157             # For when ef_type is set to $EF_LINK
158              
159             link_self_search => '$',
160             link_remote_obj => '$',
161             link_remote_search => '$',
162             link_remote_read => '$',
163             link_vlookup_opts => '%',
164              
165             # For when ef_type is set to $EF_FUNC
166              
167             func_sub => '$',
168              
169             # For when ef_type is set to $EF_COPY
170              
171             copy_source => '$',
172             copy_sub => '$'
173              
174             };
175              
176             my $SEARCH_VALIDATE_OPTIONS = {
177             value_if_not_found => {type => UNDEF | SCALAR, optional => 1},
178             value_if_found => {type => UNDEF | SCALAR, optional => 1},
179             value_if_ambiguous => {type => UNDEF | SCALAR, optional => 1},
180             ignore_ambiguous => {type => BOOLEAN, optional => 1},
181             case => {type => BOOLEAN, optional => 1},
182             trim => {type => BOOLEAN, optional => 1},
183             ignore_empty => {type => BOOLEAN, optional => 1},
184             ignore_accents => {type => BOOLEAN, optional => 1}
185             };
186              
187             sub _is_utf8 {
188 416     416   1007 my $e = shift;
189              
190 416 100       3836 return 1 if $e =~ m/^(utf-?8|ucs-?8)/i;
191 32         101 return 0;
192             }
193              
194             # To replace // in old perls: return the first non-undef value in provided list
195             sub _get_def {
196 12004     12004   25387 for (@_) {
197 18591 100       53806 return $_ if defined($_);
198             }
199 882         1896 return undef;
200             }
201              
202             sub _print {
203 15     15   32 my $self = shift;
204 15         27 my $t = shift;
205              
206 15         26 my $infoh = $self->{infoh};
207 15 50       55 return if ref $infoh ne 'GLOB';
208              
209 0         0 print($infoh $t);
210             }
211              
212             sub _printf {
213 17     17   31 my $self = shift;
214              
215 17         32 my $infoh = $self->{infoh};
216 17 50       64 return if ref $infoh ne 'GLOB';
217              
218 0         0 printf($infoh @_);
219             }
220              
221             sub _print_warning {
222 67     67   145 my $self = shift;
223 67         140 my $warning_message = shift;
224 67         129 my $dont_wrap = shift;
225              
226 67 100       232 my $msg = ($dont_wrap ? $warning_message : "$PKG: warning: $warning_message");
227 67 100       4925 carp $msg unless $self->{quiet};
228             }
229              
230             sub _close_inh {
231 314     314   621 my $self = shift;
232              
233 314 100       6037 close $self->{_inh} if $self->{_close_inh_when_finished};
234 314         1094 $self->{_inh} = undef;
235 314         1277 $self->{_close_inh_when_finished} = undef;
236             }
237              
238             sub _close_outh {
239 126     126   278 my $self = shift;
240              
241 126 50 66     18873 close $self->{outh} if defined($self->{outh}) and $self->{_close_outh_when_finished};
242 126         464 $self->{outh} = undef;
243 126         327 $self->{_close_outh_when_finished} = undef;
244             }
245              
246             sub _print_error {
247 81     81   636 my ($self, $error_message, $dont_stop, $err_code, $err_extra) = @_;
248              
249 81         294 my $msg = "$PKG: error: $error_message";
250              
251 81 100 100     432 if (defined($err_code) and !$self->{quiet} and $self->{croak_if_error}) {
      100        
252 5 50       21 if ($err_code == ERR_UNKNOWN_FIELD) {
253 5         11 my %f = %{$err_extra};
  5         29  
254 5         14 my @cols;
255 5         20 for my $n (keys %f) {
256 15         41 $cols[$f{$n}] = $n;
257             }
258 5         19 $self->_print($self->get_in_file_disp() . " column - field name correspondance:\n");
259 5         18 $self->_print("COL # FIELD\n");
260 5         15 $self->_print("----- -----\n");
261 5         16 for my $i (0..$#cols) {
262 17 100       58 $self->_printf("%05d %s\n", $i, (defined($cols[$i]) ? $cols[$i] : ''));
263             }
264             } else {
265 0         0 confess "Unknown error code: '$err_code'\n";
266             }
267             }
268              
269 81 100 100     380 if ($self->{croak_if_error} and !$dont_stop) {
270 30         119 $self->_close_read(1);
271 30         123 $self->_close_inh();
272 30         101 $self->_close_outh();
273 30         160 $self->_status_reset(1);
274 30         5202 croak $msg;
275             }
276 51         196 $self->_print_warning($msg, 1);
277             }
278              
279             #
280             # Return the string passed in argument with all accents removed from characters.
281             # Do it in a rather general and reliable way, not tied to latin1.
282             # Tested on latin1 and latin2 character sets.
283             #
284             # Credits:
285             # http://stackoverflow.com/questions/17561839/remove-accents-from-accented-characters
286             #
287             sub remove_accents {
288 3144     3144 1 29323 validate_pos(@_, {type => SCALAR});
289              
290 3144         8564 my $s = $_[0];
291 3144         13584 my $r = NFKD($s);
292 3144         8753 $r =~ s/\p{Nonspacing_Mark}//g;
293 3144         7763 return $r;
294             }
295              
296             sub _detect_csv_sep {
297 255     255   572 my $ST_OUTSIDE = 0;
298 255         500 my $ST_INSIDE = 1;
299              
300 255         779 my ($self, $escape_char, $quote_char, $sep) = @_;
301              
302 255         553 my $_debugh = $self->{_debugh};
303 255         559 my $inh = $self->{_inh};
304 255         502 my $_debug = $self->{_debug};
305              
306 255         526 delete $self->{_inh_header};
307              
308 255 100       823 $escape_char = $DEFAULT_ESCAPE_CHAR unless defined($escape_char);
309              
310 255 50       785 $self->_print_error("illegal \$escape_char: '$escape_char' (length >= 2)"), return 0
311             if length($escape_char) >= 2;
312              
313 255 50       781 $self->_print_error("$PKG: error: illegal \$quote_char '$quote_char' (length >= 2)"), return 0
314             if length($quote_char) >= 2;
315              
316 255 50       782 $escape_char = '--' if $escape_char eq '';
317 255 50       714 $quote_char = '--' if $quote_char eq '';
318              
319             # FIXME (?)
320             # Avoid inlined magic values for separator auto-detection.
321             # Issue is, as you can see below, the behavior is also hard-coded and not straightforward to
322             # render 'tunable' ("," and ";" take precedence over "\t").
323 255         1286 my %Seps = (
324             ";" => 0,
325             "," => 0,
326             "\t" => 0
327             );
328              
329 255         3210 my $h = <$inh>;
330 255 50       2860 if ($self->{inh_is_stdin}) {
331 0         0 $self->{_inh_header} = $h;
332 0 0       0 print($_debugh "Input is STDIN => saving header line to re-read it " .
333             "later (in-memory)\n") if $_debug;
334             } else {
335 255         1800 seek $inh, 0, SEEK_SET;
336 255 50       860 print($_debugh "Input is not STDIN => using seek function to rewind " .
337             "read head after header line reading\n") if $_debug;
338             }
339              
340 255         708 chomp $h;
341 255         520 my $status = $ST_OUTSIDE;
342 255         653 my $l = length($h);
343 255         481 my $c = 0;
344 255         782 while ($c < $l) {
345 4769         7927 my $ch = substr($h, $c, 1);
346 4769         6711 my $chnext = '';
347 4769 100       10870 $chnext = substr($h, $c + 1, 1) if ($c < $l - 1);
348 4769 100       10683 if ($status == $ST_INSIDE) {
    50          
349 1521 50 66     4169 if ($ch eq $escape_char and $chnext eq $quote_char) {
    100          
350 0         0 $c += 2;
351             } elsif ($ch eq $quote_char) {
352 197         307 $status = $ST_OUTSIDE;
353 197         435 $c++;
354             } else {
355 1324         2585 $c++;
356             }
357             } elsif ($status == $ST_OUTSIDE) {
358 3248 50 33     10777 if ($ch eq $escape_char and ($chnext eq $quote_char or
    100 66        
    100          
359             exists $Seps{$chnext})) {
360 0         0 $c += 2;
361             } elsif (exists $Seps{$ch}) {
362 930         1420 $Seps{$ch}++;
363 930         1990 $c++;
364             } elsif ($ch eq $quote_char) {
365 197         296 $status = $ST_INSIDE;
366 197         425 $c++;
367             } else {
368 2121         4746 $c++;
369             }
370             }
371             }
372              
373 255 100 100     1852 if ($Seps{";"} == 0 and $Seps{","} >= 1) {
    100 66        
    50 33        
      33        
374 144         348 $$sep = ",";
375 144         761 return 1;
376             } elsif ($Seps{","} == 0 and $Seps{";"} >= 1) {
377 102         220 $$sep = ";";
378 102         499 return 1;
379             } elsif ($Seps{","} == 0 and $Seps{";"} == 0 and $Seps{"\t"} >= 1) {
380 0         0 $$sep = "\t";
381 0         0 return 1;
382             } else {
383              
384             # Check the case where there is one unique column, in which case,
385             # assume comma separator.
386 9         32 my $h_no_accnt = remove_accents($h);
387 9 100       64 if ($h_no_accnt =~ m/^[[:alnum:]_]+$/i) {
388 3         8 $$sep = ",";
389 3         15 return 1;
390             }
391              
392 6         14 $$sep = "";
393 6 50       14 if ($_debug) {
394 0         0 for my $k (keys %Seps) {
395 0         0 print($_debugh "\$Seps{'$k'} = $Seps{$k}\n");
396             }
397             }
398 6         31 return 0;
399             }
400             }
401              
402             sub _reopen_input {
403 633     633   1161 my $self = shift;
404              
405 633         1191 my $in_file = $self->{in_file};
406              
407 633         972 my $inh;
408 633 50       15399 if (!open($inh, "<", $in_file)) {
409 0         0 $self->_print_error("unable to open file '$in_file': $!");
410 0         0 return undef;
411             }
412 633 50       2252 if (!$self->{_leave_encoding_alone}) {
413              
414             confess "Oups! _inh_encoding_string undef?"
415 633 50       1690 unless defined($self->{_inh_encoding_string});
416              
417 633         4315 binmode $inh, $self->{_inh_encoding_string};
418             }
419              
420 633         30469 return $inh;
421             }
422              
423             # Abstraction layer, not useful Today, could bring added value when looking into Text::CSV I/O
424             sub _mygetline {
425 9078     9078   19491 my ($csvobj, $fh) = @_;
426              
427 9078         210971 return $csvobj->getline($fh);
428             }
429              
430             sub _detect_escape_char {
431 304     304   960 my ($self, $quote_char, $sep_char, $ref_escape_char, $ref_is_always_quoted) = @_;
432              
433 304         670 my $in_file = $self->{in_file};
434 304         683 my $_debug = $self->{_debug};
435 304         579 my $_debugh = $self->{_debugh};
436              
437 304         707 $$ref_escape_char = $DEFAULT_ESCAPE_CHAR;
438 304         582 $$ref_is_always_quoted = undef;
439              
440 304 100       862 if ($self->{_int_one_pass}) {
441 13         37 return;
442             }
443              
444 291         881 $self->_register_pass("detect escape character");
445              
446 291         534 my $qesc = 0;
447 291         909 my $inh = $self->_reopen_input();
448 291 50       951 if (defined($inh)) {
449 291         4363 while (my $l = <$inh>) {
450 7088         17838 chomp $l;
451              
452             # Very heuristic criteria...
453             # Tant pis.
454 7088 100       31105 $qesc = 1 if $l =~ m/(?<!$sep_char)$quote_char$quote_char(?!$sep_char)/;
455              
456             }
457 291         2732 close $inh;
458             }
459 291 100       992 if ($qesc) {
460 7         20 $$ref_escape_char = '"';
461             } else {
462 284         735 $$ref_escape_char = '\\' ;
463             }
464              
465 291         567 my $is_always_quoted = 0;
466 291         842 $inh = $self->_reopen_input();
467 291 50       1212 if (defined($inh)) {
468 291         3694 my $csv = Text::CSV->new({sep_char => $sep_char,
469             allow_whitespace => 1, binary => 1, auto_diag => 0,
470             quote_char => $quote_char, escape_char => $$ref_escape_char,
471             keep_meta_info => 1,
472             allow_loose_escapes => 1});
473 291         65381 $is_always_quoted = 1;
474 291         1004 while (my $ar = _mygetline($csv, $inh)) {
475 1628         57281 my @a = @{$ar};
  1628         4629  
476 1628         2907 my $e = $#a;
477 1628         3473 for my $i (0..$e) {
478 11018 100       91320 $is_always_quoted = 0 unless $csv->is_quoted($i);
479             }
480 1628 100       17517 last unless $is_always_quoted;
481             }
482 291 100       2473 my $is_ok = ($csv->eof() ? 1 : 0);
483 291         5639 close $inh;
484             }
485              
486 291 50       1229 print($_debugh " is_always_quoted: $is_always_quoted\n") if $_debug;
487 291         616 $$ref_is_always_quoted = $is_always_quoted;
488              
489 291         1317 return;
490             }
491              
492             sub _register_pass {
493 950     950   2193 my ($self, $pass_name) = @_;
494 950         1756 my $_debug = $self->{_debug};
495 950         1642 my $_debugh = $self->{_debugh};
496              
497 950         1916 $self->{_pass_count}++;
498              
499 950 50       2781 return unless $_debug;
500              
501 0         0 print($_debugh "Pass #" . $self->{_pass_count} . " ($pass_name) done\n");
502             }
503              
504             sub _update_in_mem_record_count {
505 501     501   1407 my ($self, $nonexistent_arg) = @_;
506 501         1062 my $_debug = $self->{_debug};
507 501         1061 my $_debugh = $self->{_debugh};
508              
509 501 50       1427 confess "Hey! what is this second argument?" if defined($nonexistent_arg);
510              
511 501         910 my $new_count = $#{$self->{_flat}} + 1;
  501         1381  
512              
513 501         1002 my $updated_max = 0;
514 501 100       1776 if ($new_count > $self->get_max_in_mem_record_count()) {
515 153         637 $self->_set_max_in_mem_record_count($new_count);
516 153         310 $updated_max = 1;
517             }
518              
519 501         1528 $self->{_in_mem_record_count} = $new_count;
520 501 50       1687 if ($_debug) {
521 0         0 print($_debugh "_in_mem_record_count updated, set to $new_count");
522 0 0       0 print($_debugh " (also updated max)") if $updated_max;
523 0         0 print($_debugh "\n");
524             }
525             }
526              
527             sub _detect_inh_encoding {
528 313     313   1377 my ($self, $enc, $via, $in_file, $detect_enc) = @_;
529 313         785 my $_debug = $self->{_debug};
530 313         665 my $_debugh = $self->{_debugh};
531              
532 313 100 66     1312 $enc = $DEFAULT_IN_ENCODING if !defined($enc) or $enc eq '';
533              
534 313         2405 my @encodings = split(/\s*,\s*/, $enc);
535              
536 313 50       1158 confess "Oups! No encoding to try?" if $#encodings < 0;
537              
538 313 50       972 print($_debugh "[ST] _detect_inh_encoding(): start\n") if $_debug;
539              
540 313         582 my $wrn = 0;
541 313         944 my $m;
542             my $m0;
543 313         0 my $ee;
544 313         808 for my $e (@encodings) {
545 325         693 $ee = $e;
546 325         856 my $viadef = _get_def($via, '');
547 325         1113 $m = ":encoding($e)$viadef";
548 325 100       1081 $m0 = $m unless defined($m0);
549              
550 325 100       905 last unless $detect_enc;
551              
552 313 50 33     1843 confess "Oups! in_file not defined?" if !defined($in_file) or $in_file eq '';
553              
554 313 50       871 print($_debugh " Checking encoding '$e' / '$m'\n") if $_debug;
555 313         617 $wrn = 0;
556              
557 313         1426 $self->_register_pass("check $e encoding");
558              
559 313         643 my $utf8_bom = 0;
560 313 100       976 if (_is_utf8($e)) {
561 289 50       7500 if (open my $fh, '<:raw', $in_file) {
562 289         732 my $bom;
563 289         5712 read $fh, $bom, 3;
564 289 100 66     1963 if (length($bom) == 3 and $bom eq "\xef\xbb\xbf") {
565 12 100       84 if (!defined($via)) {
566 10         35 $m .= ":via(File::BOM)";
567             }
568             }
569 289         3955 close $fh;
570             }
571             }
572              
573 313         753 my $inh;
574 313 50       6191 if (!open($inh, "<", $in_file)) {
575 0         0 $self->_print_error("unable to open file '$in_file': $!");
576 0         0 return ($encodings[0], $m0);
577             }
578 15     15   116 binmode $inh, $m;
  15         32  
  15         102  
  313         4070  
579              
580             # TURN OFF WARNINGS OUTPUT
581              
582             {
583 313         42788 local $SIG{__WARN__} = sub {
584 51     51   746 $wrn++;
585             # Uncomment only for debug!
586             # Otherwise you'll get quite a good deal of output at each execution :-)
587             # print(STDERR @_);
588 313         2604 };
589 313         4928 while (<$inh>) { }
590             }
591              
592             # WARNINGS ARE BACK ON
593              
594 313         18097 close $inh;
595 313 50       1212 print($_debugh " '$m' counts $wrn warning(s)\n") if $_debug;
596              
597 313 100       1608 last if $wrn == 0;
598             }
599              
600 313 50       1086 if ($wrn >= 1) {
601 0         0 $self->_print_warning("encoding warnings encountered during initial check, " .
602             "using '$encodings[0]'");
603 0         0 return ($encodings[0], $m0);
604             }
605              
606 313 50       958 confess "Oups! undef encoding string?" unless defined($m);
607              
608 313 50       873 print($_debugh " Detected encoding string '$ee' / '$m'\n") if $_debug;
609 313         1561 return ($ee, $m);
610             }
611              
612             #
613             # Each of these functions brings status to the next value (current status + 1).
614             # Each of these functions returns 0 if an error occured, 1 if all good
615             #
616             my @status_forward_functions = (
617             "_S1_init_input", # To go from S0 to S1
618             "_S2_init_fields_from_header", # To go form S1 to S2
619             "_S3_init_fields_extra", # To go from S2 to S3
620             "_S4_read_all_in_mem", # To go from S3 to S4
621             );
622              
623             sub _status_reset {
624 487     487   1010 my $self = shift;
625              
626 487         3754 validate_pos(@_, {type => SCALAR, optional => 1});
627 487         2042 my $called_from_print_error = _get_def($_[0], 0);
628              
629 487 100 100     2351 if (defined($self->{_status}) and $self->{_status} == 4) {
630 18 100       65 unless ($called_from_print_error) {
631 16         39 my $msg = "in-memory CSV content discarded, will have to re-read input";
632 16         65 $self->_print_warning($msg);
633             }
634 18         93 $self->{_flat} = [ ];
635 18         173 $self->_update_in_mem_record_count();
636             }
637              
638 487         1193 $self->{_status} = 0;
639 487 100       1446 return 0 if $called_from_print_error;
640 457         1328 return $self->_status_forward('S1');
641             }
642              
643             sub _status_forward {
644 4017     4017   6996 my $self = shift;
645              
646 4017         10067 return $self->___status_move(@_, 1);
647             }
648              
649             sub _status_backward {
650 139     139   269 my $self = shift;
651              
652 139         399 return $self->___status_move(@_, -1);
653             }
654              
655             # You should not call ___status_move() in the code, that is why the name is prefixed with 3
656             # underscores! Only _status_forward and _status_backward should call it.
657             sub ___status_move {
658 4156     4156   9007 my ($self, $target, $step) = @_;
659              
660 4156         7854 my $_debug = $self->{_debug};
661 4156         7283 my $_debugh = $self->{_debugh};
662              
663 4156 50 66     25252 if (!defined($step) or ($step != -1 and $step != 1)) {
      33        
664 0         0 confess "Oups! \$step has a wrong value: '$step'";
665             }
666              
667 4156         6741 my $n;
668 4156 50       20288 confess "Oups! illegal status string: '$target'" unless ($n) = $target =~ m/^S(\d)$/;
669              
670 4156 100       11469 if ($self->{_read_in_progress}) {
671 1         6 $self->_print_error("illegal call while read is in progress, " .
672             "would lead to infinite recursion", 0);
673 0         0 confess "Aborted.";
674             }
675              
676 4155 100       10067 if ($step == -1) {
677 139 100       527 if ($n < $self->{_status}) {
678 19 100       76 if ($self->{_status} == 4) {
679 16 50       72 print($_debugh "[ST] Requested status $n but will go to status 0\n") if $_debug;
680 16         63 return $self->_status_reset();
681             }
682 3         6 $self->{_status} = $n ;
683 3 50       9 print($_debugh "[ST] New status: ". $self->{_status} . "\n") if $_debug;
684             }
685 123         446 return 1;
686             }
687              
688 4016 100       11991 if ($self->{_status} < $n) {
689 856 50       2278 print($_debugh "[ST] Current status: ". $self->{_status} . "\n") if $_debug;
690             }
691              
692 4016 100 100     13962 if ($self->{_status} <= 1 and $n >= 2 and $self->{_int_one_pass} and
      100        
      100        
693             $self->get_pass_count() >= 1) {
694 12         26 my $msg = "one_pass set, unable to read input again";
695 12 50       54 $self->_print_error($msg), return 0 if $self->{one_pass};
696 0 0       0 $self->_print_warning($msg) if !$self->{one_pass};
697             }
698              
699 4004         10264 while ($self->{_status} < $n) {
700              
701 1326         3229 my $funcname = $status_forward_functions[$self->{_status}];
702 1326 50       3316 confess "Oups! Unknown status?" unless defined($funcname);
703              
704 1326 50       3175 print($_debugh "[ST] Now executing $funcname\n") if $_debug;
705              
706 1326 50       6403 if (my $member_function = $self->can($funcname)) {
707 1326 100       3958 return 0 unless $self->$member_function();
708             } else {
709 0         0 confess "Could not find method $funcname in $PKG!";
710             }
711              
712 1307         3121 $self->{_status} += $step;
713 1307 50       5235 print($_debugh "[ST] New status: ". $self->{_status} . "\n") if $_debug;
714             }
715              
716 3985         12473 return 1;
717             }
718              
719             sub new {
720 326     326 1 317556 my ($class, @args) = @_;
721              
722 326         42533 @args = validate(@args,
723             { in_file => {type => SCALAR, optional => 1},
724             infoh => {type => UNDEF | GLOBREF, default => \*STDERR, optional => 1},
725             verbose => {type => BOOLEAN, default => 0, optional => 1},
726             quiet => {type => BOOLEAN, optional => 1},
727             croak_if_error => {type => BOOLEAN, default => 1, optional => 1},
728             inh => {type => GLOBREF, optional => 1},
729             in_csvobj => {type => OBJECT, optional => 1},
730             sep_char => {type => SCALAR, optional => 1},
731             quote_char => {type => SCALAR, optional => 1},
732             escape_char => {type => SCALAR, optional => 1},
733             has_headers => {type => BOOLEAN, default => 1, optional => 1},
734             out_has_headers => {type => UNDEF | BOOLEAN, default => undef, optional => 1},
735             fields_ar => {type => ARRAYREF, optional => 1},
736             fields_hr => {type => HASHREF, optional => 1},
737             fields_column_names => {type => ARRAYREF, optional => 1},
738             search_case => {type => SCALAR, optional => 1},
739             search_trim => {type => SCALAR, optional => 1},
740             search_ignore_empty => {type => SCALAR, optional => 1},
741             search_ignore_accents => {type => SCALAR, optional => 1},
742             search_ignore_ambiguous => {type => SCALAR, optional => 1},
743             search_value_if_not_found => {type => SCALAR, optional => 1},
744             search_value_if_found => {type => SCALAR, optional => 1},
745             search_value_if_ambiguous => {type => SCALAR, optional => 1},
746             walker_hr => {type => CODEREF, optional => 1},
747             walker_ar => {type => CODEREF, optional => 1},
748             read_post_update_hr => {type => CODEREF, optional => 1},
749             write_filter_hr => {type => CODEREF, optional => 1},
750             out_filter => {type => CODEREF, optional => 1},
751             write_fields => {type => ARRAYREF, optional => 1},
752             out_fields => {type => ARRAYREF, optional => 1},
753             out_file => {type => SCALAR, optional => 1},
754             out_always_quote => {type => BOOLEAN, optional => 1},
755             out_sep_char => {type => SCALAR, optional => 1},
756             out_quote_char => {type => SCALAR, optional => 1},
757             out_escape_char => {type => SCALAR, optional => 1},
758             out_dates_format => {type => SCALAR, optional => 1},
759             out_dates_locale => {type => SCALAR, optional => 1},
760             encoding => {type => SCALAR, optional => 1},
761             via => {type => SCALAR, optional => 1},
762             out_encoding => {type => SCALAR, optional => 1},
763             dont_mess_with_encoding => {type => BOOLEAN, optional => 1},
764             one_pass => {type => BOOLEAN, optional => 1},
765             no_undef => {type => BOOLEAN, optional => 1},
766             fields_dates => {type => ARRAYREF, optional => 1},
767             fields_dates_auto => {type => BOOLEAN, optional => 1},
768             dates_formats_to_try => {type => ARRAYREF, optional => 1},
769             dates_formats_to_try_supp => {type => ARRAYREF, optional => 1},
770             dates_ignore_trailing_chars => {type => BOOLEAN, optional => 1},
771             dates_search_time => {type => BOOLEAN, optional => 1},
772             dates_locales => {type => SCALAR, optional => 1},
773             out_utf8_bom => {type => SCALAR, optional => 1},
774             dates_zeros_ok => {type => SCALAR, default => 1, optional => 1},
775             _debug => {type => BOOLEAN, default => 0, optional => 1},
776             _debug_read => {type => BOOLEAN, default => 0, optional => 1},
777             _debug_extra_fields => {type => BOOLEAN, optional => 1},
778             _debugh => {type => UNDEF | GLOBREF, optional => 1}
779             }
780             );
781              
782 322         7777 my $self = { @args };
783              
784 322         916 my @fields = keys %{$self};
  322         1651  
785              
786             # croak_if_error
787              
788 322         1050 my $croak_if_error = $self->{croak_if_error};
789              
790             # verbose and _debug management
791              
792 322 50       1471 $self->{_debugh} = $self->{infoh} if !defined($self->{_debugh});
793 322 50       1109 $self->{_debug} = 1 if $ALWAYS_DEBUG;
794 322         701 my $_debug = $self->{_debug};
795 322 50       909 $self->{verbose} = 1 if $_debug;
796 322         690 my $verbose = $self->{verbose};
797              
798 322         704 my $_debugh = $self->{_debugh};
799              
800 322         763 bless $self, $class;
801              
802             # fields_ar, fields_hr
803              
804 322 100       1373 if (defined($self->{fields_ar}) +
805             defined($self->{fields_hr}) +
806             defined($self->{fields_column_names})
807             >= 2) {
808 1         4 $self->_print_error("mixed use of fields_ar, fields_hr and fields_column_names. " .
809             "Use one at a time.");
810             }
811 322 100 100     1316 if (defined($self->{fields_ar}) and !defined($self->{fields_hr})) {
812 2         4 my @f = @{$self->{fields_ar}};
  2         6  
813 2         5 my %h;
814 2         5 for my $e (@f) {
815 6         18 $h{$e} = "^$e\$";
816             }
817 2         6 $self->{fields_hr} = \%h;
818             }
819 322 100       1020 if (!$self->{has_headers}) {
820 13 100       38 if (defined($self->{fields_ar})) {
821 1         4 $self->_print_error("fields_ar irrelevant if CSV file has no headers");
822 1         8 return undef;
823             }
824 12 100       35 if (defined($self->{fields_hr})) {
825 1         3 $self->_print_error("fields_hr irrelevant if CSV file has no headers");
826 1         10 return undef;
827             }
828             }
829              
830             # in_file or inh
831              
832 320         921 $self->{_flat} = [ ];
833              
834 320         803 $self->{_read_update_after_hr} = { };
835 320         794 $self->{_write_update_before_hr} = { };
836              
837 320         1364 $self->_update_in_mem_record_count();
838              
839 320 100       1070 return undef unless $self->_status_reset();
840              
841 311 50       899 $self->_debug_show_members() if $_debug;
842              
843 311 100       1062 if ($self->{dates_zeros_ok}) {
844             $self->{_refsub_is_datetime_empty} = sub {
845 9626     9626   18500 my $v = $_[0];
846 9626 100       36790 if ($v !~ m/[1-9]/) {
847 4201 100       12200 return 1 if $v =~ m/^[^0:]*0+[^0:]+0+[^0:]+0+/;
848             }
849 9623         37148 return 0;
850             }
851 309         1937 }
852              
853 311         2190 return $self;
854             };
855              
856             #
857             # Return 0 if error, 1 if all good
858             #
859             # Do all low level activities associated to input:
860             # I/O init
861             # Detect encoding
862             # Detect CSV separator
863             # Detect escape character
864             #
865             sub _S1_init_input {
866 463     463   1087 my $self = shift;
867              
868 463         1011 my $croak_if_error = $self->{croak_if_error};
869 463         914 my $_debug = $self->{_debug};
870 463         917 my $_debugh = $self->{_debugh};
871              
872 463 100       1368 $self->{in_file} = '' unless defined($self->{in_file});
873 463         1160 $self->{_close_inh_when_finished} = 0;
874              
875             $self->{_leave_encoding_alone} = $self->{dont_mess_with_encoding}
876 463 50       1342 if defined($self->{dont_mess_with_encoding});
877              
878 463         1596 $self->{_int_one_pass} = _get_def($self->{one_pass}, 0);
879 463         1118 my $in_file_disp;
880              
881              
882             #
883             # LOW LEVEL INIT STEP 1 OF 4
884             #
885             # Manage I/O (= in most cases, open input file...)
886             #
887              
888 463 100       1259 if (defined($self->{inh})) {
889 4 50       11 $self->{_leave_encoding_alone} = 1 unless defined($self->{dont_mess_with_encoding});
890 4         10 $in_file_disp = _get_def($self->{in_file}, '<?>');
891 4 50       13 $self->{_int_one_pass} = 1 unless defined($self->{one_pass});
892 4         6 $self->{_inh} = $self->{inh};
893             } else {
894 459 50       1568 $self->{_leave_encoding_alone} = 0 unless defined($self->{dont_mess_with_encoding});
895 459         1013 my $in_file = $self->{in_file};
896 459         769 my $inh;
897 459 50       1190 if ($in_file eq '') {
898 0         0 $inh = \*STDIN;
899 0         0 $self->{inh_is_stdin} = 1;
900 0 0       0 $self->{_int_one_pass} = 1 unless defined($self->{one_pass});
901 0         0 $in_file_disp = '<stdin>';
902             } else {
903 459 100       21694 if (!open($inh, '<', $in_file)) {
904 3         42 $self->_print_error("unable to open file '$in_file': $!");
905 3         48 return 0;
906             }
907 456         1412 $in_file_disp = $in_file;
908 456         1173 $self->{_close_inh_when_finished} = 1;
909             }
910 456         1226 $self->{_inh} = $inh;
911             }
912              
913 460 50       1262 confess "Oups! in_file_disp not defined?" unless defined($in_file_disp);
914 460         1228 $self->{_in_file_disp} = $in_file_disp;
915              
916              
917             #
918             # LOW LEVEL INIT STEP 2 OF 4
919             #
920             # "Detection" of encoding
921             #
922             # WARNING
923             # As explained in the manual, it is a very partial and limited detection...
924             #
925              
926 460 100       1405 unless ($self->{_leave_encoding_alone}) {
927 456 100       1323 unless ($self->{_init_input_already_called}) {
928             my ($e, $m) = $self->_detect_inh_encoding($self->{encoding}, $self->{via},
929 313 100       2098 $self->{in_file}, ($self->{_int_one_pass} ? 0 : $DETECT_ENCODING));
930 313         1203 $self->{_inh_encoding} = $e;
931 313         859 $self->{_inh_encoding_string} = $m;
932             }
933              
934 456         3848 binmode $self->{_inh}, $self->{_inh_encoding_string};
935             print($_debugh "Input encoding: '" . $self->{_inh_encoding} . "' / '" .
936 456 50       23439 $self->{_inh_encoding_string} . "'\n") if $_debug;
937              
938             }
939              
940 460 100       1707 $self->{out_file} = '' unless defined($self->{out_file});
941              
942              
943             #
944             # LOW LEVEL INIT STEP 3 OF 4
945             #
946             # Detection of CSV separator and escape character
947             #
948              
949 460         872 my $sep_char;
950 460         1047 my $escape_char = $self->{escape_char};
951 460 100       1535 $self->{quote_char} = $DEFAULT_QUOTE_CHAR unless defined($self->{quote_char});
952 460         968 my $quote_char = $self->{quote_char};
953 460 100       1362 unless (defined($self->{in_csvobj})) {
954 455 100       1383 if (defined($self->{sep_char})) {
955 200         442 $sep_char = $self->{sep_char};
956 200 50       568 print($_debugh "-- $in_file_disp: CSV separator set to \"") if $_debug;
957             } else {
958             # The test below (on _init_input_already_called) shoud be useless.
959             # Left for the sake of robustness.
960 255 50       770 unless ($self->{_init_input_already_called}) {
961 255 100       1099 if (!$self->_detect_csv_sep($escape_char, $quote_char, \$sep_char)) {
962 6         29 $self->_print_error("'$in_file_disp': cannot detect CSV separator");
963 0         0 return 0;
964             }
965 249 50       746 print($_debugh "-- $in_file_disp: CSV separator detected to \"") if $_debug;
966 249         686 $self->{sep_char} = $sep_char;
967             }
968             }
969 449 0       1168 print($_debugh ($sep_char eq "\t" ? '\t' : $sep_char) . "\"\n") if $_debug;
    50          
970              
971 449         796 my $is_always_quoted;
972 449 100       1316 unless (defined($self->{escape_char})) {
973 304         1346 $self->_detect_escape_char($quote_char, $sep_char, \$escape_char, \$is_always_quoted);
974 304         1177 $self->{escape_char} = $escape_char;
975 304         751 $self->{_is_always_quoted} = $is_always_quoted;
976             }
977              
978 449         3655 $self->{_in_csvobj} = Text::CSV->new({sep_char => $sep_char,
979             allow_whitespace => 1, binary => 1, auto_diag => 0,
980             quote_char => $quote_char, escape_char => $escape_char,
981             allow_loose_escapes => 1});
982 449 50       88376 unless (defined($self->{_in_csvobj})) {
983 0         0 $self->_print_error("error creating input Text::CSV object");
984 0         0 return 0;
985             }
986              
987             } else {
988 5         9 $self->{_in_csvobj} = $self->{in_csvobj};
989             }
990              
991 454         1272 $self->{_init_input_already_called} = 1;
992              
993 454         1598 return 1;
994             }
995              
996             sub get_in_file_disp {
997 3640     3640 1 6004 my $self = shift;
998              
999 3640         21415 validate_pos(@_);
1000              
1001 3640         10961 my $in_file_disp = _get_def($self->{_in_file_disp}, '?');
1002 3640         7890 return $in_file_disp;
1003             }
1004              
1005             sub get_sep_char {
1006 0     0 1 0 my $self = shift;
1007              
1008 0         0 validate_pos(@_);
1009              
1010 0         0 return $self->{sep_char};
1011             }
1012              
1013             sub get_escape_char {
1014 9     9 1 255 my $self = shift;
1015              
1016 9         79 validate_pos(@_);
1017              
1018 9         84 return $self->{escape_char};
1019             }
1020              
1021             sub get_in_encoding {
1022 27     27 1 6035 my $self = shift;
1023              
1024 27         208 validate_pos(@_);
1025              
1026 27         97 return _get_def($self->{_inh_encoding}, '');
1027             }
1028              
1029             sub get_is_always_quoted {
1030 15     15 1 435 my $self = shift;
1031              
1032 15         138 validate_pos(@_);
1033              
1034 15         130 return $self->{_is_always_quoted};
1035             }
1036              
1037             sub get_pass_count {
1038 43     43 1 1156 my $self = shift;
1039              
1040 43         275 validate_pos(@_);
1041              
1042 43         170 return _get_def($self->{_pass_count}, 0);
1043             }
1044              
1045             sub get_in_mem_record_count {
1046 0     0 1 0 my $self = shift;
1047              
1048 0         0 validate_pos(@_);
1049              
1050 0         0 return ($self->{_in_mem_record_count}, 0);
1051             }
1052              
1053             sub get_max_in_mem_record_count {
1054 504     504 1 2270 my $self = shift;
1055              
1056 504         3481 validate_pos(@_);
1057              
1058 504         2620 return _get_def($self->{_max_in_mem_record_count}, 0);
1059             }
1060              
1061             sub _set_max_in_mem_record_count {
1062 153     153   341 my $self = shift;
1063              
1064 153         1697 validate_pos(@_, {type => SCALAR});
1065              
1066 153         619 $self->{_max_in_mem_record_count} = $_[0];
1067             }
1068              
1069             sub get_fields_names {
1070 19     19 1 3909 my $self = shift;
1071              
1072 19         162 validate_pos(@_);
1073              
1074 19 50       73 return () unless $self->_status_forward('S3');
1075 19         39 return @{$self->{_columns}};
  19         109  
1076             }
1077              
1078             sub get_field_name {
1079 1     1 1 609 my $self = shift;
1080              
1081 1         12 validate_pos(@_, {type => SCALAR});
1082              
1083 1         3 my ($n) = @_;
1084              
1085 1 50       4 return undef unless $self->_status_forward('S3');
1086 1         5 return $self->{_columns}->[$n];
1087             }
1088              
1089             sub get_coldata {
1090 0     0 1 0 my $self = shift;
1091              
1092 0         0 validate_pos(@_);
1093              
1094 0 0       0 return () unless $self->_status_forward('S3');
1095 0         0 my @ret;
1096 0         0 for (@{$self->{_coldata}}) {
  0         0  
1097 0         0 push @ret, [
1098             $_->field_name,
1099             $_->header_text,
1100             $_->description,
1101             $_->dt_format,
1102             $_->dt_locale];
1103             }
1104              
1105 0         0 return @ret;
1106             }
1107              
1108             sub get_stats {
1109 2     2 1 12 my $self = shift;
1110              
1111 2         13 validate_pos(@_);
1112              
1113 2 50       9 return () unless defined($self->{_stats});
1114 2         3 return %{$self->{_stats}};
  2         10  
1115             }
1116              
1117             sub _debug_show_members {
1118 0     0   0 my ($self) = @_;
1119 0         0 my $_debugh = $self->{_debugh};
1120 0 0       0 my @a = @{$self->{fields_ar}} if defined($self->{fields_ar});
  0         0  
1121 0 0       0 my @c = @{$self->{fields_column_names}} if defined($self->{fields_column_names});
  0         0  
1122 0 0       0 my %h = %{$self->{fields_hr}} if defined($self->{fields_hr});
  0         0  
1123              
1124 0         0 print($_debugh "-- _debug_show_members() start\n");
1125 0         0 print($_debugh " croak_if_error $self->{croak_if_error}\n");
1126 0         0 print($_debugh " verbose $self->{verbose}\n");
1127 0         0 print($_debugh " _debug $self->{_debug}\n");
1128 0         0 print($_debugh " _debug_read $self->{_debug_read}\n");
1129 0         0 print($_debugh " infoh $self->{infoh}\n");
1130 0         0 print($_debugh " _debugh $_debugh\n");
1131 0         0 print($_debugh " inh: $self->{_inh}\n");
1132 0         0 print($_debugh " in_file_disp " . $self->get_in_file_disp() . "\n");
1133 0         0 print($_debugh " _in_csvobj $self->{_in_csvobj}\n");
1134 0         0 print($_debugh " has_headers $self->{has_headers}\n");
1135 0         0 print($_debugh " fields_ar:\n");
1136 0         0 for my $e (@a) {
1137 0         0 print($_debugh " '$e'\n");
1138             }
1139 0         0 print($_debugh " fields_hr:\n");
1140 0         0 for my $e (keys %h) {
1141 0         0 print($_debugh " '$e' => '$h{$e}'\n");
1142             }
1143 0         0 print($_debugh " fields_column_names:\n");
1144 0         0 for my $e (@c) {
1145 0         0 print($_debugh " '$e'\n");
1146             }
1147 0         0 print($_debugh "-- _debug_show_members() end\n");
1148             }
1149              
1150             #
1151             # Check headers in CSV header line
1152             # Used to increase robustness by relying on header title rather than
1153             # column number.
1154             #
1155             # Return 1 if success (all fields found), 0 otherwise.
1156             #
1157             sub _process_header {
1158 12     12   27 my $self = shift;
1159 12         23 my @headers = @{shift(@_)};
  12         45  
1160 12         25 my %fields_h = %{shift(@_)};
  12         80  
1161 12         31 my $retval = shift;
1162              
1163 12         22 my @tmp = keys %{$retval};
  12         30  
1164              
1165 12         38 my $in_file_disp = $self->get_in_file_disp();
1166              
1167 12 50       46 confess '$_[4] must be an empty by-ref hash' if $#tmp >= 0;
1168              
1169 12         23 my $e = 0;
1170 12         45 for my $k (keys %fields_h) {
1171 48         105 my $v = $fields_h{$k};
1172              
1173 48     462   252 my @all_idx = indexes { /$v/i } @headers;
  462         3507  
1174 48 50       210 if ($#all_idx >= 1) {
1175 0         0 $self->_print_error("file $in_file_disp: " .
1176             "more than one column matches the criteria '$v'");
1177 0         0 $e++;
1178             }
1179 48     258   223 my $idx = first_index { /$v/i } @headers;
  258         2172  
1180 48 50       313 if ($idx < 0) {
1181 0         0 $self->_print_error("file $in_file_disp: unable to find field '$v'");
1182 0         0 $e++;
1183             } else {
1184 48         146 $retval->{$k} = $idx;
1185             }
1186             }
1187              
1188 12 50       82 return ($e >= 1 ? 0 : 1);
1189             }
1190              
1191             sub set_walker_hr {
1192 2     2 1 1928 my $self = shift;
1193 2         22 validate_pos(@_, {type => UNDEF | CODEREF, optional => 1});
1194              
1195 2         7 my ($walker_hr) = @_;
1196              
1197 2 50       6 return undef unless $self->_status_forward('S2');
1198 2 50       6 return undef unless $self->_status_backward('S2');
1199 2         6 $self->{walker_hr} = $walker_hr;
1200              
1201 2         5 return $self;
1202             }
1203              
1204             sub set_walker_ar {
1205 2     2 1 3245 my $self = shift;
1206 2         31 validate_pos(@_, {type => UNDEF | CODEREF, optional => 1});
1207              
1208 2         8 my ($walker_ar) = @_;
1209              
1210 2 50       8 return undef unless $self->_status_forward('S2');
1211 2 50       7 return undef unless $self->_status_backward('S2');
1212 2         6 $self->{walker_ar} = $walker_ar;
1213              
1214 2         7 return $self;
1215             }
1216              
1217              
1218             # * *************************************** *
1219             # * BEGINNING OF DATE FORMAT DETECTION CODE *
1220             # * *************************************** *
1221              
1222              
1223             #
1224             # The '%m.%d.%y' is not at its "logical" location. It is done to make sure the order in which
1225             # entries are written does not impact the result.
1226             #
1227             # It could occur because there is some code that correlates an entry containing %y with another
1228             # one that would contain %Y. The %Y will be called the master, the %y will be called the slave.
1229             # It is important to match such entries, otherwise an identified format with %y would always be
1230             # ambiguous with the same written with %Y.
1231             #
1232             # IMPORTANT
1233             # The list below is written almost as-is in the POD at the bottom of this file.
1234             #
1235             my @DATES_DEFAULT_FORMATS_TO_TRY = (
1236             '',
1237             '%Y-%m-%d',
1238             '%Y.%m.%d',
1239             '%Y/%m/%d',
1240              
1241             '%m.%d.%y',
1242              
1243             '%m-%d-%Y',
1244             '%m.%d.%Y',
1245             '%m/%d/%Y',
1246             '%d-%m-%Y',
1247             '%d.%m.%Y',
1248             '%d/%m/%Y',
1249              
1250             '%m-%d-%y',
1251             '%m/%d/%y',
1252             '%d-%m-%y',
1253             '%d.%m.%y',
1254             '%d/%m/%y',
1255              
1256             '%Y%m%d%H%M%S',
1257              
1258             # Localizaed formats
1259              
1260             '%b %d, %Y',
1261             '%b %d %Y',
1262             '%b %d %T %Z %Y',
1263             '%d %b %Y',
1264             '%d %b, %Y'
1265             );
1266              
1267             #
1268             # IMPORTANT
1269             # Under Linux, $START is useless. Strptime will match a format exactly as it is, and a tring
1270             # like "01/01/16 13:00:00" won't match with "%T". Under Windows, Strptime is capable of doing
1271             # a match by ignoring characters at the beginning, thus "01/01/2016 13:00:00" for example will
1272             # return success when matched against "%T".
1273             # Possibly it has to do with versionning of Strptime, not Linux versus Windows as such. Any
1274             # way, this difference had to be dealt with.
1275             #
1276             # The flexibility under Windows would screw the code logic so I had to add the prefix string
1277             # below, to avoid unexpected success on match.
1278             #
1279             my $START = '<';
1280              
1281             struct RecordCounter => {
1282             count_ok => '$',
1283             count_ko => '$',
1284             has_searched_time => '$',
1285              
1286             format => '$',
1287             locale => '$',
1288              
1289             has_found_time => '$',
1290             format_with_addition_of_time => '$',
1291             locale_with_addition_of_time => '$',
1292             parser_with_addition_of_time => '$'
1293             };
1294              
1295             struct Format => {
1296             id => '$',
1297             format => '$',
1298             locale => '$',
1299             parser => '$',
1300             index_slave => '$',
1301             index_master => '$'
1302             };
1303              
1304             sub _col_dispname {
1305 530     530   1035 my ($self, $n) = @_;
1306              
1307 530         778 my $col;
1308              
1309             #
1310             # IMPORTANT
1311             #
1312             # We cannot execute here a command like
1313             # $self->_status_forward('S3');
1314             # (to ensure _columns is well defined) because _col_dispname is called by
1315             # _detect_dates_formats that is in turn called by _S3_init_fields_extra. A call to
1316             # _status_forward would trigger a never-ending call loop.
1317             #
1318 530         1478 my $cols = _get_def($self->{'_columns'}, $self->{'_S2_columns'});
1319              
1320 530 50       1272 if ($self->{has_headers}) {
1321 530         1033 $col = $cols->[$n];
1322 530 50       1225 $col = "<UNDEF>" unless defined($col);
1323             } else {
1324 0         0 $col = "[$n]";
1325             }
1326 530         1243 return $col;
1327             }
1328              
1329             # Used by test plan only...
1330             sub _dds {
1331 42     42   1909 my $self = shift;
1332              
1333 42 50       115 return undef unless $self->_status_forward('S3');
1334 40 50       145 return undef unless defined($self->{_dates_detailed_status});
1335 40         341 return $self->{_dates_detailed_status};
1336             }
1337              
1338             sub _detect_dates_formats {
1339 348     348   698 my $self = shift;
1340              
1341 348 100       1150 return if $self->{_detect_dates_formats_has_run};
1342 281         785 $self->{_detect_dates_formats_has_run} = 1;
1343 281 100       894 my @fields_dates = @{$self->{fields_dates}} if defined($self->{fields_dates});
  11         75  
1344 281 100 100     1755 return unless @fields_dates or $self->{fields_dates_auto};
1345              
1346 51 50       214 if ($self->{_int_one_pass}) {
1347 0         0 $self->_print_error("date format detection disallowed when one_pass is set");
1348 0         0 return;
1349             }
1350              
1351 51         113 my $_debug = $self->{_debug};
1352 51         105 my $_debugh = $self->{_debugh};
1353 51   33     153 my $debug_fmt = ($_debug and $DEBUG_DATETIME_FORMATS);
1354              
1355 51         186 $self->_register_pass("detect date format");
1356              
1357             #
1358             # Why re-opening the input?
1359             # I tried two other ways that never worked on some OSes (like freebsd) and/or with older perl
1360             # versions.
1361             #
1362             # 1) The "tell" tactic
1363             # Recording at the beginning of the function the file position with
1364             # my $pos = tell($self->{inh});
1365             # ... and then recalling with a seek instruction is the most logical.
1366             # But it didn't work = sometimes, reading would go back to first row (the headers) instead
1367             # of the second row, could not figure out why (it would work on my Ubuntu 16.04 / perl 5.22, but
1368             # would fail with other OSes and/or perl versions).
1369             #
1370             # 2) The "complete rewind" tactic
1371             # I then undertook to do (at the end of detection function):
1372             # seek $inh, 0, SEEK_SET;
1373             # $incsv->getline($inh) if $self->{has_headers};
1374             # based on the assumption that a seek to zero would behave differently from a seek to an
1375             # arbitrary position.
1376             # But still, it would sometimes fail....
1377             #
1378              
1379 51         135 my $inh = $self->_reopen_input();
1380 51         112 my $incsv = $self->{_in_csvobj};
1381 51 50       249 _mygetline($incsv, $inh) if $self->{has_headers};
1382              
1383 51         2412 my $formats_to_try = $self->{dates_formats_to_try};
1384 51         112 my $ignore_trailing_chars = $self->{dates_ignore_trailing_chars};
1385 51         115 my $search_time = $self->{dates_search_time};
1386 51         111 my $localizations = $self->{dates_locales};
1387              
1388 51         103 my %regular_named_fields = %{$self->{_regular_named_fields}};
  51         438  
1389              
1390 51         203 my $refsub_is_datetime_empty = $self->{_refsub_is_datetime_empty};
1391              
1392 51         88 my @fields_to_detect_format;
1393 51 100       211 if (defined($self->{fields_dates})) {
    50          
1394 11         23 my $count_field_not_found = 0;
1395 11         25 my %column_seen;
1396 11         25 for my $f (@{$self->{fields_dates}}) {
  11         43  
1397 23 100       74 if (!exists $regular_named_fields{$f}) {
1398 1         17 $self->_print_error("fields_dates: unknown field: '$f'",
1399             1, ERR_UNKNOWN_FIELD, { %regular_named_fields } );
1400 1         64 $count_field_not_found++;
1401 1         5 next;
1402             }
1403 22         42 my $n = $regular_named_fields{$f};
1404 22 50       67 if (exists $column_seen{$n}) {
1405 0         0 $self->_print_warning("field '$f' already seen");
1406 0         0 next;
1407             }
1408 22         49 $column_seen{$n} = 1;
1409 22         52 push @fields_to_detect_format, $n;
1410             }
1411 11 100       50 $self->_print_error("non existent field(s) encountered, aborted") if $count_field_not_found;
1412             } elsif ($self->{fields_dates_auto}) {
1413 40         281 my @k = keys %regular_named_fields;
1414 40         228 @fields_to_detect_format = (0..$#k);
1415             } else {
1416 0         0 confess "Hey! check this code, man";
1417             }
1418              
1419             #
1420             # FIXME?
1421             # Sort by column number of not?
1422             #
1423             # At this moment in time, the author inclines to answer "yes".
1424             # But I must admit it is rather arbitrary decision for now.
1425             #
1426 50         342 @fields_to_detect_format = sort { $a <=> $b } @fields_to_detect_format;
  744         1054  
1427              
1428 2         8 my @dates_formats_supp = @{$self->{dates_formats_to_try_supp}}
1429 50 100       172 if defined($self->{dates_formats_to_try_supp});
1430              
1431 50 100       344 $formats_to_try = [ @DATES_DEFAULT_FORMATS_TO_TRY ] unless defined($formats_to_try);
1432 50         100 $formats_to_try = [ @{$formats_to_try}, @dates_formats_supp ];
  50         212  
1433 50         138 my %seen;
1434 50         102 my $f2 = [ ];
1435 50         139 for (@${formats_to_try}) {
1436 912 50       1854 push @{$f2}, $_ unless exists($seen{$_});
  912         1468  
1437 912         1843 $seen{$_} = undef;
1438             }
1439 50         103 $formats_to_try = $f2;
1440              
1441 50 100       190 $ignore_trailing_chars = 1 unless defined($ignore_trailing_chars);
1442 50 100       145 $search_time = 1 unless defined($search_time);
1443              
1444 50 100       146 my $stop = ($ignore_trailing_chars ? '' : '>');
1445              
1446             #
1447             # The code below (from # AMB to # AMB-END) aims to remove ambiguity that comes from %Y versus %y.
1448             # That is: provided you have (among others) the formats to try
1449             # '%d-%m-%Y'
1450             # and
1451             # '%d-%m-%y'
1452             # then if parsing 4-digit-year dates (like '31-12-2016'), the two formats will work and you'll end
1453             # up with an ambiguity. To be precise, there'll be no ambiguity if the date is followed by a time,
1454             # but if the date is alone, both formats will work.
1455             #
1456             # Thanks to the below code, the member 'index_slave' (and its counterpart index_master) is populated
1457             # and later, if such an ambiguity is detected, the upper case version (the one containing upper case
1458             # '%Y') will be kept and the other one will be discarded.
1459             #
1460             # NOTE
1461             # Such an ambiguity can exist only when ignore_trailing_chars is set. Otherwise, the remaining two
1462             # digits make the date parsing fail in the '%y' case.
1463             #
1464             # The other members of the 'Format' object are used to work "normally", independently from this
1465             # ambiguity removal feature.
1466             #
1467              
1468             # WIP = Work In Progress...
1469 50         88 my @formats_wip;
1470 50 100       438 my @locales = split(/,\s*/, $localizations) if defined($localizations);
1471 50         101 for my $f (@{$formats_to_try}) {
  50         149  
1472 912 100       2392 my $has_localized_item = ($f =~ m/%a|%A|%b|%B|%c|%\+/ ? 1 : 0);
1473 912 100 100     2189 unless (@locales and $has_localized_item) {
1474 902         1822 push @formats_wip, [$f, ''];
1475 902         1484 next;
1476             }
1477 10         36 push @formats_wip, [$f, $_] foreach @locales;
1478             }
1479              
1480             # AMB
1481 50         108 my @formats;
1482             my %mates;
1483 50         177 for my $i (0..$#formats_wip) {
1484 922         2353 my $fstr = $formats_wip[$i]->[0];
1485 922         1580 my $floc = $formats_wip[$i]->[1];
1486              
1487             # FIXME
1488             # Will not manage correctly a string like
1489             # '%%Y'
1490             # that means (when used with Strptime), the litteral string '%Y' with no substitution.
1491             # Such cases will be complicated to fix, as it'll require to do a kind-of
1492             # Strptime-equivalent parsing of the string, and I find it a bit overkill.
1493             #
1494             # I prefer to push back in caller world saying
1495             # "Hey, if using constructs like '%%Y', you'll be in trouble."
1496 922         1581 my $m = $fstr;
1497 922         3903 $m =~ s/%y//ig;
1498 922         1887 $m .= $floc;
1499              
1500 922         1478 my $index_slave = -1;
1501 922         1428 my $index_master = -1;
1502 922 100       2317 if (exists $mates{$m}) {
1503 244         640 my $alt_fstr = $formats_wip[$mates{$m}]->[0];
1504 244 100       859 my $m_lower = ($fstr =~ m/%y/ ? 1 : 0);
1505 244 100       677 my $m_upper = ($fstr =~ m/%Y/ ? 1 : 0);
1506 244 100       639 my $a_lower = ($alt_fstr =~ m/%y/ ? 1 : 0);
1507 244 100       621 my $a_upper = ($alt_fstr =~ m/%Y/ ? 1 : 0);
1508              
1509             # We ignore the weird cases where we'd have both %y and %Y in a format string.
1510              
1511 244 100 66     2819 if (!$m_lower and $m_upper and $a_lower and !$a_upper) {
    50 66        
      33        
      33        
      33        
      33        
1512 42         104 $index_slave = $mates{$m};
1513 42         720 $formats[$mates{$m}]->index_master($i);
1514             } elsif ($m_lower and !$m_upper and !$a_lower and $a_upper) {
1515 202         422 $index_master = $mates{$m};
1516 202         3565 $formats[$mates{$m}]->index_slave($i);
1517             }
1518              
1519             } else {
1520 678         1660 $mates{$m} = $i;
1521             }
1522              
1523 922         4366 my %strptime_opts = (pattern => $START . $fstr . $stop);
1524 922 100       2748 $strptime_opts{locale} = $floc if $floc ne '';
1525 922 100       6003 my $format = Format->new(
1526             id => "$i",
1527             format => $fstr,
1528             locale => $floc,
1529             parser => ($fstr ne '' ?
1530             DateTime::Format::Strptime->new(%strptime_opts) :
1531             undef),
1532             index_slave => $index_slave,
1533             index_master => $index_master
1534             );
1535 922         1130237 push @formats, $format;
1536             }
1537 50         242 for my $i (0..$#formats) {
1538 922         11421 my $format = $formats[$i];
1539              
1540             # If a master could be itself the slave of another entry, that'd make it a hierarchical
1541             # relation tree with multiple levels. It is not possible, only a direct, unique
1542             # master-slave relation can be managed here.
1543 922 50 66     11660 confess "Inonsistent data, check this module's code urgently!"
1544             if $format->index_slave >= 0 and $format->index_master >= 0;
1545              
1546 922 100       21329 if ($format->index_slave >= 0) {
1547 244         4429 my $mate = $formats[$format->index_slave];
1548 244 50 33     4186 if ($mate->index_master != $i or $mate->index_slave != -1) {
1549 0         0 confess "Inonsistent data (2), check this module's code urgently!"
1550             }
1551             }
1552              
1553 922 100       21239 if ($format->index_master >= 0) {
1554 244         4486 my $mate = $formats[$format->index_master];
1555 244 50 33     4217 if ($mate->index_slave != $i or $mate->index_master != -1) {
1556 0         0 confess "Inonsistent data (3), check this module's code urgently!"
1557             }
1558             }
1559              
1560             }
1561 50 50       574 if ($debug_fmt) {
1562 0         0 for (@formats) {
1563 0         0 my ($idx, $rel) = (-1, "");
1564 0 0       0 $idx = $_->index_slave, $rel = "S: " if $_->index_slave >= 0;
1565 0 0       0 $idx = $_->index_master, $rel = "M: " if $_->index_master >= 0;
1566 0         0 printf($_debugh "%-18s %s %2d", "'" . $_->format . "'", $rel, $idx);
1567 0 0       0 print($_debugh ": '" . $formats[$idx]->format . "'") if $idx >= 0;
1568 0         0 print($_debugh "\n");
1569             }
1570             }
1571             # AMB-END
1572              
1573 50         151 my %records;
1574             my $record_number;
1575 50         104 my $count_gotit = 0;
1576 50         107 my $count_ambiguous = 0;
1577 50         105 my $count_nodate = 0;
1578 50         98 my $count_empty = 0;
1579 50         99 my $has_signaled_can_start_recording_data = 0;
1580 50         154 $self->{_line_after_which_recording_can_start} = 0;
1581              
1582             #
1583             # Seems a weird optimization here, but it is very important.
1584             # In some cases, divides execution time (to detect date format on big files
1585             # containing numerous fields) by 10.
1586             #
1587             # When evaluates to true, it means the input column has no identified date format, meaning,
1588             # no further check to do.
1589             #
1590 50         122 my @cache_nodate;
1591              
1592 50         208 while (my $f = _mygetline($incsv, $inh)) {
1593 4645         147657 $record_number++;
1594              
1595 4645 50       10989 if ($debug_fmt) {
1596 0         0 print($_debugh "RECORD $record_number:\n");
1597 0         0 for (0 .. @$f - 1) { printf($_debugh " %02d: '%s'\n", $_, $f->[$_]); }
  0         0  
1598             }
1599              
1600 4645         9460 for my $n (@fields_to_detect_format) {
1601 31430 100       71512 next if $cache_nodate[$n];
1602              
1603 16897         30214 my $v = $f->[$n];
1604 16897 100       35622 $v = '' unless defined($v);
1605 16897 100       39587 next if $v eq '';
1606 9395 100 100     34623 next if defined($refsub_is_datetime_empty) and $refsub_is_datetime_empty->($v);
1607              
1608 9392 50       22468 if ($debug_fmt) {
1609 0         0 my $col = $self->_col_dispname($n);
1610 0         0 print($_debugh "Line $record_number, column '$col':\n");
1611             }
1612              
1613 9392         18512 for my $fmt (@formats) {
1614 158222         2011791 my $fid = $fmt->id;
1615 158222         2724809 my $fstr = $fmt->format;
1616              
1617 158222 50       1152556 $self->_debug_output_fmt('** pre ', $fmt, $records{$n}->{$fid}) if $debug_fmt;
1618              
1619             $records{$n}->{$fid} = RecordCounter->new(
1620             count_ok => 0,
1621             count_ko => 0,
1622             has_searched_time => 0,
1623              
1624             format => undef,
1625             locale => undef,
1626              
1627             has_found_time => 0,
1628             format_with_addition_of_time => undef,
1629             locale_with_addition_of_time => undef,
1630             parser_with_addition_of_time => undef
1631 158222 100       501871 ) unless defined($records{$n}->{$fid});
1632              
1633 158222 100       2537923 unless ($records{$n}->{$fid}->count_ko) {
1634 19361         181657 my $is_ok = &_try_parser($fmt, $records{$n}->{$fid}, $START . $v . $stop);
1635              
1636 19361 100       51956 if (!$is_ok) {
1637 7604         12924 my $give_up_time = 0;
1638 7604 100 66     98468 if ($records{$n}->{$fid}->count_ko == 0 and
      100        
1639             $records{$n}->{$fid}->has_searched_time and
1640             $records{$n}->{$fid}->has_found_time) {
1641 77   100     4598 $give_up_time = (defined($fmt->parser) and
1642             defined($fmt->parser->parse_datetime($START . $v . $stop))
1643             ?
1644             1 : 0);
1645 77 100       47156 if ($give_up_time) {
1646 4         92 $records{$n}->{$fid}->has_found_time(0);
1647 4         39 $is_ok = 1;
1648             }
1649             }
1650             }
1651              
1652 19361 100 100     252839 if ($is_ok or !$ignore_trailing_chars) {
1653 14144   100     188133 my $incr = (defined($fmt->parser) and $is_ok ? 1: 0);
1654              
1655 14144 100       306619 unless ($records{$n}->{$fid}->has_searched_time) {
1656 3141         58701 $records{$n}->{$fid}->has_searched_time(1);
1657              
1658             croak "Inconsistent status! Issue in module code not in caller's!"
1659 3141 50       53907 if $records{$n}->{$fid}->count_ok != 0;
1660              
1661 3141 100       27660 if ($search_time) {
    100          
1662              
1663 1965 50       5577 print($_debugh " Search time in '$v', format '$fstr'\n")
1664             if $debug_fmt;
1665              
1666 1965         25839 my $t = $self->_guess_time_format($fstr, $fmt->locale, $v, $stop);
1667 1965 100       48455 $records{$n}->{$fid}->has_found_time((defined($t) ? 1 : 0));
1668 1965 100       22083 if (defined($t)) {
    100          
1669 254         4087 $records{$n}->{$fid}->format_with_addition_of_time($t->[0]);
1670 254         5234 $records{$n}->{$fid}->locale_with_addition_of_time($t->[1]);
1671 254         5218 $records{$n}->{$fid}->parser_with_addition_of_time($t->[2]);
1672 254         2308 $incr = 1;
1673             } elsif ($fstr eq '') {
1674 191         3066 $records{$n}->{$fid}->count_ko(1);
1675             }
1676             } elsif ($fstr eq '') {
1677 78         1003 $records{$n}->{$fid}->count_ko(1);
1678             }
1679              
1680             }
1681              
1682 14144         258579 $records{$n}->{$fid}->count_ok($records{$n}->{$fid}->count_ok + $incr);
1683              
1684 14144 100 100     212481 $records{$n}->{$fid}->count_ko($records{$n}->{$fid}->count_ko + 1)
1685             if !$incr and !$is_ok;
1686              
1687 14144 100       57896 if ($incr) {
1688             # We remove the slave if master is fine.
1689             # Depending on the order in which parsing got done, the master could
1690             # pop up first, or the slave, that is why we need manage both cases.
1691 9155 100 100     120488 if ($fmt->index_slave >= 0 or $fmt->index_master >= 0) {
1692 6837 100       218610 my $has_slave = ($fmt->index_slave >= 0 ? 1 : 0);
1693 6837 100       129995 my $idx = ($has_slave ? $fmt->index_slave : $fmt->index_master);
1694 6837         121500 my $mate = $formats[$idx]->id;
1695 6837 100       60566 if (exists $records{$n}->{$mate}) {
1696 6686 100       16791 if ($has_slave) {
1697 2239 100       30513 if ($records{$n}->{$mate}->count_ko == 0) {
1698             # I am the master: I remove the slave
1699 2         50 $records{$n}->{$mate}->count_ko(1);
1700             }
1701             } else {
1702 4447 50 66     60285 if ($records{$n}->{$mate}->count_ko == 0 and
      66        
1703             $records{$n}->{$mate}->count_ok >= 1 and
1704             $records{$n}->{$fid}->count_ko == 0) {
1705 114         6722 $records{$n}->{$fid}->count_ko(1);
1706             }
1707             }
1708             }
1709             }
1710             }
1711              
1712             } else {
1713 5217         70104 $records{$n}->{$fid}->count_ko($records{$n}->{$fid}->count_ko + 1);
1714             }
1715             }
1716              
1717 158222 50       1339039 $self->_debug_output_fmt(' post', $fmt, $records{$n}->{$fid}) if $debug_fmt;
1718              
1719             }
1720             }
1721              
1722 4645         8710 $count_gotit = 0;
1723 4645         7516 $count_ambiguous = 0;
1724 4645         7526 $count_empty = 0;
1725 4645         10816 for my $n (@fields_to_detect_format) {
1726 31430 100       71592 next if $cache_nodate[$n];
1727              
1728 16897         25711 my $candidate = 0;
1729 16897         24086 my $tt = 0;
1730 16897         28237 for my $fmt (@formats) {
1731 258870         3239194 my $fid = $fmt->id;
1732 258870         1821426 my $rec = $records{$n}->{$fid};
1733 258870 100       574678 next unless defined($rec);
1734              
1735 200048         2451503 my $ok = $rec->count_ok;
1736 200048         3426576 my $ko = $rec->count_ko;
1737              
1738 200048 50 66     1752815 confess "Oups. Check this module code urgently!" if $ok == 0 and $ko == 0;
1739 200048         301631 $tt += $ok + $ko;
1740              
1741 200048 100 100     549775 $candidate++ if $ok >= 1 and $ko == 0;
1742             }
1743 16897 100       42993 if ($candidate == 1) {
    100          
    100          
1744 9599         18745 $count_gotit++;
1745             } elsif ($candidate >= 2) {
1746 2269         5566 $count_ambiguous++;
1747             } elsif ($tt != 0) {
1748 222         417 $count_nodate++;
1749 222         572 $cache_nodate[$n] = 1;
1750             } else {
1751 4807         8012 $count_empty++;
1752             }
1753             }
1754              
1755 4645 50       11977 if ($debug_fmt) {
1756 0         0 print($_debugh "\$count_gotit = $count_gotit\n");
1757 0         0 print($_debugh "\$count_ambiguous = $count_ambiguous\n");
1758 0         0 print($_debugh "\$count_nodate = $count_nodate\n");
1759 0         0 print($_debugh "\$count_empty = $count_empty\n");
1760             }
1761              
1762 4645         7395 my $can_start_recording_data = 0;
1763 4645 100 100     27636 $can_start_recording_data = 1
      100        
1764             if $count_gotit + $count_ambiguous + $count_nodate >= 1 and
1765             !$count_ambiguous and !$count_empty;
1766              
1767 4645 100 100     32531 if ($can_start_recording_data and !$has_signaled_can_start_recording_data) {
1768 28         64 $has_signaled_can_start_recording_data = 1;
1769              
1770 28 50       97 print($_debugh "Can start recording (all dates formats detection closed) " .
1771             "after record #$record_number\n") if $_debug;
1772              
1773 28         98 $self->{_line_after_which_recording_can_start} = $record_number;
1774 28 100       559 last unless $self->{fields_dates_auto};
1775             }
1776             }
1777              
1778 50         5167 close $inh;
1779              
1780 50         190 my %dates_detailed_status;
1781             my @dates_formats;
1782 50         141 my $check_empty = 0;
1783 50         101 my $check_nodate = 0;
1784 50         104 my $check_ambiguous = 0;
1785 50         114 my $check_gotit = 0;
1786 50         145 for my $n (@fields_to_detect_format) {
1787 523         833 my @formats_ok;
1788 523         798 my $tt = 0;
1789 523         798 for my $fid (sort keys %{$records{$n}}) {
  523         4886  
1790 8267         84350 my $rec = $records{$n}->{$fid};
1791 8267 100 100     99253 if ($rec->count_ok >= 1 and $rec->count_ko == 0) {
1792              
1793 297         10743 my ($fstr, $floc) = ($rec->format, $rec->locale);
1794 297 100       6871 ($fstr, $floc) = (
1795             $rec->format_with_addition_of_time,
1796             $rec->locale_with_addition_of_time
1797             ) if $rec->has_found_time;
1798              
1799 297         6572 push @formats_ok, [$fstr, $floc];
1800             }
1801 8267         147918 $tt += $rec->count_ok + $rec->count_ko;
1802             }
1803 523         5830 my $is_ok = 0;
1804 523         803 my $format;
1805 523         834 my $locale = '';
1806 523 100 100     2432 if ($#formats_ok < 0 and $tt == 0) {
    100          
    100          
1807 16         32 $format = "Z";
1808 16         30 $check_empty++;
1809             } elsif ($#formats_ok < 0) {
1810 222         378 $format = "N";
1811 222         318 $check_nodate++;
1812             } elsif ($#formats_ok > 0) {
1813 12         23 $format = "A";
1814 12         24 $check_ambiguous++;
1815             } else {
1816 273         457 $is_ok = 1;
1817 273         509 $format = $formats_ok[0]->[0];
1818 273         529 $locale = $formats_ok[0]->[1];
1819 273         580 $check_gotit++;
1820             }
1821 523         1318 my $col = $self->_col_dispname($n);
1822              
1823 523 50       1922 $dates_detailed_status{$col} = $format unless exists $dates_detailed_status{$col};
1824 523 100 66     2593 $dates_formats[$n] = [ $format, $locale ] if $is_ok and !defined($dates_formats[$n]);
1825             }
1826 50         213 $dates_detailed_status{'.'} = $self->{_line_after_which_recording_can_start};
1827              
1828 50 50 66     591 if ($check_empty != $count_empty or $check_nodate != $count_nodate or
      66        
      33        
1829             $check_ambiguous != $count_ambiguous or $check_gotit != $count_gotit) {
1830             # The below condition can happen with an empty CSV (empty file (no header) or
1831             # only a header line).
1832 1 50 33     21 unless (!$count_empty and !$check_nodate and !$count_nodate and
      33        
      33        
      33        
      33        
      33        
1833             !$check_ambiguous and !$count_ambiguous and !$check_gotit and !$count_gotit) {
1834 0         0 print(STDERR "\$check_empty = $check_empty\n");
1835 0         0 print(STDERR "\$count_empty = $count_empty\n");
1836 0         0 print(STDERR "\$check_nodate = $check_nodate\n");
1837 0         0 print(STDERR "\$count_nodate = $count_nodate\n");
1838 0         0 print(STDERR "\$check_ambiguous = $check_ambiguous\n");
1839 0         0 print(STDERR "\$count_ambiguous = $count_ambiguous\n");
1840 0         0 print(STDERR "\$check_gotit = $check_gotit\n");
1841 0         0 print(STDERR "\$count_gotit = $count_gotit\n");
1842 0         0 confess "Oups! Check immediately this module code, man!";
1843             }
1844             }
1845              
1846 50 50       148 if ($debug_fmt) {
1847             # A very detailed debug output
1848 0         0 for my $n (@fields_to_detect_format) {
1849 0         0 my $col = $self->_col_dispname($n);
1850 0         0 print($_debugh "$col\n");
1851 0         0 printf($_debugh " %-25s %3s %3s\n", "format", "OK", "KO");
1852 0         0 for my $fid (sort keys %{$records{$n}}) {
  0         0  
1853 0         0 my $rec = $records{$n}->{$fid};
1854 0         0 my $cc = '';
1855 0 0 0     0 $cc = "(" . $rec->locale . ")" if defined($rec->locale) and $rec->locale ne '';
1856 0         0 printf($_debugh " %-25s %3d %3d\n",
1857             $rec->format . $cc, $rec->count_ok, $rec->count_ko);
1858             }
1859             }
1860             }
1861             # Not a typo - displaying it IN ADDITION to debug output above is done on purpose...
1862 50 50       202 if ($_debug) {
1863             # A shorter (as compared to above) output of outcome of DateTime detection
1864 0         0 print($_debugh "Result of DateTime detection:\n");
1865 0         0 printf($_debugh "%-3s %-25s %-30s %s\n", '###', 'FIELD', 'DATETIME FORMAT',
1866             'DATETIME LOCALE');
1867 0         0 for my $n (@fields_to_detect_format) {
1868 0         0 my ($fmt, $loc) = ('<undef>', '<undef>');
1869 0 0       0 if (defined($dates_formats[$n])) {
1870 0         0 ($fmt, $loc) = @{$dates_formats[$n]}[0, 1];
  0         0  
1871             }
1872 0         0 printf($_debugh "%03d %-25s %-30s %s\n", $n, $self->_col_dispname($n), $fmt, $loc);
1873             }
1874             }
1875              
1876 50 100       178 if (!$self->{fields_dates_auto}) {
1877 10         28 my $e = 0;
1878 10         32 for my $n (@fields_to_detect_format) {
1879 20 100       69 next if defined($dates_formats[$n]);
1880 7         27 $self->_print_error("unable to detect DateTime format of field '" .
1881             $self->_col_dispname($n) . "'", 1);
1882 7         281 $e++;
1883             }
1884 10 100       57 $self->_print_error("$e field(s) encountered with unknown DateTime format") if $e;
1885             }
1886              
1887 48         774 $self->{_dates_detailed_status} = { %dates_detailed_status };
1888 48         79659 $self->{_dates_formats} = [ @dates_formats ];
1889             }
1890              
1891             sub _debug_output_fmt {
1892 0     0   0 my ($self, $prefix, $fmt, $rec) = @_;
1893              
1894 0         0 my $_debugh = $self->{_debugh};
1895              
1896 0         0 my ($fstr, $floc) = ($fmt->format, $fmt->locale);
1897 0 0 0     0 ($fstr, $floc) = (
1898             '<+T>' . $rec->format_with_addition_of_time,
1899             $rec->locale_with_addition_of_time
1900             ) if defined($rec) and $rec->has_found_time;
1901              
1902 0         0 my $locstr = '';
1903 0 0 0     0 $locstr = "(" . $floc . ")" if defined($floc) and $floc ne '';
1904              
1905 0 0       0 my $tmpok = $rec->count_ok if defined($rec);
1906 0 0       0 $tmpok = '<undef>' unless defined($tmpok);
1907 0 0       0 my $tmpko = $rec->count_ko if defined($rec);
1908 0 0       0 $tmpko = '<undef>' unless defined($tmpko);
1909              
1910 0         0 print($_debugh "$prefix (format '$fstr$locstr': OK = $tmpok, KO = $tmpko)\n");
1911             }
1912              
1913             # When no parse can be done (parser to test is undef), return 1
1914             sub _try_parser {
1915 19361     19361   41937 my ($fmt, $rec, $value_to_parse) = @_;
1916              
1917 19361         248204 my $parser = $fmt->parser;
1918 19361 100       360345 $parser = $rec->parser_with_addition_of_time if $rec->has_found_time;
1919              
1920 19361         219429 my $is_ok = 1;
1921 19361 100       81838 $is_ok = (defined($parser->parse_datetime($value_to_parse)) ? 1 : 0) if $parser;
    100          
1922              
1923 19361 100       7599175 unless (defined($rec->format)) {
1924 8267         153643 $rec->format($fmt->format);
1925 8267         265471 $rec->locale($fmt->locale);
1926             }
1927              
1928 19361         296678 return $is_ok;
1929             }
1930              
1931             sub _guess_time_format {
1932              
1933             # IMPORTANT
1934             # Formats are tested in the order of the list below, and the first one that succeeds stops the
1935             # tests.
1936             # That makes the order of the elements important: %R would match any value that'd also match
1937             # %T, that'd cause to return %R whereas %T would be possible. Same with AM/PM formats. Thus
1938             # the longest patterns appear first.
1939 1965     1965   18953 my @T = (
1940             '%I:%M:%S %p',
1941             '%I:%M %p',
1942             '%I:%M:%S%p',
1943             '%I:%M%p',
1944             '%T',
1945             '%R'
1946             );
1947              
1948 1965         5684 my ($self, $format, $locale, $v, $stop) = @_;
1949              
1950 1965         4895 my $_debugh = $self->{_debugh};
1951 1965   33     6387 my $debug_fmt = ($self->{_debug} and $DEBUG_DATETIME_FORMATS);
1952              
1953 1965 100       6396 return undef if $format =~ /:/;
1954              
1955 1961         3296 my $sep;
1956 1961 100       6636 if ($format eq '') {
1957 316         663 $sep = '';
1958             } else {
1959 1645 100       12926 unless ((undef, $sep) = $v =~ /(^|\d([^0-9:]+))(\d{1,2}):(\d{1,2})(\D|$)/) {
1960 387 100       1046 if ($v =~ /\d{4}:\d{2}(\D|$)/) {
1961 29         73 $sep = '';
1962             } else {
1963              
1964 358 50       862 print($_debugh "_guess_time_format(): separator candidate not found in '$v'\n")
1965             if $debug_fmt;
1966              
1967 358         945 return undef;
1968             }
1969             }
1970             }
1971 1603 100       5468 $sep = '' unless defined($sep);
1972              
1973             #
1974             # IMPORTANT
1975             #
1976             # The code below allows to successfully detect DateTime format when
1977             # the first lines contain things like:
1978             # Jan 20 2017 2:00AM
1979             # that could lead to a separator set to ' ' while actually it should be ' '. In this case
1980             # if the double-space is kept, then a later value of
1981             # Jan 20 2017 10:00AM
1982             # won't be parsed.
1983             #
1984             # See t/11-bugfix.t, BUG 5, for an explanation of why the line below.
1985             #
1986              
1987             # More generic code, but will also break some separators like ' ' (4 spaces)
1988             # $sep = substr($sep, 0, length($sep) - 1) if length($sep) >= 2 and substr($sep, -2) eq ' ';
1989 1603 100       4677 $sep = ' ' if $sep eq ' ';
1990              
1991 1603 50       4046 if ($debug_fmt) {
1992 0         0 print($_debugh " _guess_time_format(): Searching time in '$v'\n");
1993             }
1994              
1995 1603         3624 for my $t (@T) {
1996 9139         599709 my $increased_format = "$format$sep$t";
1997              
1998 9139 50       22295 print($_debugh " _guess_time_format(): Trying format '$increased_format'\n") if $debug_fmt;
1999              
2000 9139         31763 my %opts = (pattern => $START . $increased_format . $stop);
2001 9139 100 66     46360 $opts{locale} = $locale if defined($locale) and $locale ne '';
2002 9139         40199 my $parser_of_increased_format = DateTime::Format::Strptime->new(%opts);
2003 9139 100       11636796 next unless defined($parser_of_increased_format->parse_datetime($START . $v . $stop));
2004              
2005 254 50       179457 if ($debug_fmt) {
2006 0         0 print($_debugh " _guess_time_format(): found time in '$v'\n");
2007 0         0 print($_debugh " Initial format: '$format'\n");
2008 0         0 print($_debugh " Increased format: '$increased_format'\n");
2009             }
2010              
2011 254         1623 return [$increased_format, $locale, $parser_of_increased_format];
2012             }
2013 1349         113176 return undef;
2014             }
2015              
2016              
2017             # * ********************************* *
2018             # * END OF DATE FORMAT DETECTION CODE *
2019             # * ********************************* *
2020              
2021              
2022             # Take the string of a header in $_ and replace it with the corresponding field name
2023             sub _header_to_field_name {
2024 1246     1246   2749 $_ = remove_accents($_);
2025 1246         2756 s/[^[:alnum:]_]//gi;
2026 1246         7630 s/^.*$/\U$&/;
2027             }
2028              
2029             # Return 0 if error, 1 if all good
2030             sub _S2_init_fields_from_header {
2031 349     349   737 my $self = shift;
2032              
2033 349         804 my $has_headers = $self->{has_headers};
2034 349         734 my $_debug = $self->{_debug};
2035 349         756 my $_debugh = $self->{_debugh};
2036              
2037 349         1108 my $in_file_disp = $self->get_in_file_disp();
2038              
2039 349         739 my $inh = $self->{_inh};
2040 349         758 my $incsv = $self->{_in_csvobj};
2041              
2042 349         990 $self->{_row_read} = 0;
2043              
2044 349         754 my @columns;
2045             my @headers;
2046 349 100       1042 if ($has_headers) {
2047              
2048             print($_debugh "$PKG: '$in_file_disp': will parse header line to get column names\n")
2049 338 50       1055 if $self->{_debug_read};
2050              
2051 338         704 $self->{_row_read}++;
2052              
2053             print($_debugh "$PKG: '$in_file_disp': will read line #" . $self->{_row_read} . "\n")
2054 338 50       1003 if $self->{_debug_read};
2055              
2056 338 50       1067 if (defined($self->{_inh_header})) {
2057 0         0 my $l = $self->{_inh_header};
2058 0         0 my $inmemh;
2059 0 0       0 if (!open ($inmemh, '<', \$l)) {
2060 0         0 $self->_print_error("can't open header line in-memory. CSV read aborted.");
2061 0         0 return 0;
2062             }
2063 0         0 @headers = @{_mygetline($incsv, $inmemh)};
  0         0  
2064             } else {
2065 338         893 my $r = _mygetline($incsv, $inh);
2066 336 50       17668 @headers = @{$r} if defined($r);
  336         1414  
2067             }
2068             print($_debugh "Line " . $self->{_row_read} . ":\n--\n" . join('::', @headers) . "\n--\n")
2069 336 50       1322 if $self->{_debug_read};
2070             }
2071              
2072 347 100 100     2083 if ($has_headers and !defined($self->{fields_column_names})) {
2073 330         677 my %indexes;
2074 330 100       978 if (defined($self->{fields_hr})) {
2075 12 50       65 if (!$self->_process_header(\@headers, $self->{fields_hr}, \%indexes)) {
2076 0         0 $self->_print_error("missing headers. CSV read aborted.");
2077 0         0 return 0;
2078             }
2079 12 50       40 if ($_debug) {
2080 0         0 print($_debugh " \%indexes:\n");
2081 0         0 for my $k (sort keys %indexes) {
2082 0         0 print($_debugh " \t$k => $indexes{$k}\n");
2083             }
2084             }
2085 12         75 for (sort keys %indexes) {
2086 48 50       122 next if $_ eq '';
2087 48         113 $columns[$indexes{$_}] = $_;
2088             }
2089             } else {
2090 318         1075 @columns = @headers;
2091 318         778 map { _header_to_field_name } @columns;
  1246         9060  
2092             }
2093             }
2094              
2095 347 100       1266 @columns = @{$self->{fields_column_names}} if defined($self->{fields_column_names});
  14         40  
2096              
2097             # Avoid undef in column names... I prefer empty strings
2098 347 100       1255 @columns = map { defined($_) ? $_ : '' } @columns;
  1370         3967  
2099              
2100 347 50       1185 if ($_debug) {
2101 0         0 print($_debugh "-- CSV headers management\n");
2102 0 0       0 if (@columns) {
2103 0         0 printf($_debugh " %-3s %-40s %-40s\n", 'COL', 'CSV Header', 'Hash Key');
2104 0         0 for my $i (0..$#columns) {
2105 0         0 my $h = '';
2106 0 0       0 $h = $headers[$i] if defined($headers[$i]);
2107 0         0 printf($_debugh " %03d %-40s %-40s\n", $i, "'$h'", "'$columns[$i]'");
2108             }
2109             } else {
2110 0         0 print($_debugh " No headers\n");
2111             }
2112             }
2113              
2114 347         695 my %regular_named_fields;
2115 347         1170 for my $i (0..$#columns) {
2116 1370 100 66     6865 $regular_named_fields{$columns[$i]} = $i if defined($columns[$i]) and $columns[$i] ne '';
2117             }
2118 347         2023 $self->{_regular_named_fields} = { %regular_named_fields };
2119 347         1446 $self->{_S2_columns} = [ @columns ];
2120 347 100       1635 $self->{_S2_headers} = [ @headers ] if $has_headers;
2121              
2122 347         1730 return 1;
2123             }
2124              
2125             sub out_header {
2126 8     8 1 15 my $self = shift;
2127 8         131 validate_pos(@_, {type => SCALAR}, {type => SCALAR});
2128              
2129 8         28 my ($field, $header) = @_;
2130 8 100       28 $self->{_out_headers} = { } unless exists $self->{_out_headers};
2131              
2132             $self->_print_warning("out_header: field $field already set")
2133 8 50       19 if exists $self->{_out_headers}->{$field};
2134              
2135 8         17 $self->{_out_headers}->{$field} = $header;
2136              
2137 8         27 return $self;
2138             }
2139              
2140             # Return 0 if error, 1 if all good
2141             sub _S3_init_fields_extra {
2142 349     349   738 my $self = shift;
2143              
2144 349         783 my $_debug = $self->{_debug};
2145 349         721 my $_debugh = $self->{_debugh};
2146              
2147 349         760 my $verbose = $self->{verbose};
2148              
2149 349         676 my $has_headers = $self->{has_headers};
2150              
2151 349         610 my %named_fields = %{$self->{_regular_named_fields}};
  349         1857  
2152 349         821 my @columns = @{$self->{_S2_columns}};
  349         1176  
2153 349 100       1019 my @headers = @{$self->{_S2_headers}} if $has_headers;
  338         1066  
2154              
2155 349         664 my @extra_fields_indexes;
2156 349 100       1173 my @extra_fields_definitions_list = @{$self->{_extra_fields}} if exists $self->{_extra_fields};
  36         135  
2157 349         754 my %extra_fields_definitions;
2158              
2159             my @coldata;
2160 349         1096 for my $i (0..$#columns) {
2161 1376         47229 my $col = $columns[$i];
2162 1376 100       3304 my $h = $headers[$i] if $has_headers;
2163 1376         24745 push @coldata, ColData->new(
2164             field_name => $col,
2165             header_text => $h,
2166             description => ''
2167             );
2168             }
2169              
2170 349         16487 for my $edef (@extra_fields_definitions_list) {
2171 92         6463 my $c = $edef->check_field_existence;
2172 92 100       1234 if (defined($c)) {
2173 80 100       276 unless (exists $named_fields{$c}) {
2174 9         173 $self->_print_error("unknown field '" . $edef->check_field_existence . "'",
2175             0, ERR_UNKNOWN_FIELD, { %named_fields } );
2176 8         37 next;
2177             }
2178             }
2179              
2180 83         247 my @e_eclated = $edef;
2181              
2182 83 100 100     1621 if ($edef->ef_type == $EF_LINK and $edef->link_remote_read eq '*') {
2183 2         102 my @cols = $edef->link_remote_obj->get_fields_names();
2184              
2185 2         7 @e_eclated = ();
2186 2         10 my %nf = %named_fields;
2187              
2188 2         7 for my $c (@cols) {
2189              
2190 4         84 my $ex_base = $edef->self_name . $c;
2191 4         40 my $ex_target = $ex_base;
2192 4         9 my $i = 1;
2193 4         19 while (exists $nf{$ex_target}) {
2194 1         3 $i++;
2195 1         61 $ex_target = $ex_base . '_' . $i;
2196             }
2197              
2198 4         77 my $e = ExtraField->new(
2199             ef_type => $EF_LINK,
2200             self_name => $ex_target,
2201             description => $edef->description . " ($c)",
2202              
2203             link_self_search => $edef->link_self_search,
2204             link_remote_obj => $edef->link_remote_obj,
2205             link_remote_search => $edef->link_remote_search,
2206             link_remote_read => $c,
2207              
2208             link_vlookup_opts => $edef->link_vlookup_opts
2209             );
2210 4         522 push @e_eclated, $e;
2211 4         19 $nf{$ex_target} = undef;
2212             }
2213             }
2214              
2215 83         2010 for my $e1 (@e_eclated) {
2216 85 100       1756 if (exists $named_fields{$e1->self_name}) {
2217 6         184 $self->_print_error("extra field: duplicate field name: '" . $e1->self_name . "'");
2218 6         36 next;
2219             }
2220              
2221 79         905 my $index_of_new_element = $#columns + 1;
2222 79         193 push @extra_fields_indexes, $index_of_new_element;
2223 79         1463 $columns[$index_of_new_element] = $e1->self_name;
2224 79         2011 $named_fields{$e1->self_name} = $index_of_new_element;
2225 79         1987 $extra_fields_definitions{$e1->self_name} = $e1;
2226              
2227 79 50       2104 push @headers, $e1->self_name if $has_headers;
2228 79         2057 push @coldata, ColData->new(
2229             field_name => $e1->self_name,
2230             header_text => $e1->self_name,
2231             description => $e1->description
2232             );
2233             }
2234              
2235             }
2236 348 100       4319 $self->{_headers} = [ @headers ] if $has_headers;
2237 348         1295 $self->{_extra_fields_indexes} = [ @extra_fields_indexes ];
2238 348         1375 $self->{_columns} = [ @columns ];
2239 348         1229 $self->{_extra_fields_definitions} = { %extra_fields_definitions };
2240              
2241 348         2011 $self->{_named_fields} = { %named_fields };
2242              
2243 348         1694 $self->_detect_dates_formats();
2244              
2245 345         1046 $self->{_read_update_after_ar} = [ ];
2246 345         3959 $self->{_write_update_before_ar} = [ ];
2247 345 100       3326 my @dates_formats = @{$self->{_dates_formats}} if defined($self->{_dates_formats});
  57         286  
2248 345         1224 for my $i (0..$#columns) {
2249 1436         2591 my $dt_format;
2250             my $dt_locale;
2251 1436 100       3506 if (defined($dates_formats[$i])) {
2252 329         674 $dt_format = $dates_formats[$i]->[0];
2253 329         607 $dt_locale = $dates_formats[$i]->[1];
2254             }
2255 1436         23769 $coldata[$i]->dt_format($dt_format);
2256 1436         27651 $coldata[$i]->dt_locale($dt_locale);
2257              
2258 1436 100       11583 next unless defined($dt_format);
2259              
2260 329         540 my %opts_in;
2261 329 100 66     1663 $opts_in{locale} = $dt_locale if defined($dt_locale) and $dt_locale ne '';
2262              
2263 329         1443 my $obj_strptime_in = DateTime::Format::Strptime->new(pattern => $dt_format, %opts_in);
2264              
2265 329         379557 my %opts_out;
2266 329 50       1038 my $loc_out = (exists $self->{out_dates_locale} ? $self->{out_dates_locale} : $dt_locale);
2267 329 100 66     1922 $opts_out{locale} = $loc_out if defined($loc_out) and $loc_out ne '';
2268             my $obj_strptime_out = DateTime::Format::Strptime->new(
2269 329 50       1534 pattern => (exists $self->{out_dates_format} ? $self->{out_dates_format} :$dt_format),
2270             %opts_out
2271             );
2272              
2273 329         356151 my $refsub_is_datetime_empty = $self->{_refsub_is_datetime_empty};
2274 329         1123 my $in_file_disp = $self->get_in_file_disp();
2275              
2276             $self->{_read_update_after_ar}->[$i] = sub {
2277 352 100 66 352   2215 return undef if !defined($_) or $_ eq '' or
      33        
      66        
2278             (defined($refsub_is_datetime_empty) and $refsub_is_datetime_empty->($_));
2279              
2280 315         585 my $s = $_[0];
2281 315         741 my $field = _get_def($_[1], '<?>');
2282              
2283 315         1028 my $dt = $obj_strptime_in->parse_datetime($_);
2284              
2285 315 0 33     175356 if ($_debug and $DEBUG_DATETIME_FORMATS and $DEBUG_DATETIME_FORMATS_EVEN_MORE) {
      33        
2286 0 0       0 print($_debugh "-- Record " . $s->get_recnum() .
2287             ", field '$field':\n String parsed: '$_'\n Parse format: '$dt_format'\n" .
2288             " DateTime obj: <" . (defined($dt) ? $dt . '' : 'undef') . ">\n");
2289             }
2290              
2291 315 100       753 if (!defined($dt)) {
2292 2         11 my $recnum = $s->get_recnum();
2293 2 50       8 if ($verbose) {
2294 0         0 $s->_print("$PKG: " .
2295             "$in_file_disp: record $recnum: field $field: unable to parse DateTime\n");
2296 0         0 $s->_print("$PKG: field: '$_'\n");
2297 0         0 $s->_print("$PKG: format: '$dt_format'\n");
2298 0 0       0 $s->_print("$PKG: " .
2299             "locale: '" . ($dt_locale eq '' ? '<none>' : $dt_locale) . "'\n");
2300 0         0 $s->_print("$PKG: " .
2301             "Probable cause: when detecting DateTime format, $PKG will stop reading\n");
2302 0         0 $s->_print("$PKG: " .
2303             "input as soon as the format is worked out. If a value found later\n");
2304 0         0 $s->_print("$PKG: " .
2305             "turns out to use another DateTime format, it'll generate a DateTime\n");
2306 0         0 $s->_print("$PKG: parse error, as is the case now.\n");
2307 0         0 $s->_print_error("unable to parse DateTime");
2308             } else {
2309 2         18 $s->_print_error("$in_file_disp: record $recnum: field $field: " .
2310             "unable to parse DateTime '$_'");
2311             }
2312             }
2313              
2314 314         651 return $dt;
2315 329         2387 };
2316             $self->{_write_update_before_ar}->[$i] = sub {
2317 96 100   96   240 return '' unless defined($_);
2318 87 100       211 return $_ if !ref($_);
2319 81 50       297 return $_ unless $_->isa('DateTime');
2320              
2321 81         268 my $str = $obj_strptime_out->format_datetime($_);
2322              
2323 81 50       16151 if (!defined($str)) {
2324 0         0 my $s = $_[0];
2325 0         0 my $recnum = $s->get_recnum();
2326 0         0 my $field = _get_def($_[1], '<?>');
2327 0         0 $s->_print_error("$in_file_disp: record $recnum: field $field: " .
2328             "unable to print DateTime '$_'")
2329             }
2330              
2331 81         188 return $str;
2332 329         1696 };
2333             }
2334              
2335 345         1234 $self->{_coldata} = [ @coldata ];
2336              
2337 345         2110 my @loop = (
2338             ['_read_update_after_hr', '_read_update_after_ar', 'read post'],
2339             ['_write_update_before_hr', '_write_update_before_ar', 'write pre']
2340             );
2341 345         1143 for my $ii (0..$#loop) {
2342 690         1360 my $l = $loop[$ii];
2343              
2344 690         1583 my $ht = $self->{$l->[0]};
2345 690         1112 my @subrefs = @{$self->{$l->[1]}};
  690         1821  
2346 690         1171 for my $field (keys %{$ht}) {
  690         1776  
2347 70 50       192 unless (exists $named_fields{$field}) {
2348 0         0 $self->_print_error($l->[2] . ": unknown field '$field'",
2349             0, ERR_UNKNOWN_FIELD, { %named_fields } );
2350 0         0 next;
2351             }
2352              
2353 70         120 my $i = $named_fields{$field};
2354              
2355 70         96 my @allsubs;
2356 70         108 push @allsubs, @{$ht->{$field}};
  70         135  
2357 70 100       174 if (defined($subrefs[$i])) {
2358 2 50       8 unshift @allsubs, $subrefs[$i] if $ii == 0;
2359 2 50       6 push @allsubs, $subrefs[$i] if $ii == 1;
2360             }
2361              
2362             my $finalsub = sub {
2363 196     196   349 for my $s (@allsubs) {
2364 262         979 $_ = $s->(@_);
2365             }
2366 189         6359 return $_;
2367 70         248 };
2368 70         157 $subrefs[$i] = $finalsub;
2369              
2370             }
2371 690         2132 $self->{$l->[1]} = [ @subrefs ];
2372             }
2373              
2374 345         1684 my $tmp = _get_def($self->{out_fields}, $self->{write_fields});
2375 345 100       1326 my @wf = @{$tmp} if defined($tmp);
  6         14  
2376 345         748 my $count_field_not_found = 0;
2377 345         850 for (@wf) {
2378 16 100 66     106 next if !defined($_) or $_ eq '' or exists $named_fields{$_};
      100        
2379 3         6 $count_field_not_found++;
2380 3         18 $self->_print_error("out_fields: unknown field '$_'",
2381             1, ERR_UNKNOWN_FIELD, { %named_fields } );
2382             }
2383 345 100       1022 if ($count_field_not_found) {
2384 2         5 $self->_print_error("non existent field(s) encountered");
2385 1         59 delete $self->{out_fields};
2386 1         4 delete $self->{write_fields};
2387             }
2388              
2389 344 100       1068 my %sh = %{$self->{_out_headers}} if defined($self->{_out_headers});
  4         16  
2390 344         682 $count_field_not_found = 0;
2391 344         988 for (keys %sh) {
2392 8 100 33     50 next if !defined($_) or $_ eq '' or exists $named_fields{$_};
      66        
2393 2         4 $count_field_not_found++;
2394 2         12 $self->_print_error("out_header: unknown field '$_'",
2395             1, ERR_UNKNOWN_FIELD, { %named_fields } );
2396             }
2397 344 100       937 $self->_print_error("non existent field(s) encountered") if $count_field_not_found;
2398              
2399 343         2544 return 1;
2400             }
2401              
2402             #
2403             # Return 0 if there's no more records (error or eof reached), 1 if a record got read
2404             # successfully.
2405             #
2406             # If return value is 1:
2407             # $$ref_ar and $$ref_hr are set to array ref and hash ref of the record, respectively
2408             #
2409             # If return value is 0:
2410             # $$ref_ar and $$ref_hr are set to undef if an error occured
2411             # $$ref_ar and $$ref_hr are set to a scalar if eof reached
2412             #
2413             sub _read_one_record_from_input {
2414 2329     2329   4935 my ($self, $ref_ar, $ref_row_hr) = @_;
2415              
2416 2329         4355 my $_debug = $self->{_debug};
2417 2329         3688 my $_debug_extra_fields = $self->{_debug_extra_fields};
2418 2329         3707 my $_debugh = $self->{_debugh};
2419              
2420 2329         5212 my $in_file_disp = $self->get_in_file_disp();
2421              
2422 2329         3981 my $incsv = $self->{_in_csvobj};
2423 2329         3539 my $ar;
2424              
2425             print($_debugh "$PKG: '$in_file_disp': will read line #" . ($self->{_row_read} + 1) . "\n")
2426 2329 50       5342 if $self->{_debug_read};
2427              
2428 2329 100       5116 unless ($ar = _mygetline($incsv, $self->{_inh})) {
2429 284 50       11520 if (!$incsv->eof()) {
2430 0         0 my ($code, $str, $pos) = $incsv->error_diag();
2431 0         0 $self->_print_error("$code: $str, record " . $incsv->record_number . ", position $pos");
2432 0         0 $$ref_ar = undef;
2433 0         0 $$ref_row_hr = undef;
2434             } else {
2435 284         2104 $$ref_ar = 1;
2436 284         677 $$ref_row_hr = 1;
2437             }
2438              
2439 284         1443 $self->_close_inh();
2440              
2441 284         1141 return 0;
2442             }
2443              
2444 2045         63747 $self->{_row_read}++;
2445              
2446 2045         3385 my %named_fields = %{$self->{_named_fields}};
  2045         10002  
2447              
2448 2045 50       5756 if ($self->{_debug_read}) {
2449 0         0 print($_debugh "Line " . $self->{_row_read} . ":\n--\n");
2450 0         0 for (sort keys %named_fields) {
2451 0         0 my $c = _get_def($ar->[$named_fields{$_}], '<undef>');
2452 0         0 print($_debugh " $_ => '" . $c . "'\n");
2453             }
2454             }
2455              
2456 2045         3630 my $columns_ar = $self->{_columns};
2457              
2458 2045         3399 my $no_undef = $self->{no_undef};
2459 2045 100       4499 if ($no_undef) {
2460 30         63 for (0..$#{$columns_ar}) {
  30         103  
2461 324 100       1043 $ar->[$_] = '' unless defined($ar->[$_]);
2462             }
2463             }
2464              
2465 2045         3621 my $row_hr = { };
2466             $row_hr->{$_} = $ar->[$self->{_regular_named_fields}->{$_}]
2467 2045         3224 foreach keys %{$self->{_regular_named_fields}};
  2045         14072  
2468              
2469 2045         4523 my $rpost = $self->{_read_update_after_ar};
2470 2045         3118 for my $i (0..$#{$columns_ar}) {
  2045         4791  
2471 10556         15404 my $subref = $rpost->[$i];
2472 10556 100       23234 next unless defined($subref);
2473              
2474 479         750 do {
2475 479         817 my $field = $columns_ar->[$i];
2476 479         845 local $_ = $ar->[$i];
2477 479         1080 my $new_val = $subref->($self, $field);
2478 474         940 $ar->[$i] = $new_val;
2479 474 50       1726 $row_hr->{$field} = $new_val if defined($field);
2480             }
2481              
2482             }
2483              
2484 2040         3337 for my $i (@{$self->{_extra_fields_indexes}}) {
  2040         4099  
2485 442         1039 my $name = $columns_ar->[$i];
2486 442         1023 my $e = $self->{_extra_fields_definitions}->{$name};
2487              
2488 442 50       1186 print($_debugh "Extra field: #$i: $name\n") if $_debug_extra_fields;
2489              
2490 442         852 my $value;
2491 442 100       9646 if ($e->ef_type == $EF_LINK) {
    100          
    50          
2492              
2493 238 50       2639 print($_debugh " linked field\n") if $_debug_extra_fields;
2494              
2495 238         4405 my $remobj = $e->link_remote_obj;
2496             $value = $remobj->vlookup(
2497             $e->link_remote_search,
2498 238         5846 $ar->[$named_fields{$e->link_self_search}],
2499             $e->link_remote_read,
2500             $e->link_vlookup_opts
2501             );
2502              
2503             } elsif ($e->ef_type == $EF_FUNC) {
2504              
2505 51 50       1616 print($_debugh " computed field\n") if $_debug_extra_fields;
2506              
2507 51         889 $value = $e->func_sub->($name, $row_hr, $self->{_stats});
2508              
2509             } elsif ($e->ef_type == $EF_COPY) {
2510              
2511 153 50       8580 print($_debugh " copy field\n") if $_debug_extra_fields;
2512              
2513 153         2850 my $input = $row_hr->{$e->copy_source};
2514 153 50 33     1664 $input = '' if !defined($input) and $no_undef;
2515 153 100       3878 if (defined($e->copy_sub)) {
2516 57         595 local $_ = $input;
2517 57         995 $value = $e->copy_sub->();
2518             } else {
2519 96         971 $value = $input;
2520             }
2521              
2522 153 50       1448 print($_debugh " in: '$input', out: '$value'\n") if $_debug_extra_fields;
2523              
2524             } else {
2525 0         0 confess "Unknown ef_type '" . $e->ef_type . "', check this module' code urgently!";
2526             }
2527              
2528 440 100 100     2431 $value = '' if !defined($value) and $no_undef;
2529 440         1151 $ar->[$i] = $value;
2530 440         1217 $row_hr->{$name} = $value;
2531              
2532 440 50       1369 print($_debugh " $name => '$value'\n") if $_debug_extra_fields;
2533              
2534             }
2535              
2536 2038 100       5070 if (defined($self->{read_post_update_hr})) {
2537 33         100 $self->{read_post_update_hr}->($row_hr, $self->{_stats}, $self->get_recnum());
2538 33         318 $ar->[$named_fields{$_}] = $row_hr->{$_} foreach keys %named_fields;
2539             }
2540              
2541 2038 100       7468 lock_keys(%$row_hr) if $self->{croak_if_error};
2542              
2543             $self->{walker_ar}->($ar, $self->{_stats}, $self->get_recnum())
2544 2038 100       15211 if defined($self->{walker_ar});
2545             $self->{walker_hr}->($row_hr, $self->{_stats}, $self->get_recnum())
2546 2038 100       5428 if defined($self->{walker_hr});
2547              
2548 2037         3705 $$ref_ar = $ar;
2549 2037         3314 $$ref_row_hr = $row_hr;
2550              
2551 2037         8424 return 1;
2552             }
2553              
2554             sub _open_read {
2555 295     295   626 my $self = shift;
2556              
2557 295         619 my $verbose = $self->{verbose};
2558 295         1195 my $in_file_disp = $self->get_in_file_disp();
2559              
2560 295         849 $self->{_stats} = { };
2561 295         677 $self->{_read_in_progress} = 1;
2562              
2563 295 50       895 $self->_print("-- $in_file_disp reading start\n") if $verbose;
2564             }
2565              
2566             sub _close_read {
2567 314     314   712 my $self = shift;
2568 314         600 my $keep_quiet = shift;
2569              
2570 314         657 my $verbose = $self->{verbose};
2571 314         849 my $in_file_disp = $self->get_in_file_disp();
2572              
2573 314         772 $self->{_read_in_progress} = 0;
2574              
2575 314 50 33     1196 if ($verbose and !$keep_quiet) {
2576 0         0 $self->_print("-- $in_file_disp reading end: " . $self->{_row_read} . " row(s) read\n");
2577 0         0 for my $k (sort keys %{$self->{_stats}}) {
  0         0  
2578 0         0 $self->_printf(" %7d %s\n", $self->{_stats}->{$k}, $k);
2579             }
2580             }
2581             }
2582              
2583             # Return 0 if error, 1 if all good
2584             sub _S4_read_all_in_mem {
2585 165     165   377 my $self = shift;
2586              
2587 165         588 $self->_register_pass("_S4_read_all_in_mem()");
2588              
2589 165         633 $self->_open_read();
2590              
2591 165         374 my $ar;
2592             my $row_hr;
2593 165         697 while ($self->_read_one_record_from_input(\$ar, \$row_hr)) {
2594              
2595 1543         2566 push @{$self->{_flat}}, $ar;
  1543         4903  
2596              
2597             }
2598              
2599 163 50       549 my $retcode = (defined($ar) ? 1 : 0);
2600 163         645 $self->_update_in_mem_record_count();
2601              
2602 163         616 $self->_close_read();
2603              
2604 163         513 return $retcode;
2605             }
2606              
2607             sub _chain_array {
2608 24     24   194 return split(/\s*->\s*/, $_[0]);
2609             }
2610              
2611             sub _chain_str {
2612 2     2   11 return join('->', @_);
2613             }
2614              
2615             sub field_add_link {
2616 25     25 1 5483 my $self = shift;
2617              
2618 25         821 validate_pos(@_, {type => UNDEF | SCALAR}, {type => SCALAR}, {type => SCALAR | OBJECT},
2619             {type => HASHREF, optional => 1});
2620              
2621 22         109 my ($new_field, $chain, $obj, $param_opts) = @_;
2622              
2623 22         51 my $croak_if_error = $self->{croak_if_error};
2624 22         52 my $_debug = $self->{_debug};
2625 22         44 my $_debugh = $self->{_debugh};
2626              
2627 22         75 my @c = _chain_array($chain);
2628 22 100       85 $new_field = $c[2] unless defined($new_field);
2629              
2630 22 50       65 print($_debugh "Registering new linked field, new_field = '$new_field', chain = '$chain'\n")
2631             if $_debug;
2632              
2633 22 100 66     164 unless (@c == 3 and $c[2] ne '') {
2634 1         5 $self->_print_error("wrong links chain parameter: '$chain', " .
2635             "look for CHAIN in Text::AutoCSV manual for help");
2636 1         8 return undef;
2637             }
2638              
2639 21 50       68 return undef unless $self->_status_forward('S2');
2640 21 50       76 return undef unless $self->_status_backward('S2');
2641              
2642 21 100       73 my @tmp = %{$param_opts} if $param_opts;
  13         52  
2643 21         585 my %opts = validate(@tmp, $SEARCH_VALIDATE_OPTIONS);
2644              
2645 20         92 my $target_name = '';
2646 20 100       69 if (ref $obj eq '') {
2647 19         38 my $in_file = $obj;
2648 19         39 $target_name = $in_file;
2649              
2650             #
2651             # TODO (?)
2652             #
2653             # Take into account the fact that the OS' file system is case insensitive. At the
2654             # moment, two different strings (even if identical in a case insensitive comparison)
2655             # will be managed as being distinct.
2656             # I put a question mark in this TO DO - after all, the user of this module had better
2657             # use same case when dealing with multiple links of the same file.
2658             #
2659             # Also, tuning this module' behavior depending on the OS' characteristics would be not
2660             # ideal, it'd add a level of complexity to understand how it works and what to expect.
2661             #
2662 19 100 100     95 if (exists $self->{_obj} and exists $self->{_obj}->{$in_file}) {
2663              
2664 4 50       11 print(
2665             $_debugh
2666             "field_add_link: file '$in_file': re-using existing Text::AutoCSV object\n"
2667             ) if $_debug;
2668              
2669 4         8 $obj = $self->{_obj}->{$in_file};
2670             } else {
2671              
2672 15 50       49 print($_debugh "field_add_link: file '$in_file': creating new Text::AutoCSV object\n")
2673             if $_debug;
2674              
2675 15 100       64 $self->{_obj} = { } unless exists $self->{_obj};
2676              
2677             #
2678             # The created Text::AutoCSV must be created with the same search options as what is
2679             # currently found in $self.
2680             #
2681             # Why?
2682             # Because the link is populated doing a vlookup on the remote object ($obj below),
2683             # not on $self. Therefore, if we don't "propagate" search options from $self to
2684             # $obj, search tunnings won't work as user would expect.
2685             #
2686 15         36 my %search_opts;
2687 15         41 for (qw(search_case search_trim search_ignore_empty search_ignore_accents
2688             search_value_if_not_found search_value_if_ambiguous search_ignore_ambiguous)) {
2689             # We assign depending on whether or not the attribute EXISTS - the definedness
2690             # is not appropriate, in case an attribute would have been assigned to undef.
2691 105 100       267 $search_opts{$_} = $self->{$_} if exists $self->{$_};
2692             }
2693              
2694             $obj = Text::AutoCSV->new(
2695             in_file => $in_file,
2696             verbose => $self->{verbose},
2697             infoh => $self->{infoh},
2698             _debug => $self->{debug},
2699             _debugh => $self->{debugh},
2700 15         130 %search_opts
2701             );
2702 15         83 $self->{_obj}->{$in_file} = $obj;
2703             }
2704             } else {
2705 1         3 $target_name = '(object)';
2706 1 50       6 print($_debugh "field_add_link: Text::AutoCSV object provided\n") if $_debug;
2707             }
2708              
2709 20 100       90 $self->{_extra_fields} = [ ] unless exists $self->{_extra_fields};
2710              
2711 20         45 push @{$self->{_extra_fields}}, ExtraField->new(
  20         596  
2712             ef_type => $EF_LINK,
2713             self_name => $new_field,
2714             description => "link: $target_name, chain: $chain",
2715             check_field_existence => $c[0],
2716              
2717             link_self_search => $c[0],
2718             link_remote_obj => $obj,
2719             link_remote_search => $c[1],
2720             link_remote_read => $c[2],
2721              
2722             link_vlookup_opts => \%opts
2723             );
2724              
2725 20         2778 return $self;
2726             }
2727              
2728             sub links {
2729 2     2 1 6 my $self = shift;
2730              
2731 2         44 validate_pos(@_, {type => UNDEF | SCALAR}, {type => SCALAR}, {type => SCALAR | OBJECT},
2732             {type => HASHREF, optional => 1});
2733              
2734 2         10 my $prefix_field = shift;
2735 2         6 my $chain = shift;
2736 2         5 my ($obj, $param_opts) = @_;
2737              
2738 2         9 my @c = _chain_array($chain);
2739              
2740 2 50 33     25 if (@c != 2 or $c[0] eq '' or $c[1] eq '') {
      33        
2741 0         0 $self->_print_error("wrong links chain parameter: '$chain', " .
2742             "look for JOINCHAIN in Text::AutoCSV manual for help");
2743 0         0 return undef;
2744             }
2745              
2746 2 100       10 $prefix_field = '' unless defined($prefix_field);
2747 2         7 my $chain2 = _chain_str(@c, '*');
2748              
2749 2         8 return $self->field_add_link($prefix_field, $chain2, @_);
2750             }
2751              
2752             sub field_add_computed {
2753 7     7 1 2517 my $self = shift;
2754              
2755 7         209 validate_pos(@_, {type => SCALAR}, {type => CODEREF});
2756 6         23 my ($new_field, $func) = @_;
2757              
2758 6         17 my $croak_if_error = $self->{croak_if_error};
2759              
2760 6         14 my $_debug = $self->{_debug};
2761 6         13 my $_debugh = $self->{_debugh};
2762              
2763 6 50       20 print($_debugh "Registering new computed field, new_field = '$new_field'\n") if $_debug;
2764              
2765 6 100       17 return undef unless $self->_status_forward('S2');
2766 5 50       20 return undef unless $self->_status_backward('S2');
2767              
2768 5         14 push @{$self->{_extra_fields}}, ExtraField->new(
  5         133  
2769             ef_type => $EF_FUNC,
2770             self_name => $new_field,
2771             description => "computed",
2772              
2773             func_sub => $func
2774             );
2775              
2776 5         644 return $self;
2777             }
2778              
2779             sub field_add_copy {
2780 14     14 1 13785 my $self = shift;
2781              
2782 14         244 validate_pos(@_, {type => SCALAR}, {type => SCALAR}, {type => CODEREF, optional => 1});
2783 14         72 my ($new_field, $copy_source, $func) = @_;
2784              
2785 14         42 my $croak_if_error = $self->{croak_if_error};
2786              
2787 14         33 my $_debug = $self->{_debug};
2788 14         33 my $_debugh = $self->{_debugh};
2789              
2790 14 50       49 print($_debugh "Registering field copy, new_field = '$new_field' copied from '$copy_source'\n")
2791             if $_debug;
2792              
2793 14 100       47 return undef unless $self->_status_forward('S2');
2794 12 50       47 return undef unless $self->_status_backward('S2');
2795              
2796 12 100       36 push @{$self->{_extra_fields}}, ExtraField->new(
  12         528  
2797             ef_type => $EF_COPY,
2798             self_name => $new_field,
2799             description => "copy of $copy_source " . (defined($func) ? '(with sub)' : '(no sub)'),
2800             check_field_existence => $copy_source,
2801              
2802             copy_source => $copy_source,
2803             copy_sub => $func
2804             );
2805              
2806 12         1680 return $self;
2807             }
2808              
2809             sub in_map {
2810 15     15 1 1349 my $self = shift;
2811              
2812 15         48 return $self->read_update_after(@_);
2813             }
2814              
2815             sub read_update_after {
2816 16     16 1 31 my $self = shift;
2817 16         194 validate_pos(@_, {type => SCALAR}, {type => CODEREF});
2818              
2819 16         60 my ($field, $subref) = @_;
2820              
2821 16         33 my $_debug = $self->{_debug};
2822 16         32 my $_debugh = $self->{_debugh};
2823              
2824 16 50       45 return undef unless $self->_status_forward('S2');
2825 16 50       45 return undef unless $self->_status_backward('S2');
2826              
2827 16 50       42 print($_debugh "Registering read_post_update subref for field '$field'\n") if $_debug;
2828              
2829             $self->{_read_update_after_hr}->{$field} = [ ]
2830 16 100       65 unless defined($self->{_read_update_after_hr}->{$field});
2831              
2832 16         26 push @{$self->{_read_update_after_hr}->{$field}}, $subref;
  16         44  
2833              
2834 16         140 return $self;
2835             }
2836              
2837             sub out_map {
2838 11     11 1 17 my $self = shift;
2839              
2840 11         32 return $self->write_update_before(@_);
2841             }
2842              
2843             sub write_update_before {
2844 12     12 1 19 my $self = shift;
2845 12         108 validate_pos(@_, {type => SCALAR}, {type => CODEREF});
2846              
2847 12         36 my ($field, $subref) = @_;
2848              
2849 12         23 my $_debug = $self->{_debug};
2850 12         20 my $_debugh = $self->{_debugh};
2851              
2852 12 50       26 return undef unless $self->_status_forward('S2');
2853 12 50       27 return undef unless $self->_status_backward('S2');
2854              
2855 12 50       32 print($_debugh "Registering write_pre_update subref for field '$field'\n") if $_debug;
2856              
2857             $self->{_write_update_before_hr}->{$field} = [ ]
2858 12 100       47 unless defined($self->{_write_update_before_hr}->{$field});
2859              
2860 12         18 push @{$self->{_write_update_before_hr}->{$field}}, $subref;
  12         29  
2861              
2862 12         70 return $self;
2863             }
2864              
2865             sub reset_next_record_hr {
2866 187     187 1 418 my $self = shift;
2867              
2868 187         995 validate_pos(@_);
2869              
2870 187         684 $self->{_current_record} = undef;
2871              
2872 187         378 return $self;
2873             }
2874              
2875             sub _create_internal_column_name_from_its_number {
2876 1120     1120   2425 return sprintf("__%04i__", $_[0]);
2877             }
2878              
2879             sub _ar_to_hr {
2880 2048     2048   3221 my $self = shift;
2881              
2882 2048         15471 validate_pos(@_, {type => ARRAYREF});
2883              
2884 2048         5758 my ($ar) = @_;
2885 2048         3191 my $last_elem_index = scalar(@{$ar}) - 1;
  2048         4209  
2886              
2887 2048         3472 my $nr = $self->{_named_fields};
2888 2048         3125 my %h;
2889             my %n_seen;
2890 2048         3017 for (keys %{$nr}) {
  2048         6236  
2891 9841         21307 $h{$_} = $ar->[$nr->{$_}];
2892 9841         19241 undef $n_seen{$nr->{$_}};
2893             }
2894 2048         4913 for my $i (0..$last_elem_index) {
2895 10908 100       24216 if (!exists($n_seen{$i})) {
2896 1120         2010 my $k = _create_internal_column_name_from_its_number($i);
2897 1120 50       3403 $h{$k} = $ar->[$i] if !exists $h{$k};
2898             }
2899             }
2900              
2901 2048 100       7180 lock_keys(%h) if $self->{croak_if_error};
2902              
2903 2048         18621 return \%h;
2904             }
2905              
2906             sub get_next_record_hr {
2907 1989     1989 1 3527 my $self = shift;
2908              
2909 1989         13767 validate_pos(@_, {type => SCALARREF, optional => 1});
2910              
2911 1989         4830 my $refkey = $_[0];
2912              
2913 1989 50       4396 return undef unless $self->_status_forward('S4');
2914              
2915 1986 100       4581 if (!defined($self->{_current_record})) {
2916 184         425 $self->{_current_record} = 0;
2917             } else {
2918 1802         2884 $self->{_current_record}++;
2919             }
2920              
2921 1986         3798 my $ar = $self->{_flat}->[$self->{_current_record}];
2922 1986 100       4196 if (!defined($ar)) {
2923 184         354 $self->{_current_record} = undef;
2924 184         359 $$refkey = undef;
2925 184         606 return undef;
2926             }
2927              
2928 1802         2987 $$refkey = $self->{_current_record};
2929              
2930 1802         3971 return $self->_ar_to_hr($ar);
2931             }
2932              
2933             sub read {
2934 60     60 1 24762 my $self = shift;
2935              
2936 60         459 validate_pos(@_);
2937              
2938 60 50       225 return undef unless $self->_status_backward('S3');
2939 60 100       187 return undef unless $self->_status_forward('S3');
2940              
2941 51         209 $self->_register_pass("read()");
2942              
2943 51         192 $self->_open_read();
2944              
2945 51         113 my $ar;
2946             my $row_hr;
2947 51         230 while ($self->_read_one_record_from_input(\$ar, \$row_hr)) {
2948             # Ben oui quoi... qu'est-ce que l'on peut bien faire d'autre ?
2949             }
2950              
2951 49         202 $self->_close_read();
2952 49 50       147 return undef unless defined($ar);
2953              
2954 49 50       158 return undef unless $self->_status_reset();
2955              
2956 49         228 return $self;
2957             }
2958              
2959             #
2960             # Initially, _read_all_in_mem was intended for the test plan.
2961             #
2962             # Turned out to be sometimes useful for user, thus, is no longer private since 1.1.5.
2963             # Private version below is kept for compatibility.
2964             #
2965             sub read_all_in_mem {
2966 1     1 1 3 my $self = shift;
2967              
2968 1         5 return $self->_read_all_in_mem();
2969             }
2970              
2971             sub _read_all_in_mem {
2972 9     9   1281 my $self = shift;
2973              
2974 9 50       31 return 0 unless $self->_status_backward('S3');
2975 9 50       32 return 0 unless $self->_status_forward('S4');
2976              
2977 9         49 return $self;
2978             }
2979              
2980             sub print_id {
2981 0     0 1 0 my $self = shift;
2982              
2983 0         0 $self->_printf("-- " . $self->get_in_file_disp() . ":\n");
2984 0         0 $self->_printf("sep_char: " . $self->get_sep_char() . "\n");
2985 0         0 $self->_printf("escape_char: " . $self->get_escape_char() . "\n");
2986 0         0 $self->_printf("in_encoding: " . $self->get_in_encoding() . "\n");
2987 0 0       0 $self->_printf("is_always_quoted: " . ($self->get_is_always_quoted() ? 'yes' : 'no') . "\n");
2988              
2989 0         0 my @coldata = $self->get_coldata();
2990 0         0 my @disp;
2991 0         0 push @disp, [ '#', 'FIELD', 'HEADER', 'EXT DATA', 'DATETIME FORMAT', 'DATETIME LOCALE' ];
2992 0         0 push @disp, [ map { my $s = $_; $s =~ s/./-/g; $s } @{$disp[0]} ];
  0         0  
  0         0  
  0         0  
  0         0  
2993 0         0 for my $i (0..$#coldata) {
2994 0         0 my $col = $coldata[$i];
2995              
2996 0         0 my @row;
2997 0         0 push @row, "$i";
2998 0 0       0 push @row, (defined($col->[$_]) ? ($col->[$_] . '') : '') for (0..4);
2999 0         0 map { s/\n/\\n/g; s/\r/\\r/g; s/\t/\\t/g } @row;
  0         0  
  0         0  
  0         0  
3000 0         0 push @disp, [ @row ];
3001             }
3002 0         0 my $n = @{$disp[-1]};
  0         0  
3003 0         0 my @max = (-1) x $n;
3004 0         0 for my $l (@disp) {
3005 0 0       0 do { $max[$_] = length($l->[$_]) if $max[$_] < length($l->[$_]) } for (0 .. $n - 1);
  0         0  
3006             }
3007 0         0 my $s = join(' ', map { "%-${_}s" } @max);
  0         0  
3008 0         0 $self->_print("\n");
3009 0         0 $self->_printf("$s\n", @{$_}) for (@disp);
  0         0  
3010             }
3011              
3012             sub set_out_file {
3013 2     2 1 8983 my $self = shift;
3014 2         50 validate_pos(@_, {type => SCALAR});
3015              
3016 2         11 my ($out_file) = @_;
3017 2         152 $self->{out_file} = $out_file;
3018              
3019 2         18 return $self;
3020             }
3021              
3022             # Subrefs set with out_map
3023             sub _execute_write_update_before {
3024 362     362   763 my ($self, $ar) = @_;
3025              
3026 362         705 my $columns_ar = $self->{_columns};
3027              
3028 362         666 my $wpre = $self->{_write_update_before_ar};
3029 362         641 for my $i (0..$#{$columns_ar}) {
  362         949  
3030 1162         1984 my $subref = $wpre->[$i];
3031 1162 100       3022 next unless defined($subref);
3032              
3033 159         245 do {
3034 159         273 local $_ = $ar->[$i];
3035 159         263 my $field = $columns_ar->[$i];
3036 159         338 my $new_val = $subref->($self, $field);
3037 156         390 $ar->[$i] = $new_val;
3038             }
3039              
3040             }
3041             }
3042              
3043             # Take into account write_fields if it got set
3044             sub _apply_write_fields {
3045 460     460   948 my ($self, $ar) = @_;
3046              
3047 460         792 my @final;
3048              
3049 460         1745 my $tmp = _get_def($self->{out_fields}, $self->{write_fields});
3050 460 100       1480 my @wf = @{$tmp} if defined($tmp);
  16         36  
3051              
3052 460 100       1388 return unless @wf;
3053              
3054 16         22 my %named_fields = %{$self->{_named_fields}};
  16         50  
3055 16         38 for my $i (0..$#wf) {
3056 40         58 my $field = $wf[$i];
3057 40 100 66     158 my $tmp = $ar->[$named_fields{$field}] if defined($field) and $field ne '';
3058              
3059             # Put here any post-processing of value
3060             # WARNING
3061             # $tmp can be undef
3062             # ...
3063              
3064 40         79 $final[$i] = $tmp;
3065             }
3066 16         51 $_[1] = [ @final ];
3067             }
3068              
3069             sub write {
3070 108     108 1 27763 my $self = shift;
3071              
3072 108         850 validate_pos(@_);
3073              
3074 108 50       416 return undef unless $self->_status_forward('S3');
3075              
3076 103         260 my $verbose = $self->{verbose};
3077 103         209 my $_debug = $self->{_debug};
3078 103         255 my $_debugh = $self->{_debugh};
3079              
3080 103         228 my $out_file = $self->{out_file};
3081              
3082 103         212 my %stats;
3083              
3084 103 50       297 $self->_print("-- $out_file writing start\n") if $verbose;
3085 103         207 my $rows_written = 0;
3086              
3087 103         210 my $outh = $self->{outh};
3088              
3089 103         241 $self->{_close_outh_when_finished} = 0;
3090 103 50       331 unless (defined($outh)) {
3091 103 50       335 if ($out_file eq '') {
3092 0         0 $outh = \*STDOUT;
3093             } else {
3094 103 50       10569 unless (open($outh, '>', $out_file)) {
3095 0         0 $self->_print_error("unable to open file '$out_file': $!");
3096 0         0 return undef;
3097             }
3098 103         457 $self->{_close_outh_when_finished} = 1;
3099             }
3100 103         280 $self->{outh} = $outh;
3101             }
3102              
3103 103 50       383 unless ($self->{_leave_encoding_alone}) {
3104             my $enc = (defined($self->{_inh_encoding}) ?
3105             $self->{_inh_encoding} :
3106 103 50       408 $DEFAULT_OUT_ENCODING);
3107              
3108             # out_encoding option takes precedence
3109 103 100       390 $enc = $self->{out_encoding} if defined($self->{out_encoding});
3110 103         350 my $m = ":encoding($enc)";
3111 103 50 66     345 if (_is_utf8($enc) and $self->{out_utf8_bom}) {
3112 0         0 $m .= ':via(File::BOM)';
3113             }
3114              
3115 103 50 33     440 if ($OS_IS_PLAIN_WINDOWS and $FIX_PERLMONKS_823214) {
3116             # Tested with UTF-16LE, NOT tested with UTF-16BE (it should be the same story)
3117 0 0       0 $m = ":raw:perlio:$m:crlf" if $enc =~ /^utf-?16/i;
3118             }
3119              
3120 103         1044 binmode $outh, $m;
3121 103 50       11702 print($_debugh "Encoding string used for output: $m\n") if $_debug;
3122             }
3123              
3124 103         266 my $escape_char = $self->{escape_char};
3125 103         231 my $quote_char = $self->{quote_char};
3126              
3127 103         205 my %opts;
3128 103         322 $opts{binary} = 1;
3129 103         541 $opts{eol} = "\n";
3130              
3131 103 50       437 $opts{sep_char} = $self->{sep_char} if defined($self->{sep_char});
3132 103 100       346 $opts{sep_char} = $self->{out_sep_char} if defined($self->{out_sep_char});
3133              
3134 103 50       389 $opts{quote_char} = $self->{quote_char} if defined($self->{quote_char});
3135 103 50       323 $opts{quote_char} = $self->{out_quote_char} if defined($self->{out_quote_char});
3136              
3137 103 50       383 $opts{escape_char} = $self->{escape_char} if defined($self->{escape_char});
3138 103 100       306 $opts{escape_char} = $self->{out_escape_char} if defined($self->{out_escape_char});
3139              
3140 103         264 $opts{always_quote} = $self->{_is_always_quoted};
3141 103 100       311 $opts{always_quote} = $self->{out_always_quote} if defined($self->{out_always_quote});
3142              
3143 103         927 my $csvout = Text::CSV->new({ %opts });
3144 103 50       19593 if (!defined($csvout)) {
3145 0         0 $self->_print_error("error creating output Text::CSV object");
3146 0         0 return undef;
3147             }
3148              
3149 103         520 my $write_filter_hr = _get_def($self->{out_filter}, $self->{write_filter_hr});
3150              
3151 103 100 66     915 if (($self->{has_headers} and
      100        
      100        
3152             !(defined($self->{out_has_headers}) and !$self->{out_has_headers}))
3153             or $self->{out_has_headers}) {
3154 101         253 my $ar = [ ];
3155 101 100       366 if ($self->{has_headers}) {
3156 97         251 $ar = $self->{_headers};
3157             } else {
3158 4         8 my $nf = $self->{_named_fields};
3159 4         7 $ar->[$nf->{$_}] = $_ for (keys %{$nf});
  4         25  
3160             }
3161              
3162 101 100       343 if (exists $self->{_out_headers}) {
3163 3         5 my $h = $self->{_out_headers};
3164 3         6 for (keys %{$self->{_named_fields}}) {
  3         11  
3165 12 100       30 if (exists $h->{$_}) {
3166 5         13 $ar->[$self->{_named_fields}->{$_}] = $h->{$_};
3167             }
3168             }
3169             }
3170              
3171 101         445 $self->_apply_write_fields($ar);
3172              
3173 101         1848 $csvout->print($outh, $ar);
3174 101         1337 $rows_written++;
3175             }
3176              
3177 103         233 my $do_status_reset = 0;
3178              
3179              
3180             #
3181             # FIXME!!!
3182             #
3183             # Instead of this duplication of code, provide AutoCSV with a "create iterator sub" feature to
3184             # iterate over all records, whatever is going on behind the scene (in-memory or read input).
3185             #
3186             # Such an iterator would also benefit to module users.
3187             #
3188              
3189              
3190 103 100       351 if ($self->{_status} == 4) {
3191              
3192             #
3193             # The content is available in-memory: we write from what we have in-memory then...
3194             #
3195              
3196 24         105 for my $k ($self->get_keys()) {
3197 87         253 my $hr = $self->get_row_hr($k);
3198 87 50       251 if (defined($write_filter_hr)) {
3199 0 0       0 next unless $write_filter_hr->($hr);
3200             }
3201 87         154 my $ar = [ @{$self->get_row_ar($k)} ];
  87         208  
3202              
3203 87         344 $self->_execute_write_update_before($ar);
3204 87         278 $self->_apply_write_fields($ar);
3205              
3206 87         628 $csvout->print($outh, $ar);
3207 87         989 $rows_written++;
3208             }
3209              
3210             } else {
3211              
3212             #
3213             # No in-memory content available: we read and write in parallel.
3214             #
3215              
3216 79         317 $self->_register_pass("write()");
3217              
3218 79         335 $self->_open_read();
3219 79         173 my $ar;
3220             my $row_hr;
3221 79         339 while ($self->_read_one_record_from_input(\$ar, \$row_hr)) {
3222 301 100       812 if (defined($write_filter_hr)) {
3223 46 100       112 next unless $write_filter_hr->($row_hr, \%stats, $self->get_recnum());
3224             }
3225 275         610 $ar = [ @{$ar} ];
  275         803  
3226              
3227 275         1138 $self->_execute_write_update_before($ar);
3228 272         800 $self->_apply_write_fields($ar);
3229              
3230 272         1914 $csvout->print($outh, $ar);
3231 272         2864 $rows_written++;
3232             }
3233 72         330 $self->_close_read();
3234              
3235 72         177 $do_status_reset = 1
3236             }
3237              
3238 96         400 $self->_close_outh();
3239              
3240 96 50       323 if ($verbose) {
3241 0         0 $self->_print("-- $out_file writing end: $rows_written row(s) written\n");
3242 0         0 for my $k (sort keys %stats) {
3243 0         0 $self->_printf(" %7d %s\n", $stats{$k}, $k);
3244             }
3245             }
3246              
3247 96 100       303 if ($do_status_reset) {
3248 72 50       310 return undef unless $self->_status_reset();
3249             }
3250 96         1094 return $self;
3251             }
3252              
3253              
3254              
3255             #
3256             # * *** ***************************************************************************
3257             # * *** ***************************************************************************
3258             # * OBJ ***************************************************************************
3259             # * *** ***************************************************************************
3260             # * *** ***************************************************************************
3261             #
3262              
3263             #
3264             # The subs below assume Text::AutoCSV can be in status S4 = all in memory.
3265             #
3266              
3267              
3268             sub get_keys {
3269 31     31 1 2672 my $self = shift;
3270 31         242 validate_pos(@_);
3271              
3272 31 50       113 return undef unless $self->_status_forward('S4');
3273              
3274 31         63 my $last_key = @{$self->{_flat}} - 1;
  31         85  
3275 31         106 my @r = (0..$last_key);
3276              
3277 31         108 return @r;
3278             }
3279              
3280             sub get_row_ar {
3281 338     338 1 1307 my $self = shift;
3282 338         2565 validate_pos(@_, {type => SCALAR});
3283 338         1052 my ($key) = @_;
3284              
3285 338 50       912 return undef unless $self->_status_forward('S4');
3286              
3287 338 50       983 unless (defined($key)) {
3288 0         0 $self->_print_error("get_row_ar(): \$key is not defined!");
3289 0         0 return undef;
3290             }
3291              
3292 338 100       997 $self->_print_error("unknown row '$key'") unless defined($self->{_flat}->[$key]);
3293 338         882 return $self->{_flat}->[$key];
3294             }
3295              
3296             sub get_row_hr {
3297 248     248 1 795 my $self = shift;
3298 248         2558 validate_pos(@_, {type => SCALAR});
3299 248         881 my ($key) = @_;
3300              
3301 248         747 my $ar = $self->get_row_ar($key);
3302 248 100       719 return undef unless defined($ar);
3303              
3304 246         790 return $self->_ar_to_hr($ar);
3305             }
3306              
3307             #
3308             # Could be made much more efficient (directly read $self->{_flat} instead of calling get_row_hr
3309             # that itself calls get_row_ar).
3310             # I leave it as is because get_hr_all is not good practice (it is not scalable), it was
3311             # primarily done to ease test plan.
3312             #
3313             # By the way I may make it one day not available by default, requesting caller to tune some
3314             # variable (like { $Text::AutoCSV::i_am_the_test_plan = 1 }) to expose it.
3315             #
3316             sub get_hr_all {
3317 106     106 1 5160 my $self = shift;
3318 106         859 validate_pos(@_);
3319              
3320 106         290 my @resp;
3321 106         448 $self->reset_next_record_hr();
3322 106         387 while (my $hr = $self->get_next_record_hr()) {
3323 404         1187 push @resp, $hr;
3324             }
3325 103         476 return @resp;
3326             }
3327              
3328             sub get_recnum {
3329 190     190 1 350 my $self = shift;
3330 190         1102 validate_pos(@_);
3331              
3332 190 50       642 return -1 unless $self->{_read_in_progress};
3333 190         458 return _get_def($self->{_row_read}, -1);
3334             }
3335              
3336             sub _check_for_search {
3337 890     890   2116 my ($self, $field) = @_;
3338 890 50       2331 return undef unless $self->_status_forward('S4');
3339              
3340 889 100       3707 return 1 if exists $self->{_named_fields}->{$field};
3341             $self->_print_error("search: unknown field '$field'",
3342 6         30 0, ERR_UNKNOWN_FIELD, $self->{_named_fields});
3343             }
3344              
3345             sub get_cell {
3346 4     4 1 671 my $self = shift;
3347 4         48 validate_pos(@_, {type => SCALAR}, {type => SCALAR});
3348 4         15 my ($key, $field) = @_;
3349              
3350 4 50       13 return undef unless $self->_check_for_search($field);
3351 3         10 my $row = $self->get_row_hr($key);
3352 3 100       19 return $row unless defined($row);
3353 2         12 return $row->{$field};
3354             }
3355              
3356             sub get_values {
3357 9     9 1 4150 my $self = shift;
3358 9         132 validate_pos(@_, {type => SCALAR}, {type => UNDEF | CODEREF, optional => 1});
3359 9         44 my ($field, $filter_subref) = @_;
3360              
3361 9 50       34 return undef unless $self->_check_for_search($field);
3362              
3363 9         18 my @values;
3364 9         35 $self->reset_next_record_hr();
3365 9         31 while (my $hr = $self->get_next_record_hr()) {
3366 53 100       118 if (defined($filter_subref)) {
3367 23         41 local $_ = $hr->{$field};
3368 23 100       51 next unless $filter_subref->();
3369             }
3370 42         181 push @values, $hr->{$field};
3371             }
3372 9         43 return @values;
3373             }
3374              
3375             sub _get_hash_and_projector {
3376 561     561   1318 my ($self, $field, $arg_opts) = @_;
3377              
3378 561         1105 my $_debug = $self->{_debug};
3379 561         1042 my $_debugh = $self->{_debugh};
3380              
3381 561 50       1415 my %opts = %{$arg_opts} if defined($arg_opts);
  561         1726  
3382              
3383 561         2351 my $opt_case = _get_def($opts{'case'}, $self->{search_case}, $DEF_SEARCH_CASE);
3384 561         2184 my $opt_trim = _get_def($opts{'trim'}, $self->{search_trim}, $DEF_SEARCH_TRIM);
3385             my $opt_ignore_empty = _get_def($opts{'ignore_empty'}, $self->{search_ignore_empty},
3386 561         2058 $DEF_SEARCH_IGNORE_EMPTY);
3387             my $opt_ignacc = _get_def($opts{'ignore_accents'}, $self->{search_ignore_accents},
3388 561         2042 $DEF_SEARCH_IGNORE_ACCENTS);
3389              
3390 561         1868 my $opts_stringified = $opt_case . $opt_trim . $opt_ignore_empty . $opt_ignacc;
3391 561         1423 my $hash_name = "_h${field}_${opts_stringified}";
3392 561         1188 my $projector_name = "_p${field}_${opts_stringified}";
3393              
3394 561 100 66     2959 if (exists $self->{$hash_name} and exists $self->{$projector_name}) {
    50 33        
3395 489 50       1268 print($_debugh "Search by key '$field': using existing hash and projector (" .
3396             "$hash_name, $projector_name)\n") if $_debug;
3397 489         1906 return ($hash_name, $projector_name);
3398             } elsif (exists $self->{$hash_name} or exists $self->{$projector_name}) {
3399 0         0 confess "Man, check your $PKG module code now!";
3400             }
3401              
3402 72 50       237 print($_debugh "Search by key '$field': building hash\n") if $_debug;
3403              
3404             #
3405             # Projectors
3406             #
3407             # The projector contains subs to derivate the search key from the field value.
3408             # At the moment it is used to manage with case / without case searches and with trim / without trim
3409             # searches (meaning, ignoring spaces at beginning and end of fields)
3410             #
3411             # Why naming it a projector?
3412             # Because if you run it twice on a value, the second run should produce the same result, meaning:
3413             # p(p(x)) = p(x) whatever x
3414             #
3415              
3416 72         145 my @projectors;
3417              
3418             # Add case removal in the projector function list
3419 72 100   1479   398 push @projectors, sub { return lc(shift); } unless $opt_case;
  1479         4573  
3420              
3421             # Add trim in the projector function list
3422 72 100       219 if ($opt_trim) {
3423             push @projectors,
3424             sub {
3425 1479     1479   8134 my $v = shift;
3426 1479         5239 $v =~ s/^\s+|\s+$//g;
3427 1479         3808 return $v;
3428 57         213 };
3429             }
3430              
3431             # Add remove_accents in the projector function list
3432 72 100   1886   328 push @projectors, sub { return remove_accents(shift); } if $opt_ignacc;
  1886         4174  
3433              
3434             my $projector = sub {
3435 1906     1906   4245 my $v = _get_def($_[0], '');
3436 1906         4688 $v = $_->($v) foreach (@projectors);
3437 1906         4083 return $v;
3438 72         251 };
3439              
3440             #
3441             # Filter
3442             #
3443             # As opposed to projectors above (where a search key is transformed), the idea now is to ignore
3444             # certain keys when doing a search.
3445             # At the moment, used to manage searches with / without empty values.
3446             #
3447             # That is to say: shall we use empty value as a regular value to search on, as in
3448             # my @results = $self->search('FIELDNAME', '');
3449             # ?
3450             #
3451             # Right now we don't use an array-based construct, that'd allow to chain filters with one another
3452             # (as we now have only one filter to deal with), later, we may use an array of filters, as done with
3453             # projectors...
3454             #
3455              
3456 72         130 my $filter;
3457 72 100       175 if ($opt_ignore_empty) {
3458 1268     1268   3588 $filter = sub { return $_[0] ne ''; }
3459 61         191 } else {
3460 77     77   183 $filter = sub { return 1; }
3461 11         48 }
3462              
3463 72         154 my %h;
3464             my $k;
3465 72         286 $self->reset_next_record_hr();
3466 72         320 while (my $hr = $self->get_next_record_hr(\$k)) {
3467 1345         2470 my $kv = $hr->{$field};
3468 1345         2567 my $p = $projector->($kv);
3469 1345 100       2723 unless ($filter->($p)) {
3470 76 50       184 print($_debugh "Ignoring key value '$p' in hash build\n") if $_debug;
3471 76         256 next;
3472             }
3473 1269         2012 push @{$h{$p}}, $k;
  1269         6049  
3474             }
3475 72         486 for (keys %h) {
3476 1178         1638 @{$h{$_}} = sort { $a <=> $b } @{$h{$_}};
  1178         2405  
  103         273  
  1178         2368  
3477             }
3478              
3479 72         247 $self->{_hash_build_count}++;
3480 72         853 $self->{$hash_name} = { %h };
3481 72         260 $self->{$projector_name} = $projector;
3482 72         564 return ($hash_name, $projector_name);
3483             }
3484              
3485             sub _get_hash_build_count {
3486 6     6   1488 my $self = shift;
3487              
3488 6         33 return _get_def($self->{_hash_build_count}, 0);
3489             }
3490              
3491             sub search {
3492 563     563 1 14062 my $self = shift;
3493 563         6299 validate_pos(@_,
3494             {type => SCALAR}, {type => UNDEF | SCALAR}, {type => UNDEF | HASHREF, optional => 1});
3495 563         2234 my ($field, $value, $param_opts) = @_;
3496              
3497 563         1201 my $croak_if_error = $self->{croak_if_error};
3498              
3499             #
3500             # FIXME?
3501             # A bit overkill to check options each time search is called...
3502             # To be thought about.
3503             #
3504              
3505 563 100       1548 my @tmp = %{$param_opts} if $param_opts;
  421         1572  
3506 563         9833 my %opts = validate(@tmp, $SEARCH_VALIDATE_OPTIONS);
3507              
3508 562 50       2778 return undef unless $self->_check_for_search($field);
3509              
3510             # $self->_print_error("undef value in search call") if !defined($value);
3511 561 50       1448 $value = '' unless defined($value);
3512              
3513 561         1661 my ($hash_name, $projector_name) = $self->_get_hash_and_projector($field, \%opts);
3514              
3515 561         1925 my $ret = $self->{$hash_name}->{$self->{$projector_name}->($value)};
3516              
3517 561 100       2643 return $ret if defined($ret);
3518 185         724 return [ ];
3519             }
3520              
3521             sub search_1hr {
3522 22     22 1 10058 my $self = shift;
3523 22         332 validate_pos(@_,
3524             {type => SCALAR}, {type => UNDEF | SCALAR}, {type => UNDEF | HASHREF, optional => 1});
3525 22         96 my ($field, $value, $arg_opts) = @_;
3526              
3527 22         72 my $r = $self->search($field, $value, $arg_opts);
3528              
3529 22 100       74 return undef unless defined($r->[0]);
3530              
3531 20         48 my $opts = _get_def($arg_opts, { });
3532             my $opt_ignore_ambiguous = _get_def($opts->{'ignore_ambiguous'},
3533 20         72 $self->{'search_ignore_ambiguous'}, $DEF_SEARCH_IGNORE_AMBIGUOUS);
3534              
3535 20 100 100     38 return undef if @{$r} >= 2 and !$opt_ignore_ambiguous;
  20         107  
3536 14         50 return $self->get_row_hr($r->[0]);
3537             }
3538              
3539             sub vlookup {
3540 317     317 1 33785 my $self = shift;
3541 317         5554 validate_pos(@_, {type => SCALAR}, {type => UNDEF | SCALAR}, {type => SCALAR},
3542             {type => UNDEF | HASHREF, optional => 1});
3543 317         1630 my ($searched_field, $value, $target_field, $arg_opts) = @_;
3544              
3545 317         991 my $r = $self->search($searched_field, $value, $arg_opts);
3546 315 50       945 return undef unless $self->_check_for_search($target_field);
3547              
3548 314         924 my $opts = _get_def($arg_opts, { });
3549 314 100 66     989 unless (defined($r->[0])) {
3550             return (exists $opts->{'value_if_not_found'} ? $opts->{'value_if_not_found'} :
3551 143 100       682 $self->{'search_value_if_not_found'});
3552             } elsif (@{$r} >= 2) {
3553             my $opt_ignore_ambiguous = _get_def($opts->{'ignore_ambiguous'},
3554             $self->{'search_ignore_ambiguous'}, $DEF_SEARCH_IGNORE_AMBIGUOUS);
3555             return (exists $opts->{'value_if_ambiguous'} ? $opts->{'value_if_ambiguous'} :
3556             $self->{'search_value_if_ambiguous'}) if !$opt_ignore_ambiguous;
3557             }
3558              
3559 144 100       458 return $opts->{value_if_found} if exists $opts->{value_if_found};
3560 140 50       394 return $self->{search_value_if_found} if exists $opts->{search_value_if_found};
3561              
3562 140         449 my $hr = $self->get_row_hr($r->[0]);
3563              
3564 140         730 return $hr->{$target_field};
3565             }
3566              
3567             1;
3568              
3569             __END__
3570              
3571             =pod
3572              
3573             =encoding UTF-8
3574              
3575             =head1 NAME
3576              
3577             Text::AutoCSV - helper module to automate the use of Text::CSV
3578              
3579             =head1 VERSION
3580              
3581             version 1.1.8
3582              
3583             =head1 SYNOPSIS
3584              
3585             By default, Text::AutoCSV will detect the following characteristics of the input:
3586              
3587             - The separator, among ",", ";" and "\t" (tab)
3588              
3589             - The escape character, among '"' (double-quote) and '\\' (backslash)
3590              
3591             - Try UTF-8 and if it fails, fall back on latin1
3592              
3593             - Read the header line and compute field names
3594              
3595             - If asked to (see L</fields_dates_auto>), detect any field that contains a DateTime value, trying
3596             20 date formats, possibly followed by a time (6 time formats tested)
3597              
3598             - If asked to (see L</fields_dates>), detect DateTime format of certain fields, croak if no DateTime
3599             format can be worked out
3600              
3601             - Fields identified as containing a DateTime value (L</fields_dates_auto> or L</fields_dates>) are
3602             stored as DateTime objects by default
3603              
3604             Text::AutoCSV also provides methods to search on fields (using cached hash tables) and it can
3605             populate the value of "remote" fields, made from joining 2 CSV files with a key-value search
3606              
3607             =head2 General
3608              
3609             use Text::AutoCSV;
3610              
3611             Text::AutoCSV->new()->write(); # Read CSV data from std input, write to std output
3612              
3613             Text::AutoCSV->new(in_file => 'f.csv')->write(); # Read CSV data from f.csv, write to std output
3614              
3615             # Read CSV data from f.csv, write to g.csv
3616             Text::AutoCSV->new(in_file => 'f.csv', out_file => 'g.csv')->write();
3617              
3618             # "Rewrite" CSV file by printing out records as a list (separated by line breaks) of field
3619             # name followed by its value.
3620             my $csv = Text::AutoCSV->new(in_file => 'in.csv', walker_hr => \&walk);
3621             my @cols = $csv->get_fields_names();
3622             $csv->read();
3623             sub walk {
3624             my %rec = %{$_[0]};
3625             for (@cols) {
3626             next if $_ eq '';
3627             print("$_ => ", $rec{$_}, "\n");
3628             }
3629             print("\n");
3630             }
3631              
3632             =head2 OBJ-ish functions
3633              
3634             # Identify column internal names with more flexibility as the default mechanism
3635             my $csv = Text::AutoCSV->new(in_file => 'zips.csv',
3636             fields_hr => {'CITY' => '^(city|town)', 'ZIPCODE' => '^zip(code)?$'});
3637             # Get zipcode of Claix
3638             my $z = $csv->vlookup('CITY', ' claix ', 'ZIPCODE');
3639              
3640             my $csv = Text::AutoCSV->new(in_file => 'zips.csv');
3641             # Get zipcode of Claix
3642             my $z = $csv->vlookup('CITY', ' claix ', 'ZIPCODE');
3643             # Same as above, but vlookup is strict for case and spaces around
3644             my $csv = Text::AutoCSV->new(in_file => 'zips.csv', search_case => 1, search_trim => 0);
3645             my $z = $csv->vlookup('CITY', 'Claix', 'ZIPCODE');
3646              
3647             # Create field 'MYCITY' made by taking pers.csv' ZIP column value, looking it up in the
3648             # ZIPCODE columns of zips.csv, taking CITY colmun value and naming it 'MYCITY'. Output is
3649             # written in std output.
3650             # If a zipcode is ambiguous, say it.
3651             Text::AutoCSV->new(in_file => 'pers.csv')
3652             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv',
3653             { ignore_ambiguous => 0, value_if_ambiguous => '<duplicate zipcode found!>' })->write();
3654              
3655             # Note the above can also be written using Text::AutoCSV level attributes:
3656             Text::AutoCSV->new(in_file => 'pers.csv',
3657             search_ignore_ambiguous => 0, search_value_if_ambiguous => '<duplicate zipcode found!>')
3658             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv')->write();
3659              
3660             # Create 'MYCITY' field as above, then display some statistics
3661             my $nom_compose = 0;
3662             my $zip_not_found = 0;
3663             Text::AutoCSV->new(in_file => 'pers.csv', walker_hr => \&walk)
3664             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv')->read();
3665             sub walk {
3666             my $hr = shift;
3667             $nom_compose++ if $hr->{'NAME'} =~ m/[- ]/;
3668             $zip_not_found++ unless defined($hr->{'MYCITY'});
3669             }
3670             print("Number of persons with a multi-part name: $nom_compose\n");
3671             print("Number of persons with unknown zipcode: $zip_not_found\n");
3672              
3673             =head2 Updating
3674              
3675             Text::AutoCSV->new(in_file => 'names.csv', out_file => 'ucnames.csv',
3676             read_post_update_hr => \&updt)->write();
3677             sub updt { $_[0]->{'LASTNAME'} =~ s/^.*$/\U&/; }
3678              
3679             Text::AutoCSV->new(in_file => 'squares.csv', out_file => 'checkedsquares.csv',
3680             out_filter => \&wf)->write();
3681             sub wf { return ($_[0]->{'X'} ** 2 == $_[0]->{'SQUAREOFX'}); }
3682              
3683             # Add a field for the full name, made of the concatenation of the
3684             # first name and the last name.
3685             # Also display stats about empty full names.
3686             Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'dirwithfn.csv', verbose => 1)
3687             ->field_add_computed('FULLNAME', \&calc_fn)->write();
3688             sub calc_fn {
3689             my ($field, $hr, $stats) = @_;
3690             my $fn = $hr->{'FIRSTNAME'} . ' ' . uc($hr->{'LASTNAME'});
3691             $stats->{'empty full name'}++ if $fn eq ' ';
3692             return $fn;
3693             }
3694              
3695             # Read a file with a lot of columns and keep only 2 columns in output
3696             Text::AutoCSV->new(in_file => 'big.csv', out_file => 'addr.csv',
3697             out_fields => ['NAME', 'ADDRESS'])
3698             ->out_header('ADDRESS', 'Postal Address')
3699             ->write();
3700              
3701             =head2 Datetime management
3702              
3703             # Detect any field containing a DateTime value and convert it to yyyy-mm-dd whatever the
3704             # input format is.
3705             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
3706             out_dates_format => '%F')->write();
3707              
3708             # Detect any field containing a DateTime value and convert it to a US DateTime whatever the
3709             # input format is.
3710             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
3711             out_dates_format => '%b %d, %Y, %I:%M:%S %p', out_dates_locale => 'en')->write();
3712              
3713             # Find dates of specific formats and convert it into yyyy-mm-dd
3714             Text::AutoCSV->new(in_file => 'raw.csv', out_file => 'cooked.csv',
3715             dates_formats_to_try => ['%d_%m_%Y', '%m_%d_%Y', '%Y_%m_%d'],
3716             out_dates_format => '%F')->write();
3717              
3718             # Take the dates on columns 'LASTLOGIN' and 'CREATIONDATE' and convert it into French dates
3719             # (day/month/year).
3720             # Text::AutoCSV will croak if LASTLOGIN or CREATIONDATE do not contain a DateTime format.
3721             # By default, Text::AutoCSV will try 20 different formats.
3722             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
3723             fields_dates => ['LASTLOGIN', 'CREATIONDATE'], out_dates_format => '%d/%m/%Y')->write();
3724              
3725             # Convert 2 DateTime fields into unix standard epoch
3726             # Write -1 if DateTime is empty.
3727             sub toepoch { return $_->epoch() if $_; -1; }
3728             Text::AutoCSV->new(in_file => 'stats.csv', out_file => 'stats-epoch.csv',
3729             fields_dates => ['ATIME', 'MTIME'])
3730             ->in_map('ATIME', \&toepoch)
3731             ->in_map('MTIME', \&toepoch)
3732             ->write();
3733              
3734             # Do the other way round from above: convert 2 fields containing unix standard epoch into a
3735             # string displaying a human-readable DateTime.
3736             my $formatter = DateTime::Format::Strptime->new(pattern => 'DATE=%F, TIME=%T');
3737             sub fromepoch {
3738             return $formatter->format_datetime(DateTime->from_epoch(epoch => $_)) if $_ >= 0;
3739             '';
3740             }
3741             $csv = Text::AutoCSV->new(in_file => 'stats-epoch.csv', out_file => 'stats2.csv')
3742             ->in_map('ATIME', \&fromepoch)
3743             ->in_map('MTIME', \&fromepoch)
3744             ->write();
3745              
3746             =head2 Miscellaneous
3747              
3748             use Text::AutoCSV 'remove_accents';
3749             # Output 'Francais: etre elementaire, Tcheque: sluzba dum' followed by a new line.
3750             print remove_accents("Français: être élémentaire, Tchèque: služba dům"), "\n";
3751              
3752             =for Pod::Coverage ERR_UNKNOWN_FIELD
3753              
3754             =head1 NAME
3755              
3756             Text::AutoCSV - helper module to automate the use of Text::CSV
3757              
3758             =head1 METHODS
3759              
3760             =head2 new
3761              
3762             my $csv = Text::AutoCSV->new(%attr);
3763              
3764             (Class method) Returns a new instance of Text::AutoCSV. The object attributes are described by the
3765             hash C<%attr> (can be empty).
3766              
3767             Currently the following attributes are available:
3768              
3769             =over 4
3770              
3771             =item Preliminary note about L</fields_hr>, L</fields_ar> and L</fields_column_names> attributes
3772              
3773             By default, Text::AutoCSV assumes the input has a header and will use the field values of this first
3774             line (the header) to work out the column internal names. These internal names are used everywhere in
3775             Text::AutoCSV to designate columns.
3776              
3777             The values are transformed as follows:
3778              
3779             - All accents are removed using the exportable function L</remove_accents>.
3780              
3781             - Any non-alphanumeric character is removed (except underscore) and all letters are switched to
3782             upper case. The regex to do this is
3783              
3784             s/[^[:alnum:]_]//gi; s/^.*$/\U$&/;
3785              
3786             Thus a header line of
3787              
3788             'Office Number 1,Office_2,Personal Number'
3789              
3790             will produce the internal column names
3791              
3792             'OFFICENUMBER1' (first column)
3793              
3794             'OFFICE_2' (second column)
3795              
3796             'PERSONALNUMBER' (third column).
3797              
3798             The attribute L</fields_hr>, L</fields_ar> or L</fields_column_names> (only one of the three is
3799             useful at a time) allows to change this behavior.
3800              
3801             B<NOTE>
3802              
3803             The removal of accents is *not* a conversion to us-ascii, see L</remove_accents> for details.
3804              
3805             =item Preliminary note about fields reading
3806              
3807             Functions that are given a field name (L</get_cell>, L</vlookup>, L</field_add_copy>, ...) raise an
3808             error if the field requested does not exist.
3809              
3810             B<SO WILL THE HASHREFS GIVEN BY Text::AutoCSV:> when a function returns a hashref (L</search_1hr>,
3811             L</get_row_hr>, ...), the hash is locked with the C<lock_keys> function of C<Hash::Util>. Any
3812             attempt to read a non-existing key from the hash causes a croak. This feature is de-activated if you
3813             specified C<croak_if_error =E<gt> 0> when creating Text::AutoCSV object.
3814              
3815             =item in_file
3816              
3817             The name of the file to read CSV data from.
3818              
3819             If not specified or empty, read standard input.
3820              
3821             Example:
3822              
3823             my $csv = Text::AutoCSV->new(in_file => 'in.csv');
3824              
3825             =item inh
3826              
3827             File handle to read CSV data from.
3828             Normally you don't want to specify this attribute.
3829              
3830             C<inh> is useful if you don't like the way Text::AutoCSV opens the input file for you.
3831              
3832             Example:
3833              
3834             open my $inh, "producecsv.sh|";
3835             my $csv = Text::AutoCSV->new(inh => $inh);
3836              
3837             =item encoding
3838              
3839             Comma-separated list of encodings to try to read input.
3840              
3841             Note that finding the correct encoding of any given input is overkill. This script just tries
3842             encodings one after the other, and selects the first one that does not trigger a warning during
3843             reading of input. If all produce warnings, select the first one.
3844              
3845             The encoding chosen is used in output, unless attribute L</out_encoding> is specified.
3846              
3847             Value by default: 'UTF-8,latin1'
3848              
3849             B<IMPORTANT>
3850              
3851             If one tries something like C<encoding =E<gt> 'latin1,UTF-8'>, it'll almost never detect UTF-8
3852             because latin1 rarely triggers warnings during reading. It tends to be also true with encodings like
3853             UTF-16 that can remain happy with various inputs (sometimes resulting in Western languages turned
3854             into Chinese text).
3855              
3856             Ultimately this attribute should be used with a unique value. The result when using more than one
3857             value can produce weird results and should be considered B<experimental>.
3858              
3859             Example:
3860              
3861             my $csv = Text::AutoCSV->new(in_file => 'w.csv', encoding => 'UTF-16');
3862              
3863             =item via
3864              
3865             Adds a C<via> to the file opening instruction performed by Text::AutoCSV. You don't want to use it
3866             under normal circumstances.
3867              
3868             The value should start with a ':' character (Text::AutoCSV won't add one for you).
3869              
3870             Value by default: none
3871              
3872             Example:
3873              
3874             my $csv = Text::AutoCSV->new(in_file => 'in.csv', via => ':raw:perlio:UTF-32:crlf');
3875              
3876             =item dont_mess_with_encoding
3877              
3878             If true, just ignore completely encoding and don't try to alter I/O operations with encoding
3879             considerations (using C<binmode> instruction). Note that if inh attribute is specified, then
3880             Text::AutoCSV will consider the caller manages encoding for himself and dont_mess_with_encoding will
3881             be automatically set, too.
3882              
3883             B<IMPORTANT>
3884              
3885             This attribute does not mean perl will totally ignore encoding and would consider character strings
3886             as bytes for example. The meaning of L</dont_mess_with_encoding> is that Text::AutoCSV itself will
3887             totally ignore encoding matters, and leave it entirely to Perl' default.
3888              
3889             Value by default:
3890              
3891             0 if inh attribute is not set
3892             1 if inh attribute is set
3893              
3894             Example:
3895              
3896             my $csv = Text::AutoCSV->new(in_file => 'in.csv', dont_mess_with_encoding => 1);
3897              
3898             =item sep_char
3899              
3900             Specify the CSV separator character. Turns off separator auto-detection. This attribute is passed as
3901             is to C<Text::CSV-E<gt>new()>.
3902              
3903             Example:
3904              
3905             my $csv = Text::AutoCSV->new(in_file => 'in.csv', sep_char => ';');
3906              
3907             =item quote_char
3908              
3909             Specify the field quote character. This attribute is passed as is to C<Text::CSV-E<gt>new()>.
3910              
3911             Value by default: double quote ('"')
3912              
3913             Example:
3914              
3915             my $csv = Text::AutoCSV->new(in_file => 'in.csv', quote_char => '\'');
3916              
3917             =item escape_char
3918              
3919             Specify the escape character. Turns off escape character auto-detection. This attribute is passed as
3920             is to C<Text::CSV-E<gt>new()>.
3921              
3922             Value by default: backslash ('\\')
3923              
3924             Example:
3925              
3926             my $csv = Text::AutoCSV->new(in_file => 'in.csv', escape_char => '"');
3927              
3928             =item in_csvobj
3929              
3930             Text::CSV object to use.
3931             Normally you don't want to specify this attribute.
3932              
3933             By default, Text::AutoCSV will manage creating such an object and will work hard to detect the
3934             parameters it requires.
3935              
3936             Defining C<in_csvobj> attribute turns off separator character and escape character auto-detection.
3937              
3938             Using this attribute workarounds Text::AutoCSV philosophy a bit, but you may need it in case
3939             Text::AutoCSV behavior is not suitable for Text::CSV creation.
3940              
3941             Example:
3942              
3943             my $tcsv = Text::CSV->new();
3944             my $acsv = Text::AutoCSV->new(in_file => 'in.csv', in_csvobj => $tcsv);
3945              
3946             =item has_headers
3947              
3948             If true, Text::AutoCSV assumes the input has a header line.
3949              
3950             Value by default: 1
3951              
3952             Example:
3953              
3954             my $csv = Text::AutoCSV->new(in_file => 'in.csv', has_headers => 0);
3955              
3956             =item fields_hr
3957              
3958             (Only if input has a header line) Hash ref that contains column internal names along with a regular
3959             expression to find it in the header line.
3960             For example if you have:
3961              
3962             my $csv = Text::AutoCSV->new(in_file => 'in.csv',
3963             fields_hr => {'PHONE OFFICE' => '^office phone nu',
3964             'PHONE PERSONAL' => '^personal phone nu'});
3965              
3966             And the header line is
3967              
3968             'Personal Phone Number,Office Phone Number'
3969              
3970             the column name 'PHONE OFFICE' will designate the second column and the column name 'PHONE PERSONAL'
3971             will designate the first column.
3972              
3973             You can choose column names like 'Phone Office' and 'Phone Personal' as well.
3974              
3975             The regex search is case insensitive.
3976              
3977             =item fields_ar
3978              
3979             (Only if input has a header line) Array ref that contains column internal names. The array is used
3980             to create a hash ref of the same kind as L</fields_hr>, by wrapping the column name in a regex. The
3981             names are surrounded by a leading '^' and a trailing '$', meaning, the name must match the entire
3982             field name.
3983              
3984             For example
3985              
3986             fields_ar => ['OFFICENUMBER', 'PERSONALNUMBER']
3987              
3988             is strictly equivalent to
3989              
3990             fields_hr => {'OFFICENUMBER' => '^officenumber$', 'PERSONALNUMBER' = '^personalnumber$'}
3991              
3992             The regex search is case insensitive.
3993              
3994             C<fields_ar> is useful if the internal names are identical to the file column names. It avoids
3995             repeating the names over and over as would happen if using L</fields_hr> attribute.
3996              
3997             I<NOTE>
3998              
3999             You might wonder why using fields_ar as opposed to Text::AutoCSV default' mechanism. There are two
4000             reasons for that:
4001              
4002             1- Text::AutoCSV removes spaces from column names, and some people may want another behavior. A
4003             header name of 'Phone Number' will get an internal column name of 'PHONENUMBER' (default behavior,
4004             if none of fields_hr, fields_ar and fields_column_names attributes is specified), and one may prefer
4005             'PHONE NUMBER' or 'phone number' or whatsoever.
4006              
4007             2- By specifying a list of columns using either of fields_hr or fields_ar, you not only map column
4008             names as found in the header line to internal column names: you also I<request> these columns to be
4009             available. If one of the requested columns cannot be found, Text::AutoCSV will croak (default) or
4010             print an error and return an undef object (if created with C<croak_if_error =E<gt> 0>).
4011              
4012             =item fields_column_names
4013              
4014             Array ref of column internal names, in the order of columns in file. This attribute works like the
4015             C<column_names> attribute of Text::CSV. It'll just assign names to columns one by one, regardless of
4016             what the header line contains. It'll work also if the file has no header line.
4017              
4018             Example:
4019              
4020             my $csv = Text::AutoCSV->new(in_file => 'in.csv',
4021             fields_column_names => ['My COL1', '', 'My COL3']);
4022              
4023             =item out_file
4024              
4025             Output file when executing the L</write> method.
4026              
4027             If not specified or empty, write to standard output.
4028              
4029             Example:
4030              
4031             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv');
4032              
4033             =item outh
4034              
4035             File handle to write CSV data to when executing the L</write> method.
4036             Normally you don't want to specify this attribute.
4037              
4038             C<outh> is useful if you don't like the way Text::AutoCSV opens the output file for you.
4039              
4040             Example:
4041              
4042             my $outh = open "myin.csv', ">>";
4043             my $csv = Text::AutoCSV->new(in_file => 'in.csv', has_headers => 0, outh => $outh);
4044              
4045             =item out_encoding
4046              
4047             Enforce the encoding of output.
4048              
4049             Value by default: input encoding
4050              
4051             Example:
4052              
4053             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
4054             out_encoding => 'UTF-16');
4055              
4056             =item out_utf8_bom
4057              
4058             Enforce BOM (Byte-Order-Mark) on output, when it is UTF8. If output encoding is not UTF-8, this
4059             attribute is ignored.
4060              
4061             B<NOTE>
4062              
4063             UTF-8 needs no BOM (there is no Byte-Order in UTF-8), and in practice, UTF8-encoded files rarely
4064             have a BOM.
4065              
4066             Using this attribute is not recommended. It is provided for the sake of completeness, and also to
4067             produce Unicode files Microsoft EXCEL will be happy to read.
4068              
4069             At first sight it would seem more logical to make EXCEL happy with something like this:
4070              
4071             out_encoding => 'UTF-16'
4072              
4073             But... While EXCEL will identify UTF-16 and read it as such, it will not take into account the BOM
4074             found at the beginning. In the end the first cell will have 2 useless characters prepended. The only
4075             solution the author knows to workaround this issue if to use UTF-8 as output encoding, and enforce a
4076             BOM. That is, use:
4077              
4078             ..., out_encoding => 'UTF-8', out_utf8_bom => 1, ...
4079              
4080             =item out_sep_char
4081              
4082             Enforce the output CSV separator character.
4083              
4084             Value by default: input separator
4085              
4086             Example:
4087              
4088             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', out_sep_char => ',');
4089              
4090             =item out_quote_char
4091              
4092             Enforce the output CSV quote character.
4093              
4094             Value by default: input quote character
4095              
4096             Example:
4097              
4098             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', out_quote_char => '"');
4099              
4100             =item out_escape_char
4101              
4102             Enforce the output CSV escape character.
4103              
4104             Value by default: input escape character
4105              
4106             Example:
4107              
4108             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
4109             out_escape_char_char => '\\');
4110              
4111             =item out_always_quote
4112              
4113             If true, quote all fields of output (set always_quote of Text::CSV).
4114              
4115             If false, don't quote all fields of output (don't set C<always_quote> of Text::CSV).
4116              
4117             Value by default: same as what is found in input
4118              
4119             While reading input, Text::AutoCSV works out whether or not all fields were quoted. If yes, then the
4120             output Text::CSV object has the always_quote attribute set, if no, then the output Text::CSV object
4121             does not have this attribute set.
4122              
4123             Example:
4124              
4125             my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', out_always_quote => 1);
4126              
4127             =item out_has_headers
4128              
4129             If true, when writing output, write a header line on first line.
4130              
4131             If false, when writing output, don't write a header line on first line.
4132              
4133             Value by default: same as has_headers attribute
4134              
4135             Example 1
4136              
4137             Read standard input and write to standard output, removing the header line.
4138              
4139             Text::AutoCSV->new(out_has_headers => 0)->write();
4140              
4141             Example 2
4142              
4143             Read standard input and write to standard output, adding a header line.
4144              
4145             Text::AutoCSV->new(fields_column_names => ['MYCOL1', 'MYCOL2'], out_has_headers => 1)->write();
4146              
4147             =item no_undef
4148              
4149             If true, non-existent column values are set to an empty string instead of undef. It is also done on
4150             extra fields that happen to have an undef value (for example when the target of a linked field is
4151             not found).
4152              
4153             Note this attribute does not work on callback functions output set with L</in_map>: for example
4154             empty DateTime values (on fields identified as containing a date/time, see C<dates_*> attributes
4155             below) are set to C<undef>, even while C<no_undef> is set. Indeed setting it to an empty string
4156             while non-empty values would contain a Datetime object would not be clean. An empty value in a
4157             placeholder containing an object must be undef.
4158              
4159             Since version 1.1.5 of Text::AutoCSV, C<no_undef> is examined when sending parameter ($_) to
4160             L</in_map> callback: an undef value is now passed as is (as undef), unless C<no_undef> is set. If
4161             C<no_undef> is set, and field value is undef, then $_ is set to the empty string ('') when calling
4162             callback defined by L</in_map>. This new behavior was put in place to be consistent with what is
4163             being done with DateTime values.
4164              
4165             Value by default: 0
4166              
4167             Example:
4168              
4169             my $csv = Text::AutoCSV->new(in_file => 'in.csv', no_undef => 1);
4170              
4171             =item read_post_update_hr
4172              
4173             To be set to a ref sub. Each time a record is read from input, call C<read_post_update_hr> to update
4174             the hash ref of the record. The sub is called with 2 arguments: the hash ref to the record value and
4175             the hash ref to stats.
4176              
4177             The stats allow to count events and are printed in the end of reading in case Text::AutoCSV is
4178             called in verbose mode (C<verbose =E<gt> 1>).
4179              
4180             For example, the C<read_post_update_hr> below will turn column 'CITY' values in upper case and count
4181             occurences of empty cities in stat display:
4182              
4183             Text::AutoCSV->new(in_file => 'addresses.csv', read_post_update_hr => \&updt, verbose => 1)
4184             ->write();
4185             sub updt {
4186             my ($hr, $stats) = @_;
4187             $hr->{'CITY'} =~ s/^.*$/\U$&/;
4188             $stats->{'empty city encountered'}++ if $hr->{'CITY'} eq '';
4189             }
4190              
4191             B<IMPORTANT>
4192              
4193             You cannot create a field this way. To create a field, you have to use the member functions
4194             L</field_add_link>, L</field_add_copy> or L</field_add_computed>.
4195              
4196             B<NOTE>
4197              
4198             If you wish to manage some updates at field level, consider registering update functions with
4199             L</in_map> and L</out_map> member functions. These functions register callbacks that work at field
4200             level and with $_ variable (thus the callback function invoked is AutoCSV-agnostic).
4201              
4202             L</in_map> updates a field after read, L</out_map> updates the field content before writing it.
4203              
4204             =item walker_hr
4205              
4206             To set to a sub ref that'll be executed each time a record is read from input. It is executed after
4207             L</read_post_update_hr>. The sub is called with 2 arguments: the hash ref to the record value and
4208             the hash ref to stats.
4209              
4210             Note L</read_post_update_hr> is meant for updating record fields just after reading, whereas
4211             L</walker_hr> is read-only.
4212              
4213             The stats allow to count events and are printed in the end of reading in case Text::AutoCSV is
4214             called in verbose mode (C<verbose =E<gt> 1>). If the L</verbose> attribute is not set, the stats are
4215             not displayed, however you can get stats by calling the get_stats function.
4216              
4217             The example below will count in the stats the number of records where the 'CITY' field is empty.
4218             Thanks to C<verbose =E<gt> 1> attribute, at the end of reading the stats are displayed.
4219              
4220             my $csv = Text::AutoCSV->new(in_file => 'addresses.csv', walker_hr => \&walk1,
4221             verbose => 1)->read();
4222             sub walk1 {
4223             my ($hr, $stats) = @_;
4224             $stats->{'empty city'}++ if $hr->{'CITY'} eq '';
4225             }
4226              
4227             =item walker_ar
4228              
4229             To set to a sub ref that'll be executed each time a record is read from input. It is executed after
4230             L</read_post_update_hr>. The sub is called with 2 arguments: the array ref to the record value and
4231             the hash ref to stats.
4232              
4233             Note L</read_post_update_hr> is meant for updating record fields just after reading, whereas
4234             C<walker_hr> is read-only.
4235              
4236             The stats allow to count events and are printed in the end of reading in case Text::AutoCSV is
4237             called in verbose mode (C<verbose =E<gt> 1>). If the L</verbose> attribute is not set, the stats are
4238             lost.
4239              
4240             The array ref contains values in their natural order in the CSV. To be used with the column names,
4241             you have to use L</get_fields_names> member function.
4242              
4243             The example below will count in the stats the number of records where the 'CITY' field is empty.
4244             Thanks to C<verbose =E<gt> 1> attribute, at the end of reading the stats are displayed. It produces
4245             the exact same result as the example in walker_hr attribute, but it uses walker_ar.
4246              
4247             use List::MoreUtils qw(first_index);
4248             my $csv = Text::AutoCSV->new(in_file => 'addresses.csv', walker_ar => \&walk2, verbose => 1);
4249             my @cols = $csv->get_fields_names();
4250             my $idxCITY = first_index { /^city$/i } @cols;
4251             die "No city field!??" if $idxCITY < 0;
4252             $csv->read();
4253             sub walk2 {
4254             my ($ar, $stats) = @_;
4255             $stats->{'empty city'}++ if $ar->[$idxCITY] eq '';
4256             }
4257              
4258             =item write_filter_hr
4259              
4260             Alias of L</out_filter>.
4261              
4262             =item out_filter
4263              
4264             To be set to a ref sub. Before writing a record to output, C<out_filter> is called and the record
4265             gets writen only if C<out_filter> return value is true. The sub is called with 1 argument: the hash
4266             ref to the record value.
4267              
4268             For example, if you want to output only records where the 'CITY' column value is Grenoble:
4269              
4270             Text::AutoCSV->new(in_file => 'addresses.csv', out_file => 'grenoble.csv',
4271             out_filter => \&filt)->write();
4272             sub filt {
4273             my $hr = shift;
4274             return 1 if $hr->{'CITY'} =~ /^grenoble$/i;
4275             return 0;
4276             }
4277              
4278             =item write_fields
4279              
4280             Alias of L</out_fields>.
4281              
4282             =item out_fields
4283              
4284             Set to an array ref. List fields to write to output.
4285              
4286             Fields are written in their order in the array ref, the first CSV column being the first element in
4287             the array, and so on. Fields not listed in B<out_fields> are not written in output.
4288              
4289             You can use empty field names to have empty columns in output.
4290              
4291             Example:
4292              
4293             Text::AutoCSV->new(in_file => 'allinfos.csv', out_file => 'only-addresses.csv',
4294             out_fields => [ 'NAME', 'ADDRESS' ] )->write();
4295              
4296             =item search_case
4297              
4298             If true, searches are case sensitive by default. Searches are done by the member functions
4299             L</search>, L</search_1hr>, L</vlookup>, and linked fields (L</field_add_link>).
4300              
4301             The search functions can also be called with the option L</case>, that takes precedence over the
4302             object-level C<search_case> attribute value. See L</vlookup> help.
4303              
4304             Value by default: 0 (by default searches are case insensitive)
4305              
4306             Example:
4307              
4308             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_case => 1);
4309              
4310             =item search_trim
4311              
4312             If true, searches ignore the presence of leading or trailing spaces in values.
4313              
4314             The search functions can also be called with the option L</trim>, that takes precedence over the
4315             object-level C<search_trim> attribute value. See L</vlookup> help.
4316              
4317             Value by default: 1 (by default searches ignore leading and trailing spaces)
4318              
4319             Example:
4320              
4321             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_trim => 0);
4322              
4323             =item search_ignore_empty
4324              
4325             If true, empty fields are not included in the search indexes.
4326              
4327             The search functions can also be called with the option L</ignore_empty>, that takes precedence over
4328             the object-level C<search_ignore_empty> attribute value. See L</vlookup> help.
4329              
4330             Value by default: 1 (by default, search of the value '' will find nothing)
4331              
4332             Example:
4333              
4334             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_ignore_empty => 0);
4335              
4336             =item search_ignore_accents
4337              
4338             If true, accents are ignored by search indexes.
4339              
4340             The search functions can also be called with the option L</ignore_accents>, that takes precedence
4341             over the object-level C<search_ignore_accents> attribute value. See L</vlookup> help.
4342              
4343             Value by default: 1 (by default, accents are ignored by search functions)
4344              
4345             Example:
4346              
4347             my $csv = Text::AutoCSV->new(in_file => 'in.csv', search_ignore_accents => 0);
4348              
4349             =item search_value_if_not_found
4350              
4351             When a search is done with a unique value to return (field_add_link member function behavior or
4352             return value of vlookup), default value of option L</value_if_not_found>. See L</vlookup>.
4353              
4354             =item search_value_if_found
4355              
4356             When a search is done with a unique value to return (field_add_link member function behavior or
4357             return value of vlookup), default value of option L</value_if_found>. See L</vlookup>.
4358              
4359             B<IMPORTANT>
4360              
4361             This attribute is extremly unusual. Once you've provided it, all vlookups and the target field value
4362             of fields created with field_add_link will all be populated with the value provided with this
4363             option.
4364              
4365             Don't use it unless you know what you are doing.
4366              
4367             =item search_ignore_ambiguous
4368              
4369             When a search is done with a unique value to return (field_add_link member function behavior or
4370             return value of search_1hr and vlookup), default value of option L</ignore_ambiguous>. See
4371             L</vlookup>.
4372              
4373             =item search_value_if_ambiguous
4374              
4375             When a search is done with a unique value to return (field_add_link member function behavior or
4376             return value of vlookup), default value of option L</value_if_ambiguous>. See L</vlookup>.
4377              
4378             =item fields_dates
4379              
4380             Array ref of field names that contain a date.
4381              
4382             Once the formats of these fields is known (auto-detection by default), each of these fields will get
4383             a specific L</in_map> sub that converts the text in a DateTime object and a L</out_map> sub that
4384             converts back from DateTime to text.
4385              
4386             B<NOTE>
4387              
4388             The L</out_map> given to a DateTime field is "defensive code": normally, L</in_map> converts text
4389             into a DateTime object and L</out_map> does the opposite, it takes a DateTime object and converts it
4390             to text. If ever L</out_map> encounters a value that is not a DateTime object, it'll just stringify
4391             it (evaluation in a string context), without calling its DateTime formatter.
4392              
4393             If the format cannot be detected for a given field, output an error message and as always when an
4394             error occurs, croak (unless L</croak_if_error> got set to 0).
4395              
4396             Value by default: none
4397              
4398             Example:
4399              
4400             my $csv = Text::AutoCSV->new(in_file => 'logins.csv',
4401             fields_dates => ['LASTLOGIN', 'CREATIONDATE']);
4402              
4403             =item fields_dates_auto
4404              
4405             Boolean value. If set to 1, will detect dates formats on all fields. Fields in which a DateTime
4406             format got detected are then managed as if they had been being listed in L</fields_dates> attribute:
4407             they get an appropriate L</in_map> sub and a L</out_map> sub to convert to and from DateTime (see
4408             L</fields_dates> attribute above).
4409              
4410             C<fields_dates_auto> looks for DateTime on all fields, but it expects nothing: it won't raise an
4411             error if no field is found that contains DateTime.
4412              
4413             Value by default: 0
4414              
4415             Example:
4416              
4417             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', fields_dates_auto => 1);
4418              
4419             =item dates_formats_to_try
4420              
4421             Array ref of string formats.
4422              
4423             Text::AutoCSV has a default built-in list of 20 date formats to try and 6 time formats (also it'll
4424             combine any date format with any time format).
4425              
4426             C<dates_formats_to_try> will replace Text::AutoCSV default format-list will the one you specify, in
4427             case the default would not produce the results you expect.
4428              
4429             The formats are written in Strptime format.
4430              
4431             Value by default (see below about the role of the pseudo-format ''):
4432              
4433             [ '',
4434             '%Y-%m-%d',
4435             '%Y.%m.%d',
4436             '%Y/%m/%d',
4437             '%m.%d.%y',
4438             '%m-%d-%Y',
4439             '%m.%d.%Y',
4440             '%m/%d/%Y',
4441             '%d-%m-%Y',
4442             '%d.%m.%Y',
4443             '%d/%m/%Y',
4444             '%m-%d-%y',
4445             '%m/%d/%y',
4446             '%d-%m-%y',
4447             '%d.%m.%y',
4448             '%d/%m/%y',
4449             '%Y%m%d%H%M%S',
4450             '%b %d, %Y',
4451             '%b %d %Y',
4452             '%b %d %T %Z %Y',
4453             '%d %b %Y',
4454             '%d %b, %Y' ]
4455              
4456             B<IMPORTANT>
4457              
4458             The empty format (empty string) has a special meaning: when specified, Text::AutoCSV will be able to
4459             identify fields that contain only a time (not preceeded by a date).
4460              
4461             B<Note>
4462              
4463             Format identification is over only when there is no more ambiguity. So the usual pitfall of US
4464             versus French dates (month-day versus day-month) gets resolved only when a date is encountered that
4465             disambiguates it (a date of 13th of the month or later).
4466              
4467             Example with a weird format that uses underscores to separate elements, using either US (month, day,
4468             year), French (day, month, year), or international (year, month, day) order:
4469              
4470             my $csv = Text::AutoCSV->new(in_file => 'logins.csv',
4471             dates_formats_to_try => ['%d_%m_%Y', '%m_%d_%Y', '%Y_%m_%d']);
4472              
4473             =item dates_formats_to_try_supp
4474              
4475             Same as L</dates_formats_to_try> but instead of replacing the default list of formats used during
4476             detection, it is added to this default list.
4477              
4478             You want to use this attribute if you need a specific DateTime format while continuing to benefit
4479             from the default list.
4480              
4481             B<IMPORTANT>
4482              
4483             Text::AutoCSV will identify a given Datetime format only when there is no ambiguity, meaning, one
4484             unique Datetime format matches (all other failed). Adding a format that already exists in the
4485             default list will prevent the format from being identified, as it'll always be ambiguous. See
4486             L</dates_formats_to_try> for the default list of formats.
4487              
4488             Example:
4489              
4490             my $csv = Text::AutoCSV->new(in_file => 'logins.csv',
4491             dates_formats_to_try_supp => ['%d_%m_%Y', '%m_%d_%Y', '%Y_%m_%d']);
4492              
4493             =item dates_ignore_trailing_chars
4494              
4495             If set to 1, DateTime auto-detection will ignore trailing text that may follow detected
4496             DateTime-like text.
4497              
4498             Value by default: 1 (do ignore trailing chars)
4499              
4500             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', dates_ignore_trailing_chars => 0);
4501              
4502             =item dates_search_time
4503              
4504             If set to 1, look for times when detecting DateTime format. That is, whenever a date format
4505             candidate is found, a longer candidate that also contains a time (after the date) is tested.
4506              
4507             Value by default: 1 (do look for times when auto-detecting DateTime formats)
4508              
4509             Example:
4510              
4511             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', dates_search_time => 0);
4512              
4513             =item dates_locales
4514              
4515             Comma-separated string of locales to test when detecting DateTime formats. Ultimately, Text::AutoCSV
4516             will try all combinations of date formats, times and locales.
4517              
4518             Value by default: none (use perl default locale)
4519              
4520             Example:
4521              
4522             my $csv = Text::AutoCSV->new(in_file => 'logins.csv', dates_locales => 'fr,de,en');
4523              
4524             =item dates_zeros_ok
4525              
4526             Boolean. If true, a date made only of 0s is regarded as being empty.
4527              
4528             For example if C<dates_zeros_ok> is False, then a date like 0000-00-00 will be always incorrect (as
4529             the day and month are out of bounds), therefore a format like '%Y-%m-%d' will never match for the
4530             field.
4531              
4532             Conversely if C<dates_zeros_ok> is true, then a date like 0000-00-00 will be processed as if being
4533             the empty string, thus the detection of format will work and when parsed, this "full of zeros" dates
4534             will be processed the same way as the empty string (= value will be undef).
4535              
4536             B<IMPORTANT>
4537              
4538             "0s dates" are evaluated to undef when parsed, thus when converted back to text (out_map), they are
4539             set to an empty string, not to the original value.
4540              
4541             Value by default: 1
4542              
4543             Example:
4544              
4545             my $csv = Text::AutoCSV->new(in_file => 'in.csv', dates_zeros_ok => 0);
4546              
4547             =item out_dates_format
4548              
4549             Enforce the format of dates in output, for all fields that contain a DateTime value.
4550              
4551             The format is written in Strptime format.
4552              
4553             Value by default: none (by default, use format detected on input)
4554              
4555             Example:
4556              
4557             # Detect any field containing a DateTime value and convert it to yyyy-mm-dd whatever the
4558             # input format is.
4559             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
4560             out_dates_format => '%F')->write();
4561              
4562             =item out_dates_locale
4563              
4564             Taken into account only if L</out_dates_format> is used.
4565              
4566             Sets the locale to apply on L</out_dates_format>.
4567              
4568             Value by default: none (by default, use the locale detected on input)
4569              
4570             Example:
4571              
4572             # Detect any field containing a DateTime value and convert it to a US DateTime whatever the
4573             # input format is.
4574             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv', fields_dates_auto => 1,
4575             out_dates_format => '%b %d, %Y, %I:%M:%S %p', out_dates_locale => 'en')->write();
4576              
4577             =item croak_if_error
4578              
4579             If true, stops the program execution in case of error.
4580              
4581             B<IMPORTANT>
4582              
4583             Value by default: 1
4584              
4585             If set to zero (C<croak_if_error =E<gt> 0>), errors are displayed as warnings. This printing can
4586             then be affected by setting the L</quiet> attribute.
4587              
4588             =item verbose
4589              
4590             If true, get Text::AutoCSV to be a bit talkative instead of speaking only when warnings and errors
4591             occur. Verbose output is printed to STDERR by default, this can be tuned with the L</infoh>
4592             attribute.
4593              
4594             Value by default: 0
4595              
4596             Example:
4597              
4598             my $csv = Text::AutoCSV->new(in_file => 'in.csv', verbose => 1);
4599              
4600             =item infoh
4601              
4602             File handle to display program's verbose output. Has effect *mainly* with attribute
4603             C<verbose =E<gt> 1>.
4604              
4605             Note B<infoh> is used to display extra information in case of error (if a field does not exist,
4606             Text::AutoCSV will display the list of existing fields). If you don't want such output, you can set
4607             C<infoh> to undef.
4608              
4609             Value by default: \*STDERR
4610              
4611             Example:
4612              
4613             open my $infoh, ">", "log.txt";
4614             my $csv = Text::AutoCSV->new(in_file => 'in.csv', infoh => $infoh);
4615              
4616             =item quiet
4617              
4618             If true, don't display warnings and errors, unless croaking.
4619              
4620             If L</croak_if_error> attribute is set (as per default), still, Text::AutoCSV will produce output
4621             (on STDERR) when croaking miserably.
4622              
4623             When using C<croak_if_error =E<gt> 0>, errors are processed as warnings and if L</quiet> is set (in
4624             addition to L</croak_if_error> being set to 0), there'll be no output. Note this way of working is
4625             not recommended, as things can go wrong without any notice to the caller.
4626              
4627             Example:
4628              
4629             my $csv = Text::AutoCSV->new(in_file => 'in.csv', quiet => 1);
4630              
4631             =item one_pass
4632              
4633             If true, Text::AutoCSV will perform one reading of the input. If other readings are triggered, it'll
4634             raise an error and no reading will be done. Should that be the case (you ask Text::AutoCSV to do
4635             something that'll trigger more than one reading of input), Text::AutoCSV will croak as is always the
4636             case if an error occurs.
4637              
4638             Normally Text::AutoCSV will do multiple reads of input to work out certain characteristics of the
4639             CSV: guess of encoding and guess of escape character.
4640              
4641             Also if member functions like L</field_add_link>, L</field_add_copy>, L</field_add_computed>,
4642             L</read> or L</write> are called after input has already been read, it'll trigger further reads as
4643             needed.
4644              
4645             If one wishes a unique read of the input to occur, one_pass attribute is to be set.
4646              
4647             When true, encoding will be assumed to be the first one in the provided list (L</encoding>
4648             attribute), if no encoding attribute is provided, it'll be the first one in the default list, to
4649             date, it is UTF-8.
4650              
4651             When true, and if attribute L</escape_char> is not set, escape_char will be assumed to be '\\'
4652             (backslash).
4653              
4654             By default, one_pass is set if inh attribute is set (caller provides the input file handle of input)
4655             or if input file is stdin (in_file attribute not set or set to an empty string).
4656              
4657             Value by default:
4658              
4659             0 if inh attribute is not set and in_file attribute is set to a non empty string
4660             1 if inh attribute is set or in_file is not set or set to an empty string
4661              
4662             Example:
4663              
4664             my $csv = Text::AutoCSV->new(in_file => 'in.csv', one_pass => 1);
4665              
4666             =back
4667              
4668             =head2 read
4669              
4670             $csv->read();
4671              
4672             Read input entirely.
4673              
4674             B<Return value>
4675              
4676             Returns the object itself in case of success.
4677             Returns undef if error.
4678              
4679             Callback functions (when defined) are invoked, in the following order:
4680              
4681             L</read_post_update_hr>, intended to do updates on fields values after each record read
4682              
4683             L</walker_ar>, called after each record read, with an array ref of fields values
4684              
4685             L</walker_hr>, called after each record read, with a hash ref of fields values
4686              
4687             Example:
4688              
4689             # Do nothing - just check CSV can be read successfully
4690             Text::AutoCSV->new(in_file => 'in.csv')->read();
4691              
4692             =head2 read_all_in_mem
4693              
4694             $csv->read_all_in_mem();
4695              
4696             Created in version 1.1.5. Before, existed only as _read_all_in_mem, meaning, was private.
4697              
4698             Read input entirely, as with L</read> function, but enforcing content to be kept in-memory.
4699              
4700             Having the content kept in-memory is implied by search functions (L</vlookup> for example). With
4701             C<read_all_in_mem> you can enforce this behavior without doing a fake search.
4702              
4703             =head2 reset_next_record_hr
4704              
4705             $csv->reset_next_record_hr();
4706              
4707             Reset the internal status to start from the beginning with L</get_next_record_hr>. Used in
4708             conjunction with L</get_next_record_hr>.
4709              
4710             =head2 get_next_record_hr
4711              
4712             my $hr = $csv->get_next_record_hr(\$opt_key);
4713              
4714             Get the next record content as a hash ref. C<$hr> is undef when the end of records has been reached.
4715              
4716             When specified, C<$opt_key> is set to the current (returned) record key.
4717              
4718             B<NOTE>
4719              
4720             You do not need to call L</reset_next_record_hr> once before using C<get_next_record_hr>.
4721              
4722             Therefore L</reset_next_record_hr> is useful only if you wish to restart from the beginning before
4723             you've reached the end of the records.
4724              
4725             B<NOTE bis>
4726              
4727             L</walker_hr> allows to execute some code each time a record is read, and it better fits with
4728             Text::AutoCSV philosophy. Using a loop with C<get_next_record_hr> is primarily meant for
4729             Text::AutoCSV internal usage. Also when using this mechanism, you get very close to original
4730             Text::CSV logic, that makes Text::AutoCSV less useful.
4731              
4732             B<Return value>
4733              
4734             A hashref of the record, or undef once there's no more record to return.
4735              
4736             Example:
4737              
4738             while (my $hr = $csv->get_next_record_hr()) {
4739             say Dumper($hr);
4740             }
4741              
4742             =head2 write
4743              
4744             $csv->write();
4745              
4746             Write input into output.
4747              
4748             B<Return value>
4749              
4750             Returns the object itself in case of success.
4751             Returns undef if error.
4752              
4753             - If the content is not in-memory at the time write() is called:
4754              
4755             Each record is read (with call of L</read_post_update_hr>, L</walker_ar> and L</walker_hr>) and then
4756             written. The read-and-write is done in sequence, each record is written to output before the next
4757             record is read from input.
4758              
4759             - If the content is in-memory at the time write() is called:
4760              
4761             No L</read> operation is performed, instead, records are directly written to output.
4762              
4763             If defined, L</out_filter> is called for each record. If the return value of L</out_filter> is
4764             false, the record is not written.
4765              
4766             Example:
4767              
4768             # Copy input to output.
4769             # As CSV is parsed in-between, this copy also checks a number of characteristics about the
4770             # input, as opposed to a plain file copy operation.
4771             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv')->write();
4772              
4773             =head2 out_header
4774              
4775             $csv->out_header($field, $header);
4776              
4777             Set the header text of C<$field> to C<$header>.
4778              
4779             By default, the input header value is rewritten as is to output. C<out_header> allows you to change
4780             it.
4781              
4782             B<Return value>
4783              
4784             Returns the object itself.
4785              
4786             Example:
4787              
4788             Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv')
4789             ->out_header('LOGIN', 'Login')
4790             ->out_header('FULLNAME', 'Full Name')
4791             ->write();
4792              
4793             =head2 print_id
4794              
4795             $csv->print_id();
4796              
4797             Print out a description of input. Write to \*STDERR by default or to L</infoh> attribute if set.
4798              
4799             The description consists in a list of a few characteristics (CSV separator and the like) followed by
4800             the list of columns with the details of each.
4801              
4802             Example of output:
4803              
4804             If you go to the C<utils> directory of this module and execute the following:
4805              
4806             ./csvcopy.pl -i f1.csv -l "1:,A->B,f2.csv" --id
4807              
4808             You will get this output:
4809              
4810             -- f1.csv:
4811             sep_char: ,
4812             escape_char: \
4813             in_encoding: UTF-8
4814             is_always_quoted: no
4815              
4816             # FIELD HEADER EXT DATA DATETIME FORMAT DATETIME LOCALE
4817             - ----- ------ -------- --------------- ---------------
4818             0 TIMESTAMP timestamp %Y%m%d%H%M%S
4819             1 A a
4820             2 B b
4821             3 C c
4822             4 D d %d/%m/%Y
4823             5 1:SITE 1:SITE link: f2.csv, chain: A->B->* (SITE)
4824             6 1:B 1:B link: f2.csv, chain: A->B->* (B)
4825              
4826             =head2 field_add_computed
4827              
4828             $csv->field_add_computed($new_field, $subref);
4829              
4830             C<$new_field> is the name of the created field.
4831              
4832             C<$subref> is a reference to a sub that'll calculate the new field value.
4833              
4834             B<Return value>
4835              
4836             Returns the object itself in case of success.
4837             Returns undef if error.
4838              
4839             Add a field calculated from other fields values. The subref runs like this:
4840              
4841             sub func {
4842             # $new_field is the name of the field (allows to use one subref for more than one field
4843             # calculation).
4844             # $hr is a hash ref of fields values.
4845             # $stats is a hash ref that gets printed (if Text::AutoCSV is created with verbose => 1)
4846             # in the end.
4847             my ($new_field, $hr, $stats) = @_;
4848              
4849             my $field_value;
4850             # ... compute $field_value
4851              
4852             return $field_value;
4853             }
4854              
4855             Example:
4856              
4857             # Add a field for the full name, made of the concatenation of the
4858             # first name and the last name.
4859             Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'dirwithfn.csv', verbose => 1)
4860             ->field_add_computed('FULLNAME', \&calc_fn)->write();
4861             sub calc_fn {
4862             my ($new_field, $hr, $stats) = @_;
4863             die "Man, you are in serious trouble!" unless $new_field eq 'FULLNAME';
4864             my $fn = $hr->{'FIRSTNAME'} . ' ' . uc($hr->{'LASTNAME'});
4865             $stats->{'empty full name'}++ if $fn eq ' ';
4866             return $fn;
4867             }
4868              
4869             =head2 field_add_copy
4870              
4871             $csv->field_add_copy($new_field, $src_field, $opt_subref);
4872              
4873             C<$new_field> if the name of the new field.
4874              
4875             C<$src_field> is the name of the field being copied.
4876              
4877             C<$opt_subref> is optional. It is a reference to a sub that takes one string (the value of
4878             C<$src_field>) and returns a string (the value assigned to C<$new_field>).
4879              
4880             B<Return value>
4881              
4882             Returns the object itself in case of success.
4883             Returns undef if error.
4884              
4885             C<field_add_copy> is a special case of L</field_add_computed>. The advantage of C<field_add_copy> is
4886             that it relies on a sub that is Text::AutoCSV "unaware", just taking one string as input and
4887             returning another string as output.
4888              
4889             B<IMPORTANT>
4890              
4891             The current field value is passed to C<field_add_copy> in $_.
4892              
4893             A call to
4894              
4895             $csv->field_add_copy($new_field, $src_field, $subref);
4896              
4897             is equivalent to
4898              
4899             $csv->field_add_computed($new_field, \&subref2);
4900             sub subref2 {
4901             my (undef, $hr) = @_;
4902             local $_ = $hr->{$src_field};
4903             return $subref->();
4904             }
4905              
4906             Example of a field copy + pass copied field in upper case and surround content with <<>>:
4907              
4908             my $csv = Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'd2.csv');
4909             $csv->field_add_copy('UCLAST', 'LASTNAME', \&myfunc);
4910             $csv->write();
4911             sub myfunc { s/^.*$/<<\U$&>>/; $_; }
4912              
4913             Note that the calls can be chained as most member functions return the object itself upon success.
4914             The example above is equivalent to:
4915              
4916             Text::AutoCSV->new(in_file => 'dirpeople.csv', out_file => 'd2.csv')
4917             ->field_add_copy('UCLAST', 'LASTNAME', \&myfunc)
4918             ->write();
4919             sub myfunc { s/^.*$/<<\U$&>>/; $_; }
4920              
4921             =head2 field_add_link
4922              
4923             $csv->field_add_link($new_field, $chain, $linked_file, \%opts);
4924              
4925             C<$new_field> is the name of the new field.
4926              
4927             C<$chain> is the CHAIN of the link, that is: 'LOCAL->REMOTE->PICK' where:
4928              
4929             C<LOCAL> is the field name to read the value from.
4930              
4931             C<REMOTE> is the linked field to find the value in. This field belongs to $linked_file.
4932              
4933             C<PICK> is the field from which to read the value of, in the record found by the search. This field
4934             belongs to $linked_file.
4935              
4936             If $new_field is undef, the new field name is the name of the third field of $chain (PICK).
4937              
4938             C<$linked_file> is the name of the linked file, that gets read in a Text::AutoCSV object created
4939             on-the-fly to do the search on. C<$linked_file> can also be a Text::AutoCSV object that you created
4940             yourself, allowing for more flexibility. Example:
4941              
4942             my $lcsv = Text::AutoCSV->new(in_file => 'logins.csv', case => 1);
4943             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', $lcsv);
4944              
4945             C<\%opts> is a hash ref of optional attributes. The same values can be provided as with vlookup.
4946              
4947             =over 4
4948              
4949             =item trim
4950              
4951             If set to 1, searches will ignore leading and trailing spaces. That is, a C<LOCAL> value of ' x '
4952             will match with a C<REMOTE> value of 'x'.
4953              
4954             If option is not present, use L</search_value_if_not_found> attribute of object (default value: 1).
4955              
4956             Example:
4957              
4958             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
4959             { trim => 0 });
4960              
4961             =item case
4962              
4963             If set to 1, searches will take the case into account. That is, a C<LOCAL> value of 'X' will B<not>
4964             match with a C<REMOTE> value of 'x'.
4965              
4966             If option is not present, use L</search_case> attribute of object (default value: 0).
4967              
4968             Example:
4969              
4970             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
4971             { case => 1 });
4972              
4973             =item ignore_empty
4974              
4975             If set to 1, empty values won't match. That is, a C<LOCAL> value of '' will not match with a
4976             C<REMOTE> value of ''.
4977              
4978             If option is not present, use L</search_ignore_empty> attribute of object (default value: 1).
4979              
4980             Example:
4981              
4982             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
4983             { ignore_empty => 0 });
4984              
4985             =item value_if_not_found
4986              
4987             If the searched value is not found, the value of the field is undef, that produces an empty string
4988             at write time. Instead, you can specify the value.
4989              
4990             If option is not present, use L</search_value_if_not_found> attribute of object (default value:
4991             undef).
4992              
4993             Example:
4994              
4995             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
4996             { value_if_not_found => '<not found!>' });
4997              
4998             =item value_if_found
4999              
5000             If the searched value is found, you can specify the value to return.
5001              
5002             If option is not present, use L</search_value_if_found> attribute of object (default value: none).
5003              
5004             B<NOTE>
5005              
5006             Although the C<PICK> field is ignored when using this option, you must specify it any way.
5007              
5008             Example:
5009              
5010             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5011             { value_if_not_found => '0', value_if_found => '1' });
5012              
5013             =item value_if_ambiguous
5014              
5015             If the searched value is found in more than one record, the value of the field is undef, that
5016             produces an empty string at write time. Instead, you can specify the value.
5017              
5018             If option is not present, use L</search_value_if_ambiguous> attribute of object (default value:
5019             undef).
5020              
5021             Example:
5022              
5023             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5024             { value_if_ambiguous => '<ambiguous!>' });
5025              
5026             =item ignore_ambiguous
5027              
5028             Boolean value. If ignore_ambiguous is true and the searched value is found in more than one record,
5029             then, silently fall back on returning the value of the first record. Obviously if
5030             C<ignore_ambiguous> is true, then the value of L</value_if_ambiguous> is ignored.
5031              
5032             If option is not present, use L</search_ignore_ambiguous> attribute of object (default value: 1).
5033              
5034             Example:
5035              
5036             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5037             { ignore_ambiguous => 1 });
5038              
5039             Example with multiple options:
5040              
5041             $csv->field_add_link('NAME', 'ID->LOGIN->DISPLAYNAME', 'logins.csv',
5042             { value_if_not_found => '?', ignore_ambiguous => 1 });
5043              
5044             =back
5045              
5046             B<Return value>
5047              
5048             Returns the object itself in case of success.
5049             Returns undef if error.
5050              
5051             Example of field_add_link usage:
5052              
5053             my $nom_compose = 0;
5054             my $zip_not_found = 0;
5055             Text::AutoCSV->new(in_file => 'pers.csv', walker_hr => \&walk)
5056             ->field_add_link('MYCITY', 'ZIP->ZIPCODE->CITY', 'zips.csv')->read();
5057             sub walk {
5058             my $hr = shift;
5059             $nom_compose++ if $hr->{'NAME'} =~ m/[- ]/;
5060             $zip_not_found++ unless defined($hr->{'MYCITY'});
5061             }
5062             print("Number of persons with a multi-part name: $nom_compose\n");
5063             print("Number of persons with unknown zipcode: $zip_not_found\n");
5064              
5065             =head2 links
5066              
5067             $csv->links($prefix, $chain, $linked_file, \%opts);
5068              
5069             C<$prefix> is the name to add to joined fields
5070              
5071             C<$chain> is the JOINCHAIN of the link, that is: 'LOCAL->REMOTE' where:
5072              
5073             C<LOCAL> is the field name to read the value from.
5074              
5075             C<REMOTE> is the linked field to find the value in. This field belongs to $linked_file.
5076              
5077             As opposed to L</field_add_link>, there is no C<PICK> part, as all fields of target are read.
5078              
5079             As opposed to Text::AutoCSV habits of croaking whenever a field name is duplicate, here, the
5080             duplicates are resolved by appending _2 to the joined field name if it already exists. If _2 already
5081             exists, too, then _3 is appended instead, and so on, until a non-duplicate is found. This mechanism
5082             is executed given the difficulty to control all field names when joining CSVs.
5083              
5084             C<$linked_file> and C<\%opts> work exactly the same way as for L</field_add_link>, see
5085             L</field_add_link> for help.
5086              
5087             B<Return value>
5088              
5089             Returns the object itself in case of success.
5090             Returns undef if error.
5091              
5092             B<NOTE>
5093              
5094             This function used to be called C<join> but got renamed to avoid clash with perl' builtin C<join>.
5095              
5096             Example:
5097              
5098             Text::AutoCSV->new(in_file => 'pers.csv', out_file => 'pers_with_city.csv')
5099             ->links('Read from zips.csv:', 'ZIP->ZIPCODE', 'zips.csv')->write();
5100              
5101             =head2 get_in_encoding
5102              
5103             my $enc = $csv->get_in_encoding();
5104              
5105             Return the string of input encoding, for example 'latin2' or 'UTF-8', etc.
5106              
5107             =head2 get_in_file_disp
5108              
5109             my $f = $csv->get_in_file_disp();
5110              
5111             Return the printable name of in_file.
5112              
5113             =head2 get_sep_char
5114              
5115             my $s = $csv->get_sep_char();
5116              
5117             Return the string of the input CSV separator character, for example ',' or ';'.
5118              
5119             =head2 get_escape_char
5120              
5121             my $e = $csv->get_escape_char();
5122              
5123             Return the string of the input escape character, for example '"' or '\\'.
5124              
5125             =head2 get_is_always_quoted
5126              
5127             my $a = $csv->get_is_always_quoted();
5128              
5129             Return 1 if all fields of input are always quoted, 0 otherwise.
5130              
5131             =head2 get_coldata
5132              
5133             my @cd = get_coldata();
5134              
5135             Return an array that describes each column, from the first one (column 0) to the last.
5136              
5137             Each element of the array is itself an array ref that contains 5 elements:
5138              
5139             0: Name of the field (as accessed in *_hr functions)
5140             1: Content of the field in the header line (if input has a header line)
5141             2: Column content type, shows some meta-data of fields created with field_add_* functions
5142             3: Datetime format detected, if ever, in the format Strptime
5143             4: Locale of DateTime format detected, if ever
5144              
5145             =head2 get_pass_count
5146              
5147             my $n = $csv->get_pass_count();
5148              
5149             Return the number of input readings done. Useful only if you're interested in Text::AutoCSV
5150             internals.
5151              
5152             =head2 get_in_mem_record_count
5153              
5154             my $m = $csv->get_in_mem_record_count();
5155              
5156             Return the number of records currently stored in-memory. Useful only if you're interested in
5157             Text::AutoCSV internals.
5158              
5159             =head2 get_max_in_mem_record_count
5160              
5161             my $mm = $csv->get_max_in_mem_record_count();
5162              
5163             Return the maximum number of records ever stored in-memory. Indeed this number can decrease: certain
5164             functions like field_add* member-functions discard in-memory content. Useful only if you're
5165             interested in Text::AutoCSV internals.
5166              
5167             =head2 get_fields_names
5168              
5169             my @f = $csv->get_fields_names();
5170              
5171             Return an array of the internal names of the columns.
5172              
5173             =head2 get_field_name
5174              
5175             my $name = $csv->get_field_name($n);
5176              
5177             Return the C<$n>-th column name, the first column being number 0.
5178              
5179             Example:
5180              
5181             # Get the field name of the third column
5182             my $col = $csv->get_field_name(2);
5183              
5184             =head2 get_stats
5185              
5186             my %stats = $csv->get_stats();
5187              
5188             Certain callback functions provide a parameter to record event count: L</field_add_computed>,
5189             L</read_post_update_hr>, L</walker_ar> and L</walker_hr>. By default, these stats are displayed if
5190             Text::AutoCSV got created with attribute C<verbose =E<gt> 1>. get_stats() returns the statistics
5191             hash of the object.
5192              
5193             B<IMPORTANT>
5194              
5195             As opposed to most functions that trigger input reading automatically (search functions and other
5196             get_* functions), C<get_stats> just returns you the stats as it is, regardless of whether some
5197             execution already occured.
5198              
5199             =head2 set_walker_ar
5200              
5201             $csv->set_walker_ar($subref);
5202              
5203             Normally one wants to define it at object creation time using L</walker_ar> attribute.
5204             C<set_walker_ar> allows to assign the attribute walker_ar after object creation.
5205              
5206             See attribute L</walker_ar> for help about the way C<$subref> should work.
5207              
5208             B<Return value>
5209              
5210             Returns the object itself in case of success.
5211             Returns undef if error.
5212              
5213             Example:
5214              
5215             # Calculate the total of the two first columns, the first column being money in and the
5216             # second one being money out.
5217             my ($actif, $passif) = (0, 0);
5218             $csv->set_walker_ar(sub { my $ar = $_[0]; $actif += $ar->[0]; $passif += $ar->[1]; })->read();
5219             print("Actif = $actif\n");
5220             print("Passif = $passif\n");
5221              
5222             =head2 set_walker_hr
5223              
5224             $csv->set_walker_hr($subref);
5225              
5226             Normally one wants to define it at object creation time using L</walker_hr> attribute.
5227             C<set_walker_hr> allows to assign the attribute L</walker_hr> after object creation.
5228              
5229             See attribute L</walker_hr> for help about the way C<$subref> should work.
5230              
5231             B<Return value>
5232              
5233             Returns the object itself in case of success.
5234             Returns undef if error.
5235              
5236             Example:
5237              
5238             my $csv = Text::AutoCSV->new(in_file => 'directory.csv', verbose => 1);
5239              
5240             # ...
5241              
5242             $csv->set_walker_hr(
5243             sub {
5244             my ($hr, $stat) = @_;
5245             $stat{'not capital name'}++, return if $hr->{'NAME'} ne uc($hr->{'NAME'});
5246             $stat{'name is capital letters'}++;
5247             }
5248             )->read();
5249              
5250             =head2 set_out_file
5251              
5252             $csv->set_out_file($out_file);
5253              
5254             Normally one wants to define it at object creation time using L</out_file> attribute.
5255             C<set_out_file> allows to assign the attribute L</out_file> after object creation. It is set to
5256             C<$out_file> value.
5257              
5258             B<Return value>
5259              
5260             Returns the object itself in case of success.
5261             Returns undef if error.
5262              
5263             Example:
5264              
5265             $csv->set_out_file('mycopy.csv')->write();
5266              
5267             =head2 get_keys
5268              
5269             my @allkeys = $csv->get_keys();
5270              
5271             Returns an array of all the record keys of input. A record key is a unique identifier that
5272             designates the record.
5273              
5274             At the moment it is just an integer being the record number, the first one (that comes after the
5275             header line) being of number 0. For example if $csv input is made of one header line and 3 records
5276             (that is, a 4-line file typically, if no record contains a line break), $csv->get_keys() returns
5277              
5278             (0, 1, 2)
5279              
5280             B<IMPORTANT>
5281              
5282             If not yet done, this function causes the input to be read entirely and stored in-memory.
5283              
5284             =head2 get_hr_all
5285              
5286             my @allin = $csv->get_hr_all();
5287              
5288             Returns an array of all record contents of the input, each record being a hash ref.
5289              
5290             B<IMPORTANT>
5291              
5292             If not yet done, this function causes the input to be read entirely and stored in-memory.
5293              
5294             =head2 get_row_ar
5295              
5296             my $row_ar = $csv->get_row_ar($record_key);
5297              
5298             Returns an array ref of the record designated by C<$record_key>.
5299              
5300             Example:
5301              
5302             # Get content (as array ref) of last record
5303             my @allkeys = $csv->get_keys();
5304             my $lastk = $allkeys[-1];
5305             my $lastrec_ar = $csv->get_row_ar($lastk);
5306              
5307             B<IMPORTANT>
5308              
5309             If not yet done, this function causes the input to be read entirely and stored in-memory.
5310              
5311             =head2 get_row_hr
5312              
5313             my $row_hr = $csv->get_row_hr($record_key);
5314              
5315             Returns a hash ref of the record designated by C<$record_key>.
5316              
5317             Example:
5318              
5319             # Get content (as hash ref) of first record
5320             my @allkeys = $csv->get_keys();
5321             my $firstk = $allkeys[0];
5322             my $firstrec_hr = $csv->get_row_hr($firstk);
5323              
5324             B<IMPORTANT>
5325              
5326             If not yet done, this function causes the input to be read entirely and stored in-memory.
5327              
5328             =head2 get_cell
5329              
5330             my $val = $csv->get_cell($record_key, $field_name);
5331              
5332             Return the value of the cell designated by its record key (C<$record_key>) and field name
5333             (C<$field_name>).
5334              
5335             Example:
5336              
5337             my @allkeys = $csv->get_keys();
5338             my $midk = $allkeys[int($#allkeys / 2)];
5339             my $midname = $csv->get_cell($midk, 'NAME');
5340              
5341             Note the above example is equivalent to:
5342              
5343             my @allkeys = $csv->get_keys();
5344             my $midk = $allkeys[int($#allkeys / 2)];
5345             my $midrec_hr = $csv->get_row_hr($midk);
5346             my $midname = $midrec_hr->{'NAME'};
5347              
5348             B<IMPORTANT>
5349              
5350             If not yet done, this function causes the input to be read entirely and stored in-memory.
5351              
5352             =head2 get_values
5353              
5354             my @vals = $csv->get_values($field_name, $opt_filter_subref);
5355              
5356             Return an array made of the values of the given field name (C<$field_name>), for every records, in
5357             the order of the records.
5358              
5359             C<$opt_filter_subref> is an optional subref. If defined, it is called with every values in turn (one
5360             call per value) and only values for which C<$opt_filter_subref> returned True are included in the
5361             returned array. Call to C<$opt_filter_subref> is done with $_ to pass the value.
5362              
5363             Example:
5364              
5365             my @logins = $csv->get_values('LOGIN");
5366              
5367             This is equivalent to:
5368              
5369             my @allkeys = $csv->get_keys();
5370             my @logins;
5371             push @logins, $csv->get_cell($_, 'LOGIN') for (@allkeys);
5372              
5373             Example bis
5374              
5375             # @badlogins is the list of logins that contain non alphanumeric characters
5376             my @badlogins = Text::AutoCSV->new(in_file => 'logins.csv')
5377             ->get_values('LOGIN', sub { m/[^a-z0-9]/ });
5378              
5379             This is equivalent to:
5380              
5381             # @badlogins is the list of logins that contain non alphanumeric characters
5382             # This method leads to carrying all values of a given field across function calls...
5383             my @badlogins = grep { m/[^a-z0-9]/ } (
5384             Text::AutoCSV->new(in_file => 'logins.csv')->get_values('LOGIN')
5385             );
5386              
5387             B<IMPORTANT>
5388              
5389             If not yet done, this function causes the input to be read entirely and stored in-memory.
5390              
5391             =head2 get_recnum
5392              
5393             my $r = $csv->get_recnum();
5394              
5395             Returns the current record identifier, if a reading is in progress. If no read is in progress,
5396             return undef.
5397              
5398             =head2 in_map
5399              
5400             =head2 read_update_after
5401              
5402             C<read_update_after> is an alias of C<in_map>.
5403              
5404             $csv->in_map($field, $subref);
5405              
5406             After reading a record from input, update C<$field> by calling C<$subref>. The value is put in
5407             C<$_>. Then the field value is set to the return value of C<$subref>.
5408              
5409             This feature is originally meant to manage DateTime fields: the input and output CSVs carry text
5410             content, and in-between, the values dealt with are DateTime objects.
5411              
5412             See L</out_map> for an example.
5413              
5414             =head2 out_map
5415              
5416             =head2 write_update_before
5417              
5418             C<write_update_before> is an alias of C<out_map>.
5419              
5420             $csv->out_map($field, $subref);
5421              
5422             Before writing C<$field> field content into the output file, pass it through C<out_map>. The value
5423             is put in C<$_>. Then the return value of C<$subref> is written in the output.
5424              
5425             Example:
5426              
5427             Suppose you have a CSV file with the convention that a number surrounded by parenthesis is negative.
5428             You can register corresponding L</in_map> and L</out_map> functions. During the processing of data,
5429             the field content will be just a number (positive or negative), while in input and in output, it'll
5430             follow the "negative under parenthesis" convention.
5431              
5432             In the below example, we rely on convention above and add a new field converted from the original
5433             one, that follows the same convention.
5434              
5435             sub in_updt {
5436             return 0 if !defined($_) or $_ eq '';
5437             my $i;
5438             return -$i if ($i) = $_ =~ m/^\((.*)\)$/;
5439             $_;
5440             }
5441             sub out_updt {
5442             return '' unless defined($_);
5443             return '(' . (-$_) . ')' if $_ < 0;
5444             $_;
5445             }
5446             sub convert {
5447             return ;
5448             }
5449             Text::AutoCSV->new(in_file => 'trans-euros.csv', out_file => 'trans-devises.csv')
5450             ->in_map('EUROS', \&in_updt)
5451             ->out_map('EUROS', \&out_updt)
5452             ->out_map('DEVISE', \&out_updt)
5453             ->field_add_copy('DEVISE', 'EUROS', sub { sprintf("%.2f", $_ * 1.141593); } )
5454             ->write();
5455              
5456             =head2 search
5457              
5458             my $found_ar = $csv->search($field_name, $value, \%opts);
5459              
5460             Returns an array ref of all records keys where the field C<$field_name> has the value C<$value>.
5461              
5462             C<\%opts> is an optional hash ref of options for the search. See help of L</vlookup> about options.
5463              
5464             B<IMPORTANT>
5465              
5466             An unsuccessful search returns an empty array ref, that is, [ ]. Thus you B<cannot> check for
5467             definedness of C<search> return value to know whether or not the search found something.
5468              
5469             On the other hand, you can always examine the value C<search(...)-E<gt>[0]>, as search is always an
5470             array ref. If the search found nothing, then, C<search(...)-E<gt>[0]> is not defined.
5471              
5472             B<IMPORTANT bis>
5473              
5474             If not yet done, this function causes the input to be read entirely and stored in-memory.
5475              
5476             Example:
5477              
5478             my $linux_os_keys_ar = $csv->search('OS', 'linux');
5479              
5480             =head2 search_1hr
5481              
5482             my $found_hr = $csv->search_1hr($field_name, $value, \%opts);
5483              
5484             Returns a hash ref of the first record where the field C<$field_name> has the value C<$value>.
5485              
5486             C<\%opts> is an optional hash ref of options for the search. See help of L</vlookup> about options.
5487              
5488             Note the options L</value_if_not_found> and L</value_if_ambiguous> are ignored. If not found, return
5489             undef. If the result is ambiguous (more than one record found) and ignore_ambiguous is set to a
5490             false value, return undef.
5491              
5492             The other options are taken into account as for any search: L</ignore_ambiguous>, L</trim>,
5493             L</case>, L</ignore_empty>.
5494              
5495             B<IMPORTANT>
5496              
5497             As opposed to L</search>, an unsuccessful C<search_1hr> will return C<undef>.
5498              
5499             B<IMPORTANT bis>
5500              
5501             If not yet done, this function causes the input to be read entirely and stored in-memory.
5502              
5503             Example:
5504              
5505             my $hr = $csv->search_1hr('LOGIN', $login);
5506             my $full_name = $hr->{'FIRSTNAME'} . ' ' . $hr->{'LASTNAME'};
5507              
5508             =head2 vlookup
5509              
5510             my $val = $csv->vlookup($searched_field, $value, $target_field, \%opts);
5511              
5512             Find the first record where C<$searched_field> contains C<$value> and out of this record, returns
5513             the value of C<$target_field>.
5514              
5515             C<\%opts> is optional. It is a hash of options for C<vlookup>:
5516              
5517             =over 4
5518              
5519             =item trim
5520              
5521             If true, ignore spaces before and after the values to search.
5522              
5523             If option is not present, use L</search_trim> attribute of object (default value: 1).
5524              
5525             =item case
5526              
5527             If true, do case sensitive searches.
5528              
5529             If option is not present, use L</search_case> attribute of object (default value: 0).
5530              
5531             =item ignore_empty
5532              
5533             If true, ignore empty values in the search. The consequence is that you won't be able to find
5534             empty values by searching it.
5535              
5536             If option is not present, use L</search_ignore_empty> attribute of object (default value: 1).
5537              
5538             =item ignore_accents
5539              
5540             If true, ignore accents in searches. For exampe, if C<ignore_accents> is set, a string like
5541             "élémentaire" will match "elementaire".
5542              
5543             If option is not present, use L</search_ignore_accents> attribute of object (default value: 1).
5544              
5545             B<NOTE>
5546              
5547             This option uses the function L</remove_accents> to build its internal hash tables. See
5548             L</remove_accents> help for more details.
5549              
5550             =item value_if_not_found
5551              
5552             Return value if vlookup finds nothing.
5553              
5554             If option is not present, use L</search_value_if_not_found> attribute of object (default value:
5555             undef).
5556              
5557             =item value_if_found
5558              
5559             Return value if vlookup finds something.
5560              
5561             If option is not present, use L</search_value_if_found> attribute of object (default value: none).
5562              
5563             This option is to just check whether a value exists, regardless of the target value found.
5564              
5565             B<NOTE>
5566              
5567             Although the B<$target_field> is ignored when using this option, you must specify it any way.
5568              
5569             =item value_if_ambiguous
5570              
5571             Return value if vlookup find more than one result. Tune it only if ignore_ambiguous is unset.
5572              
5573             If option is not present, use L</search_value_if_ambiguous> attribute of object (default value:
5574             undef).
5575              
5576             =item ignore_ambiguous
5577              
5578             If true, then if more than one result is found, silently return the first one.
5579              
5580             If option is not present, use L</search_ignore_ambiguous> attribute of object (default value: 1).
5581              
5582             =back
5583              
5584             B<IMPORTANT>
5585              
5586             If not yet done, this function causes the input to be read entirely and stored in-memory.
5587              
5588             Example:
5589              
5590             my $name = $csv->vlookup('LOGIN', $id, 'NAME', { value_if_not_found => '<login not found>' });
5591              
5592             =head2 remove_accents
5593              
5594             my $t = $csv->remove_accents($s);
5595              
5596             Take the string C<$s> as argument and return the string without accents. Uses a Unicode
5597             decomposition followed by removal of every characters that have the Unicode property
5598             C<Nonspacing_Mark>.
5599              
5600             B<NOTE>
5601              
5602             Only accents are removed. It is not a C<whatever-encoding -E<gt> us-ascii> conversion. For example,
5603             the French B<Å“> character (o followed by e) or the German B<ß> (eszett) are kept as is.
5604              
5605             B<NOTE bis>
5606              
5607             Tested with some latin1 and latin2 characters.
5608              
5609             B<NOTE ter>
5610              
5611             There is no language-level transformation during accents removal. For example B<Jürgen> is returned
5612             as B<Jurgen>, not B<Juergen>.
5613              
5614             This function is not exported by default.
5615              
5616             Example:
5617              
5618             use Text::AutoCSV qw(remove_accents);
5619             my $s = remove_accents("Français: être élémentaire, Tchèque: služba dům");
5620             die "This script will never die" if $s ne 'Francais: etre elementaire, Tcheque: sluzba dum';
5621              
5622             =head1 AUTHOR
5623              
5624             Sébastien Millet <milletseb@laposte.net>
5625              
5626             =head1 COPYRIGHT AND LICENSE
5627              
5628             This software is copyright (c) 2016, 2017 by Sébastien Millet.
5629              
5630             This is free software; you can redistribute it and/or modify it under
5631             the same terms as the Perl 5 programming language system itself.
5632              
5633             =cut