File Coverage

blib/lib/Spreadsheet/Edit/IO.pm
Criterion Covered Total %
statement 210 534 39.3
branch 59 310 19.0
condition 41 181 22.6
subroutine 45 84 53.5
pod 10 11 90.9
total 365 1120 32.5


line stmt bran cond sub pod time code
1             # License: http://creativecommons.org/publicdomain/zero/1.0/
2             # (CC0 or Public Domain). To the extent possible under law, the author,
3             # Jim Avera (email jim.avera at gmail dot com) has waived all copyright and
4             # related or neighboring rights to this document. Attribution is requested
5             # but not required.
6 3     3   25 use strict; use warnings FATAL => 'all'; use utf8;
  3     3   7  
  3     3   122  
  3         18  
  3         7  
  3         144  
  3         17  
  3         6  
  3         2416  
7 3     3   121 use feature qw(say state lexical_subs current_sub);
  3         15  
  3         316  
8 3     3   17 no warnings qw(experimental::lexical_subs);
  3         7  
  3         194  
9              
10             package Spreadsheet::Edit::IO;
11              
12             # Allow "use <thismodule. VERSION ..." in development sandbox to not bomb
13 3     3   17 { no strict 'refs'; ${__PACKAGE__."::VER"."SION"} = 1999.999; }
  3         6  
  3         281  
14             our $VERSION = '1000.006'; # VERSION from Dist::Zilla::Plugin::OurPkgVersion
15             our $DATE = '2023-09-06'; # DATE from Dist::Zilla::Plugin::OurDate
16              
17             # This module is derived from the old never-released Text:CSV::Spreadsheet
18              
19 3     3   28 use Exporter 'import';
  3         6  
  3         233  
20              
21             our @EXPORT = qw/convert_spreadsheet OpenAsCsv cx2let let2cx cxrx2sheetaddr
22             sheetname_from_spec filepath_from_spec
23             form_spec_with_sheetname/;
24              
25             our @EXPORT_OK = qw/@sane_CSV_read_options @sane_CSV_write_options/;
26              
27             # TODO: Provide "known_attributes" function ala Text::CSV::known_attributes()
28              
29 3     3   1644 use version ();
  3         6120  
  3         102  
30 3     3   21 use Carp;
  3         23  
  3         211  
31              
32 3     3   20 use File::Find ();
  3         6  
  3         45  
33 3     3   1606 use File::Copy ();
  3         8822  
  3         98  
34 3     3   1770 use File::Copy::Recursive ();
  3         14329  
  3         106  
35 3     3   23 use File::Glob qw/bsd_glob/;
  3         6  
  3         414  
36              
37 3     3   1038 use Path::Tiny qw/path/;
  3         13022  
  3         165  
38              
39             # Path::Tiny OBVIATES NEED for many but we still need this
40 3     3   22 use File::Spec ();
  3         13  
  3         78  
41 3     3   499 use File::Spec::Functions qw/devnull tmpdir rootdir catdir catfile/;
  3         958  
  3         218  
42              
43             # Still sometimes convenient...
44 3     3   20 use File::Basename qw(basename);
  3         7  
  3         171  
45              
46 3     3   1691 use File::Which qw/which/;
  3         3440  
  3         171  
47 3     3   1553 use URI::file ();
  3         33996  
  3         117  
48 3     3   22 use Guard qw(guard scope_guard);
  3         8  
  3         220  
49 3     3   21 use Fcntl qw(:flock :seek);
  3         6  
  3         533  
50 3     3   22 use Scalar::Util qw/blessed/;
  3         7  
  3         187  
51 3     3   20 use List::Util qw/none all notall first min max/;
  3         5  
  3         269  
52 3     3   50 use Encode qw(encode decode);
  3         6  
  3         205  
53 3     3   21 use File::Glob qw/bsd_glob GLOB_NOCASE/;
  3         6  
  3         191  
54 3     3   31 use Digest::MD5 qw/md5_base64/;
  3         7  
  3         149  
55 3     3   18 use Text::CSV ();
  3         6  
  3         122  
56             # DDI 5.025 is needed for Windows-aware qsh()
57 3     3   18 use Data::Dumper::Interp 5.025 qw/vis visq dvis dvisq ivis ivisq avis qsh qshlist u/;
  3         75  
  3         79  
58              
59 3         50 use Spreadsheet::Edit::Log qw/log_call fmt_call log_methcall fmt_methcall oops/,
60 3     3   1166 ':btw=IO${lno}:';
  3         5  
61             our %SpreadsheetEdit_Log_Options = (
62             is_public_api => sub{
63             $_[1][3] =~ /(?: ::|^ )(?: [a-z][^:]* | OpenAsCsv | ConvertSpreadsheet )$/x
64             },
65             );
66              
67             my $progname = path($0)->basename;
68              
69             sub _get_username(;$) {
70 4     4   11 my ($uid) = @_;
71 4   50     22 $uid //= eval{ $> } // -1; # default to EUID
  4   33     72  
72 4         12 state $answer = {};
73 4   66     25 return $answer->{$uid} //= do {
74             # https://stackoverflow.com/questions/12081246/how-to-get-system-user-full-name-on-windows-in-perl
75 3   33     2818 eval { getpwuid($uid) // $uid }
76             ||
77 3 0 0     6 ($^O =~ /MSWin/ && $uid == (eval{$>}//-1) && eval{ # untested...
  0   0     0  
      0        
      33        
78 0         0 require Win32API::Net;
79 0   0     0 Win32API::Net::UserGetInfo($ENV{LOGONSERVER}||'',Win32::LoginName(),10,my $info={});
80             $info->{fullName}
81 0         0 })
82             ||
83             "UID$uid"
84             };
85             }
86              
87              
88             # A private Libre/Open Office profile dir is needed to avoid conflicts
89             # with interactive sessions, see
90             # https://ask.libreoffice.org/en/question/290306/how-to-start-independent-lo-instance-process
91             #
92             # We use a persistent profile dir shared among processes for a given user
93             # (actually one for each unique external tool which needs one).
94             # Sharing is okay because we get an exclusive lock before actually using it.
95             state $profile_parent_dir = do{ # also used for lockfile
96             my $user = _get_username();
97             my $dname = __PACKAGE__."_${user}_profileparent";
98             $dname =~ s/::/-/g;
99             $dname =~ s/[^-\w.:]/_/g;
100             (my $path = path(File::Spec->tmpdir)->child($dname))->mkpath;
101             $path # Path::Tiny
102             };
103             sub _get_tool_profdir($) {
104 0     0   0 my ($tool_path) = @_;
105 0         0 my $fingerprint = _file_fingerprint($tool_path);
106 0         0 (my $toolname = path($tool_path)->basename(qw/\.\w+$/)) =~ s/[^-\w.:]/_/g;
107 0         0 my $path = $profile_parent_dir->child("${toolname}_$fingerprint");
108 0         0 $path->mkpath;
109 0         0 $path
110             }
111              
112             # Prevent concurrent document conversions.
113             # LO & OO can't handle concurrent access to the same profile.
114             my $locked_fh;
115             sub _get_exclusive_lock($) { # returns lock object
116 0     0   0 my $opts = shift;
117 0         0 my $lockfile_path = $profile_parent_dir->child("LOCKFILE")->canonpath;
118 0         0 my $sleeptime = 1;
119 0         0 my $lock_fh;
120 0         0 while (! defined $lock_fh) {
121             #warn "$$ : ### AAA open $lockfile_path ...\n";
122 0 0       0 open $lock_fh, "+>>", $lockfile_path or die $!;
123             #warn "$$ : ### AA2 open succeeded.\n";
124 0         0 eval { chmod 0666, $lock_fh; }; # sometimes not implemented
  0         0  
125             #warn "$$ : ### AA3 flock ...\n";
126 0 0       0 if (! flock($lock_fh, LOCK_EX|LOCK_NB)) {
127             #warn "$$ : ### AA4 flock FAILED\n";
128 0 0       0 seek($lock_fh, 0, SEEK_SET) or die;
129 0         0 my @lines = <$lock_fh>;
130 0 0       0 close($lock_fh) or die "close:$!"; $lock_fh = undef;
  0         0  
131             #warn "$$ : ### AA6 fh closed...\n";
132 0   0     0 my $owner = $lines[-1] // ""; # pid NNN (progname)
133 0 0       0 { my ($pid) = ($owner =~ /pid (\d+)/) or last;
  0         0  
134 0 0       0 my @s = stat("/proc/$pid") or last;
135 0         0 $owner = _get_username($s[4])." ".$owner;
136             }
137 0 0       0 my $ownermsg = $owner ? " held by $owner" : "";
138             # Carp::longmess ...
139             warn ">> ($$) Waiting for exclusive lock${ownermsg}...\n",
140             " $lockfile_path\n"
141 0 0       0 unless $opts->{silent};
142 0         0 sleep $sleeptime;
143             } else {
144 0         0 $locked_fh = $lock_fh;
145 0 0       0 seek($lock_fh, 0, SEEK_END) or die;
146 0         0 print $lock_fh "pid $$ ($progname)\n"; # always appends anyway on *nix
147             }
148             }
149 0         0 $opts->{lockfile_fh} = $lock_fh;
150             #warn "$$ : ### GOT LOCK\n";
151             }
152             END{
153 3 50   3   759 if (defined $locked_fh) {
154 0         0 flock($locked_fh, LOCK_UN);
155 0         0 close($locked_fh);
156 0         0 $locked_fh = undef;
157 0         0 warn "Did emergency unlock!\n";
158             }
159             #else { warn "(emergency unlock not needed)\n"; }
160             }
161             sub _release_lock($) {
162 0     0   0 my $opts = shift;
163 0   0     0 my $fh = delete($opts->{lockfile_fh}) // oops;
164 0 0       0 oops unless $fh == $locked_fh;
165             #seek($fh, 0, SEEK_SET) or die;
166             #my @x = (<$fh>);
167             #seek($fh, 0, SEEK_SET) or die;
168             #warn dvis "$$ : Lockfile contains: @x\n";
169             ##warn "$$ : ###BBB stalling before unlock...\n"; sleep 3;
170 0         0 truncate($fh,0);
171 0 0       0 flock($fh, LOCK_UN) or die "flock UN: $!";
172 0         0 close $fh;
173 0         0 $locked_fh = undef;
174             #warn "$$ : ###BB0 unlocked and closed.\n";
175             }
176              
177             # Libre Office text converter "charset" numbers
178             my %LO_charsets = (
179             'WINDOWS1252' => 1, 'WINLATIN1' => 1,
180             'APPLEWESTERN' => 2,
181             'DOS/OS2437' => 3,
182             'DOS/OS2850' => 4,
183             'DOS/OS2860' => 5,
184             'DOS/OS2861' => 6,
185             'DOS/OS2863' => 7,
186             'DOS/OS2865' => 8,
187             'SYSTEM' => 9, 'SYSTEMDDEFAULT' => 9,
188             'SYMBOL' => 10,
189             'ASCII' => 11,
190             'ISO88591' => 12,
191             'ISO88592' => 13,
192             'ISO88593' => 14,
193             'ISO88594' => 15,
194             'ISO88595' => 16,
195             'ISO88596' => 17,
196             'ISO88597' => 18,
197             'ISO88598' => 19,
198             'ISO88599' => 20,
199             'ISO885914' => 21,
200             'ISO885915' => 22,
201             'OS2737' => 23,
202             'OS2775' => 24,
203             'OS2852' => 25,
204             'OS2855' => 26,
205             'OS2857' => 27,
206             'OS2862' => 28,
207             'OS2864' => 29,
208             'OS2866' => 30,
209             'OS2869' => 31,
210             'WINDOWS874' => 32,
211             'WINDOWS1250' => 33, 'WINLATIN2' => 33,
212             'WINDOWS1251' => 34,
213             'WINDOWS1253' => 35,
214             'WINDOWS1254' => 36,
215             'WINDOWS1255' => 37,
216             'WINDOWS1256' => 38,
217             'WINDOWS1257' => 39,
218             'WINDOWS1258' => 40,
219             'APPLEARABIC' => 41,
220             'APPLECENTRALEUROPEAN' => 42,
221             'APPLECROATIAN' => 43,
222             'APPLECYRILLIC' => 44,
223             'APPLEDEVANAGARI' => 45,
224             'APPLEFARSI' => 46,
225             'APPLEGREEK' => 47,
226             'APPLEGUJARATI' => 48,
227             'APPLEGURMUKHI' => 49,
228             'APPLEHEBREW' => 50,
229             'APPLEICELANDIC' => 51,
230             'APPLEROMANIAN' => 52,
231             'APPLETHAI' => 53,
232             'APPLETURKISH' => 54,
233             'APPLEUKRAINIAN' => 55,
234             'APPLECHINESESIMPLIFIED' => 56,
235             'APPLECHINESETRADITIONAL' => 57,
236             'APPLEJAPANESE' => 58,
237             'APPLEKOREAN' => 59,
238             'WINDOWS932' => 60,
239             'WINDOWS936' => 61,
240             'WINDOWSWANSUNG949' => 62,
241             'WINDOWS950' => 63,
242             'SHIFTJIS' => 64,
243             'GB2312' => 65,
244             'GBT12345' => 66,
245             'GBK' => 67, 'GB231280' => 67,
246             'BIG5' => 68,
247             'EUCJP' => 69,
248             'EUCCN' => 70,
249             'EUCTW' => 71,
250             'ISO2022JP' => 72,
251             'ISO2022CN' => 73,
252             'KOI8R' => 74,
253             'UTF7' => 75,
254             'UTF8' => 76,
255             'ISO885910' => 77,
256             'ISO885913' => 78,
257             'EUCKR' => 79,
258             'ISO2022KR' => 80,
259             'JIS0201' => 81,
260             'JIS0208' => 82,
261             'JIS0212' => 83,
262             'WINDOWSJOHAB1361' => 84,
263             'GB18030' => 85,
264             'BIG5HKSCS' => 86,
265             'TIS620' => 87,
266             'KOI8U' => 88,
267             'ISCIIDEVANAGARI' => 89,
268             'JAVAUTF8' => 90,
269             'ADOBESTANDARD' => 91,
270             'ADOBESYMBOL' => 92,
271             'PT154' => 93,
272             'UCS4' => 65534,
273             'UCS2' => 65535,
274             );
275              
276             =for Pod::Coverage _name2LOcharsetnum
277             =cut
278              
279             sub _name2LOcharsetnum($) {
280 0     0   0 my ($enc) = @_;
281 0         0 local $_ = uc $enc;
282 0         0 while (! $LO_charsets{$_}) {
283             # successively remove - and other special characters
284 0 0       0 s/\W//a or confess "LO charset: Unknown encoding name '$enc'";
285             }
286 0         0 $LO_charsets{$_}
287             }
288              
289             # convert between 0-based index and spreadsheet column letter code.
290             # Default argument is $_
291             sub cx2let(_) {
292 0     0 1 0 my $cx = shift;
293 0         0 my $ABC="A"; ++$ABC for (1..$cx);
  0         0  
294 0         0 return $ABC
295             }
296             sub let2cx(_) {
297 0     0 1 0 my $ABC = shift;
298 0         0 my $n = ord(substr($ABC,0,1,"")) - ord('A');
299 0         0 while (length $ABC) {
300 0         0 my $letter = substr($ABC,0,1,"");
301 0         0 $n = (($n+1) * 26) + (ord($letter) - ord('A'));
302             }
303 0         0 return $n;
304             }
305             sub cxrx2sheetaddr($$) { # (1,99) -> "B100"
306 0     0 0 0 my ($cx, $rx) = @_;
307 0         0 return cx2let($cx) . ($rx + 1);
308             }
309              
310             =for Pod::Coverage cxrx2sheetaddr oops btw
311             =cut
312              
313             our @sane_CSV_read_options = (
314             # Text::CSV pod says to not specify 'eol' to allow embedded newlines,
315             # and to automatically handle "\n", "\r", or "\r\n".
316             #eol => $/,
317             binary => 1, # Allow reading embedded newlines & unicode etc.
318             sep_char => ",",
319             quote_char => '"',
320             escape_char => '"', # Embedded "s appear as ""
321             allow_whitespace => 0, # Preserve leading & trailing white space
322             auto_diag => 2, # die on errors
323             );
324             our @sane_CSV_write_options = (
325             eol => $/, # Necessary when WRITING csv files
326             binary => 1,
327             sep_char => ",",
328             quote_char => '"',
329             escape_char => '"', # Embedded "s appear as ""
330             allow_whitespace => 0, # Preserve leading & trailing white space
331             auto_diag => 2, # die on errors
332             );
333              
334             my %Saved_Sigs;
335             sub _sighandler {
336 0 0 0 0   0 if (! $Saved_Sigs{$_[0]} or $Saved_Sigs{$_[0]} eq 'DEFAULT') {
337             # The user isn't catching this, so the process will abort without
338             # running destructors: Call exit instead
339 0         0 warn "($$)".__PACKAGE__." caught signal $_[0], exiting\n";
340 0         0 Carp::cluck "($$)".__PACKAGE__." caught signal $_[0], exiting\n";
341 0         0 exit 1;
342             }
343 0         0 $SIG{$_[0]} = $Saved_Sigs{$_[0]};
344 0         0 kill $_[0], $$;
345 0         0 oops "Default (or user-defined) sig $_[0] action was to ignore!";
346             }
347             sub _signals_guard() {
348 0   0 0   0 %Saved_Sigs = ( map{ ($_ => ($SIG{$_} // undef)) } qw/HUP INT QUIT TERM/ );
  0         0  
349 0         0 $SIG{HUP} = $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = \&_sighandler;
350 0     0   0 return guard { @SIG{keys %Saved_Sigs} = (values %Saved_Sigs) }
351 0         0 }
352              
353             # Create a probably-unique fingerprint for a particular file
354             sub _file_fingerprint($) {
355 0     0   0 my $path = shift;
356 0         0 my $ctx = Digest::MD5->new;
357 0         0 $ctx->add($_) for((stat($path))[0,1,9]); # dev,ino,mtime
358 0         0 substr($ctx->b64digest,0,6)
359             }
360              
361             # Find LibreOffice, or failing that OpenOffice
362             our $OLpath_answer = $ENV{SPREADSHEET_EDIT_LOPATH};
363             sub openlibreoffice_path() {
364             return $OLpath_answer if $OLpath_answer;
365             unless ($ENV{SPREADSHEET_EDIT_IGNPATH}) {
366             foreach my $short_name (qw(libreoffice loffice localc)) {
367             if ($OLpath_answer = which($short_name)) {
368             return( ($OLpath_answer=path($OLpath_answer)->canonpath) );
369             }
370             }
371             }
372             # Search for an installation. On Windows it will usually be
373             # C:\Program Files\LibreOffice\...
374             # On *nix, a local/isolated install (i.e. the result of extracting files
375             # from a .deb or other archive) will be
376             # somwehere/opt/libreofficeA.B/...
377             # and "..." is the same standard hierarchy on all platforms.
378             #
379             # If multiple are found try to pick the "latest".
380             my sub _cmp_subpaths($$) {
381             my ($sp1, $sp2) = @_;
382             oops if !defined($sp1);
383             return 1 if !defined($sp2);
384             # Use longest version in the (sub-)path, e.g. "4.4.1/opt/openoffice4/..."
385             my (@v1) = sort { length($a) <=> length($b) } ($sp1 =~ /(\d[.\da-z]*)/ag);
386             my (@v2) = sort { length($a) <=> length($b) } ($sp2 =~ /(\d[.\da-z]*)/ag);
387             my $v1 = $v1[-1]//0;
388             my $v2 = $v2[-1]//0;
389             if ($v1 =~ s/alpha/0/) {
390             return -1 unless $v2 =~ s/alpha/0/;
391             }
392             if ($v2 =~ /alpha/) {
393             return +1
394             }
395             version->parse($v1) <=> version->parse($v2)
396             }
397              
398             # I tried just doing File::Glob::bsd_glob('/*/*/*/opt/libre*/program') but
399             # it silently failed even though the same glob works from the shell. Mmff...
400 3     3   30 no warnings FATAL => 'all';
  3         6  
  3         37928  
401             state $is_MSWin = ($^O eq "MSWin32");
402             my (@search_dirs, $searchfor_re, $maxdepth);
403             if ($is_MSWin) {
404             @search_dirs = ("C:\\Program Files","C:\\Program Files (x86)");
405             $searchfor_re = qr/^Program Files/;
406             $maxdepth = 1;
407             # depth: C:\Program Files\libreofficeXXX/program/
408             # 1
409             } else {
410             @search_dirs = (File::Spec->rootdir());
411             push @search_dirs, $ENV{HOME} if $ENV{HOME};
412             $maxdepth = 4;
413             $searchfor_re = qr/^opt$/;
414             # depth: /*/*/<unpackparent>/opt/libreofficeXXX/program/
415             # 1 2 3 4
416             }
417              
418             my $debug = $ENV{SPREADSHEET_EDIT_FINDDEBUG};
419             my sub _Findvarsmsg() {
420             if (u($_) eq u($File::Find::name) && u($_) eq u($File::Find::fullname)) {
421             return qsh($_)."\n"
422             }
423             if (u($_) eq u($File::Find::name)) {
424             return "\$_=name=".qsh($_)." -> ".qsh($File::Find::fullname)."\n";
425             }
426             "\$_=".qsh($_)." name=".qsh($File::Find::name)." fullname=".qsh($File::Find::fullname)."\n";
427             }
428              
429             my %results;
430             $ENV{SPREADSHEET_EDIT_NOLOSEARCH} or
431             File::Find::find(
432             { wanted => sub{
433             # Undef fullname OR invalid "_" filehandle implies a broken symlink,
434             # see https://github.com/Perl/perl5/issues/21122
435             # Zero size on *nix implies /proc or something similar; do not enter.
436             # File::Find::fullname unreadable implies followed link to inaccessable
437             # (The initial "_" stat may be invalid, so "-l _" is useless)
438 4862638     4862638   17894268 $! = 0;
439             # https://github.com/Perl/perl5/issues/21143
440 4862638         8462043 my $fullname = $File::Find::fullname;
441 4862638 50 33     15582885 if (!defined($fullname) && $is_MSWin) {
442 0 0       0 warn "# _ MSWin undef fullname! ",_Findvarsmsg() if $debug;
443 0         0 stat($_); # lstat was not done. Grr...
444 0         0 $fullname = $File::Find::name;
445 0 0       0 unless (-d _) {
446 0         0 $File::Find::prune = 1; # in case it really is a dir
447 0         0 return;
448             }
449             } else {
450 4862638 100 100     16101753 unless (-d _ or -l _) {
451 4484646 100       10148745 warn "# _ notdir/symlink: ",_Findvarsmsg() if $debug;
452 4484646         54408118 $File::Find::prune = 1; # in case it really is
453 4484646         126386739 return;
454             }
455             }
456 377992 100 33     14509538 if (
      66        
      66        
      66        
      66        
      66        
      66        
      100        
457             !defined($fullname) # broken link, per docs
458             || (! -r _) || (! -x _) # unreadable item or invalid "_" handle
459             # https://github.com/Perl/perl5/issues/21122
460             || (!$is_MSWin && (stat(_))[7] == 0) # zero size ==> /proc etc.
461             || /\$/ # $some_windows_special_thing$
462             || ! -r $fullname # presumably a symlink to unreadable
463             || ! -x _ # or unsearchable dir
464             || m#^/snap/(?!.*ffice)# # snap other than e.g. /snap/libreoffice
465             || m#^/(proc|dev|sys|tmp|boot|run|lost+found|usr/(include|src))$#
466             ) {
467 646 100       2250 warn "# PRUNING ",_Findvarsmsg() if $debug;
468 646         8970 $File::Find::prune = 1;
469             return
470 646         67438 }
471 377346 100       1665687 warn "# DIR: ",_Findvarsmsg() if $debug;
472             # Maximum depth: /*/*/<unpackparent>/opt/libreofficeXXX/program/
473 377346         5664402 my $path = path($_);
474 377346         14389311 my $depth = scalar(() = $path->stringify =~ m#(/)#g);
475 377346 100       14801649 if (basename($_) =~ $searchfor_re) { # ^opt$ or ^Program Files
476 6         35 my $prefix = path($_)->parent->parent;
477 6         1262 for my $o_l (qw/libre open/) {
478 12         48 my $pattern
479             = path($_)->child("${o_l}*/program/soffice*")->canonpath;
480             # eval because I'm suspicious of the glob on Windows
481 12         1088 my @hits; eval{ @hits = sort +bsd_glob($pattern, GLOB_NOCASE) };
  12         33  
  12         2591  
482 12 50       77 if (@hits) {
483             # On windows, use soffice.com not .exe because it writes messages
484             # to stdout not a window. See https://help.libreoffice.org/7.5/en-GB/text/shared/guide/start_parameters.html?&DbPAR=SHARED&System=WIN
485             my $path = (first{ /soffice\.com$/ } @hits) ||
486 0   0     0 (first{ /soffice$/ } @hits);
487 0 0       0 if ($path) {
488 0 0       0 $prefix->subsumes($path) or oops dvis '$prefix $path';
489 0         0 my $subpath = path($path)->relative($prefix);
490 0 0       0 if (_cmp_subpaths($subpath, $results{$o_l}{subpath}) >= 0) {
491 0         0 @{$results{$o_l}}{qw/path subpath/} = ($path, $subpath);
  0         0  
492             # We found where installations are, don't look deeper
493 0         0 $maxdepth = $depth;
494             }
495             }
496             }
497 12 50       63 else { btw dvis '##glob failed: $pattern\n$@' if $@; }
498             }
499             }
500 377346 100       37426138 if ($depth == $maxdepth) {
    50          
501 2074 100       7004 warn "# pruning at maxdepth $depth ",qsh($_),"\n" if $debug;
502 2074         21669 $File::Find::prune = 1;
503 2074         49582 return;
504             }
505 0         0 elsif ($depth > $maxdepth) { oops dvis '$depth $maxdepth $_' }
506             },
507             follow_fast => 1,
508             follow_skip => 2,
509             dangling_symlinks => 0,
510             no_chdir => 1
511             },
512             @search_dirs
513             );
514             $OLpath_answer = path(
515             $results{libre}{path} // $results{open}{path}
516             || (!$ENV{SPREADSHEET_EDIT_IGNPATH} && which("soffice")) # installed OO?
517             || return(undef)
518             )->realpath->canonpath
519             }#openlibreoffice_path
520              
521             sub _openlibre_features() {
522 1     1   4 state $hash;
523 1 50       8 return $hash if defined $hash;
524 1   50     24 my $prog = openlibreoffice_path() // return(($hash={ available => 0 }));
525 0         0 my $raw_version;
526             # This is gross but fast and works in recent versions of LO
527 0 0       0 if (my $fh = eval{ path($prog)->realpath->parent->child("types/offapi.rdb")
  0         0  
528             ->filehandle("<",":raw")} ) {
529 0         0 my $octets; sysread $fh, $octets, 100;
  0         0  
530 0 0       0 if ($octets =~ /Created by LibreOffice (\d+\.\d+\.\w+)/) {
531 0         0 $raw_version = $1;
532             }
533             }
534 0 0       0 unless ($raw_version) {
535 0 0       0 if (qx/$prog --version 2>&1/ =~ /Libre.*? (\d+\.\d+\.\w+)/) {
536 0         0 $raw_version = $1;
537             } else {
538 0         0 warn "$prog --version DID NOT WORK\n";
539             }
540             }
541 0 0       0 unless ($raw_version) {
542 0         0 warn "WARNING: Could not determine version of $prog\n";
543 0         0 $raw_version = "999.01";
544             }
545 0         0 my $version = version->parse("v$raw_version");
546 0         0 $hash = {
547             available => 1,
548             # LibreOffice 7.2 allows extracting all sheets at once
549             allsheets => ($version >= version->parse("v7.2")),
550             # ...but not yet extracting a single sheet by name.
551             # https://bugs.documentfoundation.org/show_bug.cgi?id=135762#c24
552             named_sheet => 0,
553             # Supported output formats are too many to list
554             ousuf_any => 1,
555             raw_version => $raw_version, version => "$version",
556             }
557             }
558              
559 0     0   0 sub _openlibre_supports_allsheets() { _openlibre_features()->{allsheets} }
560 0     0   0 sub _openlibre_supports_named_sheet() { _openlibre_features()->{named_sheet} }
561 0     0   0 sub _openlibre_supports_writing($) { _openlibre_features()->{available} }
562              
563 1     1   343 sub _ssconvert_features() { return { availble => 0 } } # TODO add back?
564 0     0   0 sub _ssconvert_supports_allsheets() { _ssconvert_features()->{allsheets} }
565 0     0   0 sub _ssconvert_supports_named_sheet() { _ssconvert_features()->{named_sheet} }
566 0     0   0 sub _ssconvert_supports_writing($) { _ssconvert_features()->{available} }
567              
568             # These allow users (e.g. App-diff_spreadsheets tests) to determine
569             # if external tool(s) are available to convert between spreadsheet formats
570             # or to/from csv (CSVs are supported directly so can always be used)
571             # Currently used by t/io.pl to skip tests
572             sub can_cvt_spreadsheets() {
573             _openlibre_features()->{available} || _ssconvert_features()->{availble}
574 1 50   1 1 2607607 }
575             sub can_extract_allsheets() {
576 0 0   0 1 0 _openlibre_supports_allsheets() || _ssconvert_supports_allsheets()
577             }
578             sub can_extract_named_sheet() {
579 0 0 0 0 1 0 can_extract_allsheets() # used to emulate
580             || _openlibre_supports_named_sheet() || _ssconvert_supports_named_sheet()
581             }
582              
583             =for Pod::Coverage can_cvt_spreadsheets can_extract_allsheets
584             -for Pod::Coverage can_extract_named_sheet
585             =cut
586              
587             sub _runcmd($@) {
588 0     0   0 my ($opts, @cmd) = @_;
589 0         0 my $guard = _signals_guard;
590             # This used to fork & exec but that blows up on MSWin32 because the child
591             # pseudo-process executes all END{} blocks everywhere after "exec"
592 0         0 my $redirs = "";
593 0 0       0 if ($opts->{suppress_stderr}) {
594 0         0 $redirs .= " 2>".devnull();
595             }
596 0 0       0 if ($opts->{stdout_to_stderr}) {
597 0         0 confess "Not portable";
598 0         0 $redirs .= " 1>&2";
599             }
600 0 0       0 if ($opts->{stderr_to_stdout}) {
601 0         0 confess "Not portable";
602 0         0 $redirs .= " 2>&1";
603             }
604 0 0       0 if ($opts->{suppress_stdout}) {
605 0         0 $redirs .= " >".devnull();
606             }
607 0         0 my $cmdstr = join(" ", map{qsh} @cmd) . $redirs;
  0         0  
608 0 0       0 if ($redirs) {
609 0         0 foreach (@cmd) {
610 0 0       0 confess "Can not portably pass argument '$_'" if /["']/;
611             }
612 0 0       0 warn "> $cmdstr\n" if $opts->{verbose};
613 0         0 system $cmdstr;
614             } else {
615 0 0       0 warn "> $cmdstr\n" if $opts->{verbose};
616 0         0 system @cmd;
617             }
618 0         0 return $?
619             }
620              
621             sub _fmt_outpath_contents($) {
622 0   0 0   0 my $outpath = $_[0]->{outpath} // oops;
623 0 0       0 return "(outpath does not exist)" unless -e $outpath;
624 0 0       0 return "(outpath is a file)" if -f $outpath;
625 0 0       0 return "(outpath is a STRANGE OBJECT)" unless -d $outpath;
626             "\n outpath contains: "
627 0         0 .join(", ",map{qsh basename $_} path($outpath)->children);
  0         0  
628             }
629              
630             my $tempdir;
631             sub _create_tempdir_if_needed($) {
632 2     2   5 my $opts = shift;
633             # Keep a per-process persistent temp directory, deleted at process exit.
634             # It contains result files when the user did not specify {outpath},
635             # plus a cache of as-yet unrequested sheet .csv files, used when the
636             # external tool can only extract all sheets, not a single sheet by name:
637             #
638             # tempdir/
639             # <ifbase>_<sig>.xlsx etc. # single file returned to user
640             # <ifbase>_<sig>/*.csv # directory returned to user
641             # <ifbase>_<sig>_csvcache/*.csv
642             #
643             # <ifbase> is derived from the intput file name, and <sig> is a fingerprint
644             # based on input file's dev, inode, and modification timestamp.
645             #
646 2   66     13 $tempdir //= do{
647             #(my $template = __PACKAGE__."_XXXXX") =~ s/::/-/g;
648             #Path::Tiny->tempdir($template)
649 1         3 my $pid = $$;
650 1         6 my $user = _get_username();
651 1         9 (my $dname = __PACKAGE__."_${user}_${pid}_tempdir") =~ s/::/-/g;
652 1         26 (my $path = path(File::Spec->tmpdir)->child($dname))->mkpath;
653 1         435 $path
654             };
655             }
656 3 100   3   5800 END{ $tempdir->remove_tree if $tempdir; }
657              
658             # Compose a unique path under $tempdir.
659             # This is *not* a "tempfile" or "tempdir" object which auto-destructs,
660             # in fact it does not even exist yet and we don't know here which it will be.
661             # Either the user must remove it when they are done with it, or it will
662             # be removed when $tempdir is removed at process exit.
663             #
664             sub _path_under_tempdir($@) {
665 0     0   0 my $opts = shift;
666             my %args = (
667 0         0 words => [$opts->{ifbase}, $opts->{sheetname}],
668             @_
669             );
670 0         0 my $bname = join "_", grep{defined} @{$args{words}};
  0         0  
  0         0  
671             # Collisions occur when recursing to emulate Extract-by-name,
672             # or if the user repeatedly reads the same thing, etc.
673 0         0 state $seqnums = {};
674 0 0       0 if ($seqnums->{$bname}++) {
675 0         0 $bname .= "_".$seqnums->{$bname}; # append unique sequence number
676             }
677 0 0       0 $bname .= ".$args{suf}" if $args{suf};
678 0         0 return $tempdir->child($bname);
679             }
680              
681             # Compose csv cache subdir path
682             sub _cachedir($) {
683 0     0   0 my $opts = shift;
684 0         0 _path_under_tempdir($opts,words => [$opts->{ifbase}, "csvcache"]);
685             }
686              
687             ## Copy an ephemeral temp file to a path under tempdir if needed
688             #sub _make_file_permanent($$) {
689             # my ($opts, $path) = @_;
690             # if (eval{ $path->cached_temp }) { # didn't throw
691             # my $suf = $path->basename =~ /\.(\w+)$/a ? $1 : undef;
692             # my $newpath = _path_under_tempdir($opts, suf => $suf);
693             # $path->move($newpath);
694             # return $newpath
695             # } else {
696             # return $path
697             # }
698             #}
699              
700             sub _convert_using_openlibre($$$) {
701 0     0   0 my ($opts, $src, $dst) = @_;
702 0 0   0   0 oops unless all{ $opts->{$_} } qw/cvt_from cvt_to/;
  0         0  
703 0 0 0     0 oops if $opts->{allsheets} && ! _openlibre_supports_allsheets();
704 0 0 0     0 oops if $opts->{sheetname} && ! _openlibre_supports_named_sheet();
705 0         0 my $debug = $opts->{debug};
706              
707 0   0     0 my $prog = openlibreoffice_path() // oops;
708              
709 0         0 my $saved_UserInstallation = $ENV{UserInstallation};
710             # URI format is file://server/path where 'server' is empty. "file://path" is
711             # "never correct, but is often used" en.wikipedia.org/wiki/File_URI_scheme
712             # Correct examples: file::///tmp/something file:///C:/somewhere
713 0         0 $ENV{UserInstallation} = URI::file->new(_get_tool_profdir($prog)->canonpath);
714 0 0       0 warn "Temporarily set UserInstallation=$ENV{UserInstallation}\n" if $debug;
715             scope_guard {
716 0 0   0   0 if (defined $saved_UserInstallation) {
717 0         0 $ENV{UserInstallation} = $saved_UserInstallation;
718             } else {
719             delete $ENV{UserInstallation}
720 0         0 }
721 0         0 };
722              
723             # The --convert-to argument is "suffix:filtername:filteropts"
724              
725             # I think (not certain) that we can only specify the encoding of CSV files,
726             # either as input or output; .xlsx and .ods spreadsheets (which are based
727             # on XML) could in principle use any encoding internally, but I'm not sure
728             # we can control that, nor should anyone ever need to.
729              
730             # REFERENCES:
731             # https://help.libreoffice.org/7.5/en-US/text/shared/guide/start_parameters.html?&DbPAR=SHARED&System=UNIX
732             # http://wiki.openoffice.org/wiki/Documentation/DevGuide/Spreadsheets/Filter_Options
733             # https://wiki.documentfoundation.org/Documentation/DevGuide/Spreadsheet_Documents#Filter_Options_for_the_CSV_Filter
734              
735             # I think we never want to specify the filter unless we have parameters
736             # for it. Currently that is only for csv.
737             # If no filter is specified, the suffix (e.g. 'ods') should be enough
738 0         0 state $suf2ofilter = {
739             csv => "Text - txt - csv (StarCalc)",
740             txt => "Text - txt - csv (StarCalc)",
741             #xls => "MS Excel 97",
742             #xlsx => "Calc MS Excel 2007 XML",
743             #ods => "calc8",
744             };
745              
746 0   0     0 my $ifilter = $opts->{soffice_infilter} //= do{
747 0 0       0 if ($opts->{cvt_from} eq "csv") {
748 0 0       0 my $filter_name = $suf2ofilter->{$opts->{cvt_from}} or oops;
749 0         0 my $enc = $opts->{input_encoding};
750 0         0 my $charset = _name2LOcharsetnum($enc); # dies if unknown enc
751 0         0 my $colformats = "";
752 0 0       0 if (my $cf = $opts->{col_formats}) {
753 0 0       0 $cf = [split /[\/,]/, $cf] unless ref($cf); # fmtA/fmtB/...
754 0         0 for (my $ix=0; $ix <= $#$cf; $ix++) {
755 0   0     0 local $_ = $cf->[$ix] // 1;
756 0 0 0     0 m#^([123459]|10)$#
      0        
      0        
      0        
      0        
      0        
      0        
757             || s#^standard$#1#i
758             || s#^text$#2#i
759             || s#^M+/D+/Y+$#3#i
760             || s#^D+/M+/Y+$#4#i
761             || s#^Y+/M+/D+$#5#i
762             || s#^ignore$#9#i
763             || s#^US.*English$#10#i
764             || croak "Unknown format code '$_' in {col_formats}";
765 0 0       0 $colformats .= "/" if $colformats;
766 0         0 $colformats .= ($ix+1)."/$_";
767             }
768             }
769             $filter_name.":"
770             # Tokens 1-4: FldSep=',' TxtDelim='"' Charset FirstLineNum
771             #. "44,34,$charset,1"
772             . ord($opts->{sep_char}//",").","
773 0   0     0 . ord($opts->{quote_char}//'"').","
      0        
774             . "$charset,1"
775             # Token 5: Cell format codes:
776             # If variable-width cells (the norm): colnum/fmt/colnum/fmt...
777             # colnum: 1-based column number
778             # fmt: 1=Std 2=Text 3=MM/DD/YY 4=DD/MM/YY 5=YY/MM/DD 6-8 unused
779             # 9=ignore field (do not import),
780             # 10=US-English content (e.g. 3.14 not 3,14)
781             # (I'm guessing 1=Std means use current lang [or per Tok 6?])
782             # If fixed-width cells... [something else]
783             . ",$colformats"
784             # Token 6: MS-LCID Language Id; 0 or omitted means UI language
785             . "," # default: false
786             # Token 7: On input: true => Quoted cells are always read a 'text',
787             # effectively disabling Token 8. This must be false to recognize dates
788             # like "Jan 1, 2000" which by necessity must be quoted for the comma,
789             # but will **CORRUPT** zip codes with leading zeroes unless
790             # col_formats overrides (which it does now by default).
791             .",false" # default: false
792             # Token 8: on input: "Detect Special Numbers", i.e. date or time values
793             # in human form, numbers in scientific (expondntial) notation etc.
794             # If false, ONLY decimal numbers (thousands separators ok).
795             .",true" # default: false (for import)
796             # Tokens 9-10: not used on import
797             .",,"
798             # Token 11: Remove spaces; trim leading & trailing spaces when reading
799             ."," # default: false
800             # Token 12: not use on import
801             .","
802             # Token 13: Import "=..." as formulas instead of text?
803             ."," # default: false i.e. do not recognize formulas
804             # Token 14: "Automatically detected since LibreOffice 7.6" [BOM?]
805             .","
806             }
807             else {
808             undef
809 0         0 }
810             };
811              
812 0   0     0 my $ofilter = $opts->{soffice_outfilter} //= do{
813             # OutputFilterName[:paramtoken,paramtoken,...]
814 0 0       0 if ($opts->{cvt_to} eq "csv") {
815 0 0       0 my $filter_name = $suf2ofilter->{$opts->{cvt_to}} or oops;
816 0         0 my $enc = $opts->{output_encoding};
817 0         0 my $charset = _name2LOcharsetnum($enc); # dies if unknown enc
818             $filter_name.":"
819             # Tokens 1-4: FldSep=, TxtDelim=" Charset FirstLineNum
820             #."44,34,$charset,1"
821             . ord($opts->{sep_char}//",").","
822             . ord($opts->{quote_char}//'"').","
823             . "$charset,1"
824             # Token 5: Cell format codes. Only used for import? (see above)
825             # What about fixed-width?
826             .","
827             # Token 6: Language identifier (uses Microsoft lang ids)
828             # 1033 means US-English (omitted => use UI's language)
829             .","
830             # Token 7: QuoteAllTextCells
831             # *** true will "quote" even single-bareword cells, which looks
832             # *** bad and makes t/ tests messier, but preserves information
833             # *** that such cells were not numbers or dates, etc. This ensures
834             # *** that Zip codes, etc. with leading zeroes won't be corrupted
835             # *** if converted back into a spreadsheet
836             # Option #1: Specify true to quote all cells on export, then post-process
837             # the result to un-quote obviously safe cells (for yet more overhead).
838             # Option #2: Specify false, and assume the resulting CSV will never
839             # be imported into a spreadsheet except via us, and we pre-scan
840             # the data to generate {col_formats} so will usually be safe.
841             # 5/30/23: Switching to Option #2...
842             .",false"
843             # Token 8: on output: true to store number as numbers; false to
844             # store number cells as text. No UI equivalent.
845             .",true" # default: documented as true (for export) BUT IS NOT!
846             # Token 9: "Save cell contents as shown"
847             # Generally we DO NOT want this because things like dates
848             # can be formatted many different ways.
849             ##.",".($opts->{raw_values} ? "false" : "true")
850             .",false"
851             # Token 10: "Export cell formulas"
852             .",false"
853             # Token 11: not used for export
854             .","
855             # Token 12: (LO 7.2+) sheet selections:
856             # 0 or absent => the "first" sheet
857             # 1-N => the Nth sheet (arrgh, can not specify name!!)
858             # -1 => export all sheets to files named filebasenamne.Sheetname.csv
859             .",".($opts->{allsheets} ? -1 :
860 0 0 0     0 $opts->{sheetname} ? die("add named-sheet support here") :
    0 0        
861             0)
862             # Token 13: Not used for export
863             .","
864             # Token 14: true to include BOM in the result
865             #.","
866             }
867             else {
868             undef
869 0         0 }
870             };
871              
872             # We can only control the output directory path, not the name of
873             # an individual result file. If $dst is a directory then the result
874             # could theoretically output into it directly, but instead we always
875             # output to an ephemeral temp directory and then move the results to $dst
876             #
877             # With 'allsheets' the resulting files must be renamed to conform to our
878             # external API (namely SHEETNAME.csv).
879             #
880             # ERROR DETECTION: As of LO 7.5 we always get zero exist status and the
881             # only way to detect errors is to notice that no files were written.
882             # https://bugs.documentfoundation.org/show_bug.cgi?id=155415
883             #
884 0         0 my $tdir = Path::Tiny->tempdir(path($dst)->basename."_XXXXX");
885             # will be deleted when $dirpath goes out of scope
886              
887             my @cmd = ($prog,
888             "--headless", "--invisible",
889             "--nolockcheck", "--norestore",
890             "--view", # open read-only in case can't create lockfile
891             $ifilter ? ("--infilter=$ifilter") : (),
892             "--convert-to",
893 0 0       0 $opts->{cvt_to}.($ofilter ? ":$ofilter" : ""),
    0          
894             "--outdir", $tdir->canonpath,
895             path($src)->canonpath);
896              
897 0 0       0 unless ($debug) {
898 0         0 $opts->{suppress_stdout} = 1;
899             #$opts->{suppress_stderr} = 1;
900             }
901              
902 0         0 my $cmdstatus = _runcmd($opts, @cmd);
903              
904 0 0       0 if ($cmdstatus != 0) {
905             # This should never happen, see ERROR DETECTION above
906             confess sprintf("($$) UNEXPECTED FAILURE, wstat=0x%04x\n",$cmdstatus),
907             " converting '$opts->{inpath}' to $opts->{cvt_to}\n",
908 0         0 " Command was: ",join(" ",map{qsh} @cmd);
  0         0  
909             }
910              
911 0         0 my @result_files = path($tdir)->children;
912 0 0       0 btw dvis '>> @result_files' if $debug;
913 0 0       0 if (@result_files == 0) {
914 0 0       0 croak qsh($src)." is missing or unreadable\n", "cmd: @cmd\n"
915             unless -r $src;
916 0         0 croak "Something went wrong, ",path($prog)->basename," produced no output\n",
917             "cmd: @cmd\n"
918             }
919              
920 0 0       0 if ($opts->{allsheets}) {
921             # Rename files to match our API (omit the spreadsheetbasename- prefix)
922 0         0 foreach (@result_files) {
923 0         0 my $dir = $_->parent; # Like dirname but including Volume:
924 0         0 my $base = $_->basename;
925 0 0       0 (my $newbase = $base) =~ s/^\Q$opts->{ifbase}\E-// or oops dvis '$base $opts';
926 0         0 my $newpath = $dir->child($newbase)->canonpath;
927 0         0 my $oldpath = path($_)->canonpath;
928 0 0       0 btw ">> Renaming $oldpath -> $newbase\n" if $debug;
929 0 0       0 rename ($oldpath, $newpath) or oops "$!";
930 0         0 $_ = $newpath; # update @result_files
931             }
932             }
933              
934             # Move the results to $dst
935 0 0       0 if (-e $dst) {
936 0 0       0 croak "$dst must be a directory if it pre-exists\n" unless -d $dst;
937 0 0       0 btw ">> Moving results -> $dst\n" if $debug;
938 0         0 foreach (@result_files) {
939 0 0       0 btw ">>> move $_ -> $dst" if $debug;
940 0         0 File::Copy::move($_, $dst)
941             }
942 0 0       0 btw ">> Now $dst contains: ",avis($dst->children) if $debug;
943             } else {
944 0 0       0 if ($opts->{allsheets}) {
945 0 0       0 btw ">> dirmove $tdir -> $dst\n" if $debug;
946 0 0       0 rename($tdir, $dst) or File::Copy::dirmove($tdir, $dst);
947             } else {
948 0 0       0 croak "Expecting only one result file, not @result_files"
949             if @result_files > 1;
950 0 0       0 btw ">> move $result_files[0] -> $dst\n" if $debug;
951 0         0 File::Copy::move($result_files[0], $dst);
952             }
953             }
954             }#_convert_using_openlibre
955              
956             sub _convert_using_ssconvert($$$) {
957 0     0   0 my ($opts, $src, $dst) = @_;
958 0         0 confess "Deprecated with extreme prejudice"; # no longer supported
959             ##
960             ## foreach (qw/inpath cvt_to /)
961             ## { oops "missing opts->{$_}" unless exists $opts->{$_} }
962             ##
963             ## my $eff_outpath = $opts->{outpath};
964             ## if (my $prog=which("ssconvert")) {
965             ## my $enc = _get_encodings_from_opts($opts);
966             ## $enc //= "UTF-8"; # default
967             ## my @options;
968             ## if ($opts->{cvt_to} eq "csv") {
969             ## push @options, '--export-type=Gnumeric_stf:stf_assistant';
970             ## my @dashO_terms = ("format=preserve", "transliterate-mode=escape");
971             ## push @dashO_terms, "charset='${enc}'" if defined($enc);
972             ## if ($opts->{sheetname}) {
973             ## push @dashO_terms, "sheet='$opts->{sheetname}'";
974             ## }
975             ## if ($opts->{allsheets}) {
976             ## #If both {allsheets} and {sheetname} are specified, only a single
977             ## # .csv file will be in the output directory
978             ## croak "'allsheets' option: 'outpath' must specify an existing directory"
979             ## unless -d $eff_outpath;
980             ## $eff_outpath = catfile($eff_outpath, "%s.csv");
981             ## push @options, "--export-file-per-sheet";
982             ## }
983             ## elsif ($opts->{sheetname}) {
984             ## # handled above
985             ## }
986             ## else {
987             ## # A backwards-incompatible change to ssconvert stopped extracting
988             ## # the "current" sheet by default; now all sheets are concatenated!
989             ## # See https://gitlab.gnome.org/GNOME/gnumeric/issues/461
990             ## # ssconvert verison 1.12.45 supports a new "-O active-sheet=y" option
991             ## ## PORTABILITY BUG: Redirection syntax will not work on windows
992             ## my ($ssver) = (qx/ssconvert --version 2>&1/ =~ /ssconvert version '?(\d[\d\.]*)/);
993             ## if (version::is_lax($ssver) && version->parse($ssver) >= v1.12.45) {
994             ## push @dashO_terms, "active-sheet=y";
995             ## } else {
996             ## croak("Due to an ssconvert bug, a sheetname must be given.\n",
997             ## "(for more information, see comment at ",__FILE__,
998             ## " near line ", (__LINE__-10), ")\n");
999             ## }
1000             ## }
1001             ## push @options, '-O', join(" ",@dashO_terms);
1002             ## }
1003             ## elsif ($opts->{cvt_to} eq 'xlsx') {
1004             ## @options = ('--export-type=Gnumeric_Excel:xlsx2');
1005             ## }
1006             ## elsif ($opts->{cvt_to} eq 'xls') {
1007             ## @options = ('--export-type=Gnumeric_Excel:excel_biff8'); # M'soft Excel 97/2000/XP
1008             ## }
1009             ## elsif ($opts->{cvt_to} =~ /^od/) {
1010             ## @options = ('--export-type=Gnumeric_OpenCalc:odf');
1011             ## }
1012             ## elsif ($eff_outpath =~ /\.[a-z]{3,4}$/) {
1013             ## # let ssconvert choose based on the output file suffix
1014             ## }
1015             ## else {
1016             ## croak "unrecognized cvt_to='".u($opts->{cvt_to})."' and no outpath suffix";
1017             ## }
1018             ##
1019             ## my $eff_inpath = $opts->{inpath};
1020             ## if ($opts->{sheetname} && $opts->{inpath} =~ /.csv$/i) {
1021             ## # Control generated sheet name by using a symlink to the input file
1022             ## # See http://stackoverflow.com/questions/22550050/how-to-convert-csv-to-xls-with-ssconvert
1023             ## my $td = catdir($tempdir // oops, "Gnumeric");
1024             ## remove_tree($td); mkdir($td) or die $!;
1025             ## $eff_inpath = catfile($td, $opts->{sheetname});
1026             ## symlink $opts->{inpath}, $eff_inpath or die $!;
1027             ## fixme: handle unimplmented or no-perms symlink failures
1028             ## }
1029             ## my @cmd = ($prog, @options, $eff_inpath, $eff_outpath);
1030             ##
1031             ## my $suppress_stderr = !$opts->{debug};
1032             ## if (0 != _runcmd({%$opts, suppress_stderr => $suppress_stderr}, @cmd)) {
1033             ## # Before showing a complicated ssconvert failure with backtrace,
1034             ## # check to see if the problem is just a non-existent input file
1035             ## { open my $dummy_fh, "<", $eff_inpath or croak "$eff_inpath : $!"; }
1036             ## my $failmsg = "($$) Conversion of '$opts->{inpath}' to $eff_outpath failed\n"."cmd: ".qshlist(@cmd)."\n";
1037             ## if ($suppress_stderr) { # repeat showing all output
1038             ## if (0 == _runcmd({%$opts, suppress_stderr => 0}, @cmd)) {
1039             ## warn "Surprise! Command failed the first time but succeeded on 2nd try!\n";
1040             ## }
1041             ## croak $failmsg;
1042             ## }
1043             ## }
1044             ## elsif (! -e $opts->{outpath}) {
1045             ## croak "($$) Conversion SILENTLY failed\n(using $prog)\n",
1046             ## " cmd: ",qshlist(@cmd),"\n"
1047             ## ;
1048             ## }
1049             ## return ($enc)
1050             ## }
1051             ## else {
1052             ## croak "Can not find ssconvert to convert '$opts->{inpath}' to $opts->{cvt_to}\n",
1053             ## "To install ssconvert: sudo apt-get install gnumeric\n";
1054             ## }
1055             }
1056              
1057             # Extracts |||SHEETNAME or !SHEETNAME or [SHEETNAME] from a path+sheet
1058             # specification, if present.
1059             # (Lots of historical compatibility issues...)
1060             # In scalar context, returns SHEETNAME or undef.
1061             # INTERNAL USE ONLY: In array context, returns (filepath, SHEETNAME or undef)
1062             sub sheetname_from_spec($) {
1063 6     6 1 10 my $spec = shift;
1064 6         9 local $_;
1065 6         17 my $p = path($spec);
1066 6         335 my $parent = $p->parent;
1067 6         379 my ($base,$sn) = ($p->basename =~ /^(.*) (?| \|\|\|([^\!\[\|]+)$
1068             | \!([^\!\[\|]+)$
1069             | \[([^\[\]]+)\]$
1070             )/x);
1071 6 100 33     93 wantarray ? ($parent->child($base//$p->basename)->stringify, $sn) : $sn
1072             }
1073             sub filepath_from_spec($) {
1074 2     2 1 10 my ($path, undef) = sheetname_from_spec($_[0]);
1075 2         106 $path
1076             }
1077             #Tester
1078             #foreach ("", "/a!b/c", "/a!b/c!sheet1", "/a/b/c[sheet2]", "/a/b/c[bozo]d.xls",
1079             # ) {
1080             # foreach($_, basename($_)) {
1081             # my ($fp,$sn) = sheetname_from_spec($_);
1082             # use open ':std', ':locale';
1083             # warn ivis '# $_ → $fp $sn\n';
1084             # my $sn2 = sheetname_from_spec($_);
1085             # die "bug" unless u($sn) eq u($sn2);
1086             # }
1087             #}
1088             #die "TEX";
1089              
1090             # Construct a file + sheetname spec in the preferred form for humans to read
1091             # If sheetname is undef, just return the file path
1092             sub form_spec_with_sheetname($$) {
1093 2     2 1 9 my ($filespec, $sheetname) = @_;
1094 2         6 my $embedded_sheetname = sheetname_from_spec($filespec);
1095 2 0 33     8 croak "conflicting embedded and separate sheetnames given"
      33        
1096             if $embedded_sheetname && $sheetname && $embedded_sheetname ne $sheetname;
1097 2   33     12 $sheetname //= $embedded_sheetname;
1098 2         7 my $filepath = filepath_from_spec($filespec);
1099 2 50       11 $sheetname ? "${filepath}[${sheetname}]" : $filepath
1100             #$sheetname ? "${filepath}|||${sheetname}" : $filepath
1101             }
1102              
1103             our $default_input_encodings = "UTF-8,windows-1252,UTF-16BE,UTF-16LE";
1104             our $default_output_encoding = "UTF-8";
1105              
1106             # Return digested %opts setting
1107             # sheetname, inpath_sans_sheet (as Path::Tiny), encoding or default
1108             sub _process_args($;@) {
1109 2 50   2   8 confess "fix obsolete call to pass linearized options"
1110             if ref($_[0]) eq "HASH";
1111 2 50       8 my $leading_inpath = ( scalar(@_) % 2 == 1 ? shift(@_) : undef );
1112 2         11 my %opts = (
1113             cvt_from => "",
1114             cvt_to => "",
1115             @_,
1116             #verbose => 999, tempdir => "/tmp/J",
1117             );
1118 2 50       31 if (defined $opts{inpath}) {
1119 0 0       0 croak "Initial INPATH arg specified as well as inpath => ... in options"
1120             if defined $leading_inpath;
1121             } else {
1122 2   33     11 $opts{inpath} = $leading_inpath // croak "No inpath was specified";
1123             }
1124 2 50       7 $opts{verbose}=1 if $opts{debug};
1125              
1126             # inpath or outpath may have "!sheetname" appended (or alternate syntaxes),
1127             # but may exist only if a separate 'sheetname' option is not specified.
1128             # Input and output can not both be spreadsheets; one must be a CSV.
1129 2 50       6 if (exists($opts{sheet})) {
1130 0         0 carp "WARNING: Deprecated 'sheet' option found (use 'sheetname' instead)\n";
1131 0 0       0 croak "Both {sheet} and {sheetname} specified" if exists $opts{sheetname};
1132 0         0 $opts{sheetname} = delete $opts{sheet};
1133             }
1134 2         9 { my ($path_sans_sheet, $sheetname, $key);
  2         51  
1135 2         6 for my $thiskey ('inpath', 'outpath') {
1136 4   100     18 my $spec = $opts{$thiskey} || next;
1137 2         9 my ($pssn, $sn) = sheetname_from_spec($spec);
1138 2 50       125 if (defined $sn) {
1139 0 0       0 croak "A sheetname is embeeded in both ",
1140             "'$thiskey' ($opts{$thiskey}) and '$key' ($opts{$key})\n"
1141             if $sheetname;
1142 0         0 ($path_sans_sheet, $sheetname, $key) = ($pssn, $sn, $thiskey);
1143             }
1144             }
1145 2 50       11 if ($opts{sheetname}) {
    50          
1146             croak "'sheetname' option conflicts with embedded sheet name\n",
1147             " sheetname => ", qsh($opts{sheetname}),"\n",
1148             " $key => ", qsh($opts{$key}),"\n"
1149 0 0 0     0 if defined($sheetname) && $sheetname ne $opts{sheetname};
1150             }
1151             elsif (defined $sheetname) {
1152             btw "(extracted sheet name \"$sheetname\" from $key)\n"
1153 0 0       0 if $opts{verbose};
1154 0         0 $opts{sheetname} = $sheetname;
1155             }
1156             $opts{inpath_sans_sheet} = path(
1157             ($key && $key eq 'inpath') ? $path_sans_sheet : $opts{inpath}
1158 2 50 33     14 );
1159             }
1160             # Input file basename sans any .suffix
1161 2         99 $opts{ifbase} = $opts{inpath_sans_sheet}->basename(qr/\.[^.]+/);
1162              
1163 2         167 %opts
1164             }#_process_args
1165              
1166             # Extract the of encoding(s) specified in an iolayers string
1167             # Parse iolayers string, returning ($prefix,[encodings],$suffix)
1168             # For example from ":raw:encodings(utf8,windows-1252):zz" the output
1169             # would be (":raw", [:utf8","windows-1252"], ":zz")
1170             sub _parse_iolayers($) {
1171 0   0 0   0 local $_ = (shift) // "";
1172 0 0       0 /\A(<prefix>.*?)
1173             (<encspec>:utf8|:encoding\(([^\)]+)\))
1174             (<suffix>.*?)\z/ or croak "Invalid iolayers spec '$_'\n";
1175 0         0 (my $prefix, $_, my $suffix) = ($+{prefix}, $+{encspec}, $+{suffix});
1176 0 0 0     0 /^:(utf8)$/ || /^:encoding\(([^\)]+)\)$/ or oops($_);
1177 0         0 my $enclist = [split /,/, $1]; # comma,separated,list,of,encodings
1178 0         0 ($prefix, $enclist, $suffix);
1179             }
1180              
1181             # Detect cvt_from and cvt_to from filenames, or peeking at the data.
1182             # If input is CSV, detect encoding, separator and quote characters;
1183             # add quotes to values with leading zeroes (e.g. Zip codes) which would
1184             # otherwise be corrupted by being read as numbers instead of text strings.
1185             # The modified data is written to a temp file
1186             # Set default output_encoding if not specified
1187             # RETURNS: The effective input path, either inpath_sans_sheet or a tempfile
1188             sub _determine_enc_tofrom($) {
1189             my $opts = shift;
1190             my $debug = $opts->{debug};
1191             # Skip to ==BODY== below
1192              
1193             my sub determine_input_encoding($) {
1194             my $r2octets = shift;
1195             # If user specified one encoding, use it; if user specified list, try them.
1196             # If user did not specify, the default is a list to try.
1197             $opts->{input_encoding} //= $default_input_encodings;
1198             my @enclist = split m#,#, $opts->{input_encoding};
1199             return
1200             if @enclist == 1;
1201             $$r2octets //= $opts->{inpath_sans_sheet}->slurp_raw;
1202             for my $enc (@enclist) {
1203             eval { decode($enc, $$r2octets, Encode::FB_CROAK|Encode::LEAVE_SRC) };
1204             if ($@) {
1205             btw "Input encoding '$enc' did not work...($@)\n" if $debug;
1206             next;
1207             }
1208             btw "Input encoding '$enc' seems to work.\n" if $debug;
1209             @enclist = ($enc);
1210             last
1211             }
1212             #croak "Could not detect encoding of $opts->{inpath_sans_sheet}\n"
1213             confess "Could not detect encoding of $opts->{inpath_sans_sheet}\n"
1214             if @enclist > 1;
1215             $opts->{input_encoding} = $enclist[0];
1216             } #determine_input_encoding
1217              
1218             my sub readparse_csv($@) {
1219             my $fh = shift;
1220             my %csvopts = (
1221             @sane_CSV_read_options,
1222             defined($opts->{quote_char}) ? (quote_char=>$opts->{quote_char}) : (),
1223             defined($opts->{sep_char}) ? (sep_char=>$opts->{sep_char}) : (),
1224             auto_diag => 2, # throw on error
1225             @_
1226             );
1227             $csvopts{escape_char} = $csvopts{quote_char}; # must always be the same
1228              
1229             my $csv = Text::CSV->new (\%csvopts)
1230             or croak "Text::CSV->new: ", Text::CSV->error_diag(),
1231             dvis('\n## %csvopts\n');
1232             my @rows;
1233             while (my $F = $csv->getline( $fh )) {
1234             push(@rows, $F);
1235             }
1236             \@rows
1237             }
1238              
1239             my sub open_input($) {
1240             my $r2octets = shift;
1241             oops unless $opts->{input_encoding};
1242             my $fh;
1243             my $pathish = defined($$r2octets)
1244             ? \$$r2octets : $opts->{inpath_sans_sheet};
1245 1     1   8 open($fh, "<:encoding($opts->{input_encoding})", $pathish)
  1         2  
  1         9  
1246             or die "$pathish : $!";
1247             $fh
1248             }
1249              
1250             my sub determine_csv_q_sep($$) {
1251             my ($r2octets, $r2rows) = @_;
1252             return
1253             if defined($opts->{quote_char}) && defined($opts->{sep_char});
1254              
1255             my $fh = open_input($r2octets);
1256              
1257             # my $chars;
1258             # if (defined($$r2octets)
1259             # $chars = decode($opts->{input_encoding},$$r2octets,Encode::FB_CROAK);
1260             # }
1261              
1262             # Try combinations starting with the most-common '"' and ',' while
1263             # parsing the file for unsafe unquoted values (throws on syntax error).
1264             # The expectation is that the first try usually succeeds
1265             Q:
1266             for my $q (defined($opts->{quote_char})
1267             ? ($opts->{quote_char}) : ("\"", "'")) {
1268             my $found_q;
1269             SEP:
1270             for my $sep (defined($opts->{sep_char})
1271             ? ($opts->{sep_char}) : (",","\t")) {
1272             btw dvis '--- TRYING $q $sep ---' if $debug;
1273             # # Preliminary check for an illegal use of the quote char
1274             # if (defined($chars)
1275             # && $chars =~ /[^${q}${sep}\x{0D}\x{0A}]
1276             # ${q}
1277             # (?=[^${q}${sep}\x{0D}\x{0A}] | \z)/gx) {
1278             # btw ivis '>>>quote_char CAN NOT BE $q with sep=$sep because q exists mid-field before pos ${\(pos($chars))}'
1279             # if $debug;
1280             # next SEP
1281             # }
1282             $$r2rows = eval{ readparse_csv($fh, quote_char=>$q, sep_char=>$sep) };
1283             if ($@ eq "") {
1284             warn ivis '>> Detected quote_char=$q sep_char=$sep\n' if $debug;
1285             $opts->{quote_char} = $q;
1286             $opts->{sep_char} = $sep;
1287             last Q;
1288             }
1289             warn vis '$@\nq=$q sep=$sep did not work...\n' if $debug;
1290             seek $fh, 0, SEEK_SET;
1291             }
1292             }
1293             unless (defined($$r2rows)) {
1294             confess "Input file is not valid CSV (or we have a bug)\n"
1295             }
1296             }#determine_csv_q_sep
1297              
1298             my sub determine_csv_col_formats($$) {
1299             my ($r2octets, $r2rows) = @_;
1300             return
1301             if defined $opts->{col_formats};
1302             $$r2rows //= do{
1303             my $fh = open_input($r2octets);
1304             readparse_csv($fh);
1305             };
1306             my $max_cols = 0; for my $row (@{ $$r2rows }) { $max_cols = @$row if $max_cols < @$row }
1307             state $curr_yy = (localtime(time))[5];
1308             my @col_formats;
1309             my sub recognized($$$$;$) {
1310             my ($cx, $rx, $thing, $format, $as_msg) = @_;
1311             $col_formats[$cx] = $format;
1312             return unless $debug;
1313             $as_msg //= " as ".vis($col_formats[$cx])." format";
1314             if (length($thing) > 35) { $thing = substr($thing,0,32)."..."; }
1315             @_ = ("Recognized ",$thing," in ", cxrx2sheetaddr($cx,$rx), $as_msg);
1316             goto &btw
1317             }
1318             CX:
1319             for my $cx (0..$max_cols-1) {
1320             RX:
1321             for my $rx (0..$#{$$r2rows}) {
1322             my $row = $$r2rows->[$rx];
1323             next if $cx > $#$row; # row has fewer columns than others
1324             for ($row->[$cx]) {
1325             # recognize obvious Y/M/D or M/D/Y or D/M/Y date forms
1326             if (m#\b(?<y>(?:[12]\d)?\d\d)/(?<m>\d\d)/(?<d>\d\d)\b#) {
1327             if ($+{d} > 12 && $+{d} <= 31 && $+{m} >= 1 && $+{m} <= 12
1328             && ($+{y} < 100 || $+{y} >= 1000)) {
1329             recognized($cx,$rx,$_,"YY/MM/DD");
1330             next CX;
1331             }
1332             # If ambiguous YYYY/??/?? we can still assume it is a date and not text
1333             if (length($+{y})==4) {
1334             #recognized($cx,$rx,$_,""," as some kind of date, fmt unknown");
1335             next RX;
1336             }
1337             }
1338             if (m#\b(?<m>\d\d)/(?<d>\d\d)/(?<y>(?:[12]\d)?\d\d)\b#) {
1339             if ($+{y} < 100 || $+{y} >= 1000) {
1340             if ($+{d} > 12 && $+{d} <= 31 && $+{m} >= 1 && $+{m} <= 12) {
1341             recognized($cx,$rx,$_,"MM/DD/YY");
1342             next CX
1343             }
1344             elsif ($+{m} > 12 && $+{m} <= 31 && $+{d} >= 1 && $+{d} <= 12) {
1345             recognized($cx,$rx,$_,"DD/MM/YY");
1346             next CX
1347             }
1348             }
1349             # If ambiguous ??/??/YYYY we can still assume it is a date and not text
1350             if (length($+{y})==4) {
1351             #recognized($cx,$rx,$_,""," as some kind of date, fmt unknown");
1352             next RX;
1353             }
1354             }
1355             # Things to force to be read as text fields:
1356             # 1. Leading zeroes
1357             # 2. Leading ascii minus (\x{2D}) rather than math minus \N{U+2212}.
1358             # This prevents conversion to the Unicode math minus when LO
1359             # reads the CSV. The assumption is that if the input has an ascii
1360             # minus then the original spreadsheet format was "text" not
1361             # numeric.
1362             if (/^[\x{2D}0]/) {
1363             recognized($cx,$rx,$_,"text");
1364             next CX;
1365             }
1366             }
1367             }
1368             }
1369             $opts->{col_formats} = \@col_formats;
1370             }#determine_csv_col_formats
1371              
1372             # ==BODY==
1373             unless ($opts->{cvt_to}) {
1374             if ($opts->{outpath} && $opts->{outpath} =~ /\.([^.]+)$/) {
1375             $opts->{cvt_to} = $1;
1376             }
1377             croak "'cvt_to' was not specified and can not be intuited from 'outpath'"
1378             ,dvis('\n### $opts') ###TEMP
1379             unless $opts->{cvt_to};
1380             }
1381             unless ($opts->{cvt_from}) {
1382             if ($opts->{inpath_sans_sheet} =~ /\.([^.]+)$/) {
1383             $opts->{cvt_from} = $1;
1384             }
1385             }
1386             $opts->{cvt_from} =~ s/^\.txt$/.csv/i if $opts->{cvt_from};
1387              
1388             # Detect file format and, if CSV, encoding
1389             my ($octets, $rows);
1390             if (!$opts->{cvt_from} || $opts->{cvt_from} eq "csv") {
1391             determine_input_encoding(\$octets);
1392             }
1393             if (!$opts->{cvt_from}) {
1394             # Detect the file format by looking at the data. Actually, we only
1395             # support CSV in this case, so this is just a (half-baked) sanity check.
1396             eval {
1397             determine_csv_q_sep(\$octets, \$rows);
1398             if (!$opts->{cvt_from}) {
1399             $rows //= do{
1400             my $fh = open_input(\$octets);
1401             readparse_csv($fh);
1402             };
1403             }
1404             };
1405             if ($@ eq "") {
1406             warn "> Detected $opts->{inpath_sans_sheet} as a seemingly-valid CSV\n"
1407             if $debug;
1408             $opts->{cvt_from} = "csv";
1409             } else {
1410             croak "Can not detect what kind of file ",qsh($opts->{inpath})," is\n";
1411             }
1412             }
1413              
1414             if ($opts->{cvt_from} eq "csv") {
1415             determine_csv_col_formats(\$octets, \$rows);
1416             } else {
1417             oops if defined($octets) or defined($rows);
1418             }
1419              
1420             # Set default ouput_encoding if not specified
1421             $opts->{output_encoding} //= $default_output_encoding
1422             if $opts->{cvt_to} eq "csv";
1423              
1424             }#_determine_enc_tofrom
1425              
1426             sub _tool_extract_all_csvs($$) {
1427 0     0   0 my ($opts, $destdir) = @_;
1428              
1429 0         0 _get_exclusive_lock($opts);
1430 0     0   0 scope_guard { _release_lock($opts); };
  0         0  
1431              
1432 0         0 delete local $opts->{sheetname};
1433 0         0 local $opts->{allsheets} = 1;
1434 0 0       0 if (_openlibre_supports_allsheets()) {
    0          
1435 0         0 _convert_using_openlibre($opts, $opts->{inpath_sans_sheet}, $destdir);
1436             }
1437             elsif (_ssconvert_supports_allsheets()) {
1438 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destdir);
1439             }
1440 0         0 else { confess "Can't extract 'allsheets'. Please install LibreOffice 7.2 or newer" }
1441             }
1442              
1443             sub _tool_can_extract_csv_byname() {
1444 0 0   0   0 _openlibre_supports_named_sheet() || _ssconvert_supports_named_sheet()
1445             }
1446             sub _tool_extract_one_csv($$) {
1447 0     0   0 my ($opts, $destpath) = @_;
1448              
1449 0         0 _get_exclusive_lock($opts);
1450 0     0   0 scope_guard { _release_lock($opts); };
  0         0  
1451              
1452 0 0       0 if (_openlibre_features()->{available}) {
1453 0 0 0     0 oops if $opts->{sheetname} && !_openlibre_supports_named_sheet();
1454 0         0 _convert_using_openlibre($opts, $opts->{inpath_sans_sheet}, $destpath);
1455             } else {
1456 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destpath);
1457             }
1458             }
1459             sub _tool_can_extract_current_sheet() {
1460             _openlibre_features()->{available} || _ssconvert_features()->{available}
1461 0 0   0   0 }
1462              
1463             sub _tool_write_spreadsheet($$) {
1464 0     0   0 my ($opts, $destpath) = @_;
1465              
1466 0         0 _get_exclusive_lock($opts);
1467 0     0   0 scope_guard { _release_lock($opts); };
  0         0  
1468              
1469             # ssconvert allows specifying the sheetname when importing a csv but not LO
1470 0 0 0     0 if ($opts->{sheetname} && _ssconvert_supports_writing($opts->{cvt_to})) {
    0          
    0          
1471 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destpath);
1472             }
1473             elsif (_openlibre_supports_writing($opts->{cvt_to})) {
1474 0 0       0 if ($opts->{sheetname}) {
1475 0         0 carp "WARNING: Sheet name when creating a spreadsheet will be ignored\n";
1476 0         0 delete $opts->{sheetname};
1477             }
1478 0         0 _convert_using_openlibre($opts, $opts->{inpath_sans_sheet}, $destpath);
1479             }
1480             elsif (_ssconvert_supports_writing($opts->{cvt_to})) {
1481 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destpath);
1482             }
1483 0         0 else { croak "Can't create $opts->{cvt_to} spreadsheets. Please install LibreOffice 7.2 or newer" }
1484             }
1485              
1486              
1487             # Extract CSVs for every sheet into {outpath} (setting to tmpdir if not preset).
1488             # If cached CSVs are available they are moved into {outpath}/ .
1489             sub _extract_all_csvs($) {
1490 0     0   0 my ($opts) = @_;
1491 0         0 my $outpath = _final_outpath($opts);
1492 0         0 $outpath->mkpath; # nop if exists, croaks if conflicts with file
1493              
1494 0         0 _tool_extract_all_csvs($opts, $outpath); #logs
1495             }
1496              
1497              
1498             # Extract a single sheet into a CSV at {outpath} (defaulting to temp file).
1499             # If a cached CSV is available it is moved to {outpath}.
1500             sub _extract_one_csv($) {
1501             my ($opts) = @_;
1502             my $cachedirpath = _cachedir($opts);
1503              
1504             my sub _fill_csv_cache() {
1505             $cachedirpath->remove_tree;
1506             $cachedirpath->mkpath;
1507             { #local $opts->{verbose} = 0;
1508             #local $opts->{debug} = 0;
1509             _tool_extract_all_csvs($opts, $cachedirpath);
1510             }
1511             }
1512              
1513             my $outpath = _final_outpath($opts);
1514             $outpath->remove unless -d $outpath;
1515              
1516             if (defined($opts->{sheetname})) {
1517             my $fname = $opts->{sheetname}.".csv";
1518             my $cached_path = $cachedirpath->child($fname);
1519             if (! -e $cached_path) {
1520             if (_tool_can_extract_csv_byname()) {
1521             _tool_extract_one_csv($opts, $outpath); #logs
1522             return
1523             }
1524             warn ">>Emulating extract-by-name by extracting all csvs into cache...\n"
1525             if $opts->{debug};
1526             _fill_csv_cache;
1527             }
1528             croak "Sheet '$opts->{sheetname}' does not exist in $opts->{inpath_sans_sheet}\n"
1529             unless -e $cached_path;
1530             warn "> Moving cached $fname to $outpath\n" if $opts->{verbose};
1531             File::Copy::move($cached_path, $outpath);
1532             return
1533             }
1534             elsif (_tool_can_extract_current_sheet()) {
1535             _tool_extract_one_csv($opts, $outpath); #logs
1536             return
1537             }
1538             else {
1539             _fill_csv_cache;
1540             my @children = $cachedirpath->children;
1541             if (@children == 0) {
1542             croak "$opts->{inpath_sans_sheet} appears to have zero sheets!\n"
1543             }
1544             elsif (@children == 1) {
1545             my $fname = $children[0]->basename;
1546             my $cached_path = $cachedirpath->child($fname);
1547             warn "> Moving cached $fname to $outpath\n" if $opts->{verbose};
1548             File::Copy::move($cached_path, $outpath);
1549             return
1550             }
1551             else {
1552             croak "$opts->{inpath_sans_sheet} contains multiple sheets; you must specify a sheetname\n"
1553             }
1554             }
1555             }
1556             sub _write_spreadsheet($) {
1557 0     0   0 my ($opts) = @_;
1558              
1559 0         0 my $outpath = _final_outpath($opts);
1560 0 0       0 $outpath->remove unless -d $outpath;
1561              
1562 0         0 _tool_write_spreadsheet($opts, $outpath);
1563             }
1564              
1565             # If {outpath} is not set, set it to a unique output path in $tempdir
1566             # Always returns {outpath} as a Path::Tiny object.
1567             sub _final_outpath($) {
1568 0     0   0 my $opts = shift;
1569 0 0       0 if (defined $opts->{outpath}) {
1570 0         0 return path($opts->{outpath});
1571             } else {
1572 0 0       0 my $suf = $opts->{cvt_to} unless $opts->{allsheets};
1573             return(
1574 0         0 ($opts->{outpath}=_path_under_tempdir($opts, suf=>$suf))
1575             );
1576             }
1577             }
1578              
1579             sub convert_spreadsheet(@) {
1580             # Set inpath_sans_sheet, sheetname, ifbase, etc.
1581 2     2 1 4 my %opts = &_process_args;
1582 2 50       9 btw dvis('>>> convert_spreadsheet %opts\n') if $opts{debug};
1583 2         11 my %input_opts = %opts;
1584              
1585 2         9 _create_tempdir_if_needed(\%opts);
1586              
1587             # intuit cvt_from & cvt_to, detect encoding, and pre-process .csv input
1588             # if needed to avoid corruption of leading zeroes.
1589 2         42 _determine_enc_tofrom(\%opts);
1590              
1591 2         7 my $input_enc = $opts{input_encoding};
1592 2         4 my $output_enc = $opts{output_encoding};
1593              
1594             croak "Either input or output must be 'csv'\n"
1595 2 50 33     10 unless $opts{cvt_from} eq 'csv' || $opts{cvt_to} eq 'csv';
1596 2 50       7 if ($opts{allsheets}) {
1597             croak "'allsheets' is allowed only with cvt_to => 'csv'"
1598 0 0 0     0 unless ($opts{cvt_to}//"") eq "csv";
1599             croak "With 'allsheets', a sheet name may not be specified\n"
1600 0 0       0 if $opts{sheetname};
1601             croak "With 'allsheets', 'outpath' must be a directory if it exists\n"
1602 0 0 0     0 if $opts{outpath} && -e $opts{outpath} && ! -d _;
      0        
1603             }
1604              
1605 2         5 my $done;
1606 2 50       11 if ($opts{cvt_from} eq $opts{cvt_to}) { # csv to csv
1607 2 50       6 if (!$opts{allsheets}) {
1608 2 50       5 if ($input_enc ne $output_enc) {
1609             # Special case #1: in & out are CSVs but different encodings.
1610             warn "> Transcoding csv: $input_enc -> $output_enc\n"
1611 0 0       0 if $opts{debug};
1612 0         0 my $octets = $opts{inpath_sans_sheet}->slurp_raw;
1613 0         0 my $chars = decode($input_enc, $octets, Encode::FB_CROAK);
1614 0         0 $octets = encode($output_enc, $chars, Encode::FB_CROAK);
1615 0         0 path(_final_outpath(\%opts))->spew_raw($octets);
1616 0         0 $done = 1;
1617             } else {
1618             # Special case #2: No conversion is needed: Just copy the file or
1619             # return the input path itself as the output
1620 2 50       5 if (defined $opts{outpath}) {
1621             warn "> No conversion needed, copying into ",qsh($opts{outpath}),"\n"
1622 0 0       0 if $opts{verbose};
1623 0         0 $opts{inpath_sans_sheet}->copy($opts{outpath});
1624 0         0 $done = 1;
1625             } else {
1626 2         6 $opts{outpath} = $opts{inpath_sans_sheet};
1627             warn "> No conversion needed, returning ", qsh($opts{outpath}),"\n"
1628 2 50       5 if $opts{verbose};
1629 2         5 $done = 1;
1630             }
1631             }
1632             }
1633             else {
1634             # Special case #2: <allsheets> with input already a csv:
1635             # Leave a symlink to the input in the <outpath> directory.
1636 0 0       0 croak "transcoding not implemented in this situation"
1637             if ($input_enc ne $output_enc);
1638 0         0 my $outpath = path(_final_outpath(\%opts));
1639 0         0 $outpath->mkpath; # nop if exists, croaks if conflicts with file
1640 0         0 my $dest = $outpath->child( $opts{ifbase}.".csv" );
1641 0         0 my $inpath = $opts{inpath_sans_sheet};
1642 0         0 my $s = eval{ symlink($inpath, $dest) };
  0         0  
1643 0 0 0     0 if ($@ or !$s) { # symlink unimplmented or insufficient permissions
1644 0 0       0 btw dvis '>> $@' if $opts{debug};
1645             warn "> No conversion needed! Copying into ", qsh($dest),"\n"
1646 0 0       0 if $opts{verbose};
1647 0         0 $opts{inpath_sans_sheet}->copy($dest);
1648             } else {
1649             warn "> No conversion needed! Left symlink at ", qsh($dest),"\n"
1650 0 0       0 if $opts{verbose};
1651             }
1652 0         0 $done = 1;
1653             }
1654             }
1655 2 50       4 if (! $done) {
1656 0 0       0 if ($opts{allsheets}) {
1657 0         0 _extract_all_csvs(\%opts);
1658             }
1659             else {
1660             # Result will be a single file.
1661 0 0       0 if ($opts{cvt_to} eq "csv") {
1662 0         0 _extract_one_csv(\%opts);
1663             } else {
1664 0         0 _write_spreadsheet(\%opts);
1665             }
1666             }
1667             }
1668             my $result = {
1669             defined($output_enc) ? (encoding => $output_enc):(),
1670 8         33 (map{ my $v = $opts{$_};
1671 8 100       48 ($_ => (blessed($v) ? $v->stringify : $v))
1672 2 50       8 } grep{ defined $opts{$_} }
  10         22  
1673             qw/inpath_sans_sheet outpath cvt_from cvt_to sheetname/)
1674             };
1675             log_call [\%input_opts], [$result, \_fmt_outpath_contents($result)]
1676 2 50       8 if $opts{verbose};
1677              
1678 2         8 $result;
1679             }#convert_spreadsheet
1680              
1681             # Open as a CSV, intuiting input encoding, converting spreadsheet if necessary.
1682             #
1683             # :crlf translation is enabled on the resulting file handle, which converts
1684             # DOS CR,LF to \n while passing *nix bare LF through unmolested.
1685             #
1686             # Input argument(s) are the same as for convert_spreadsheet (except
1687             # outpath may not be specified).
1688             #
1689             # Returns a hash containing the file handle and other information.
1690             sub OpenAsCsv {
1691 2 50   2 1 17 my %opts = (
1692             (@_ == 1 ? (inpath => $_[0]) : (@_)),
1693             cvt_to => 'csv',
1694             );
1695             # TODO: Rename {path} to {inpath} in all usages and rm this cruft;
1696             carp "Obsolete OpenAsCsv usage: Change path to inpath\n"
1697 2 50 33     8 if exists($opts{path}) and !$opts{silent};
1698 2   33     26 $opts{inpath} //= delete $opts{path}; # be compatible with old API
1699              
1700 2         9 my $inpath = delete $opts{inpath};
1701 2 50       9 croak "OpenAsCsv: missing 'inpath' option\n" unless $inpath;
1702 2 50       7 croak "OpenAsCsv: outpath may not be specified\n" if $opts{outpath};
1703              
1704 2         10 my $h = convert_spreadsheet($inpath, %opts, verbose => $opts{debug});
1705 2 50       8 oops "sheetname key bug" if exists $h->{sheet};
1706              
1707 2   33     8 my $csvpath = $h->{outpath} // oops; # same as {inpath} if already a CSV
1708 2 50       96 open my $fh, "<", $csvpath or croak "$csvpath : $!\n";
1709 2 50       60 binmode $fh, ":crlf:encoding(".$h->{encoding}.")" or die "binmode:$!";
1710              
1711             my $r = {
1712             fh => $fh,
1713             csvpath => $csvpath,
1714             inpath => $inpath,
1715 2 100       130 (map{ exists($h->{$_}) ? ($_ => $h->{$_}) : () }
  10         44  
1716             qw/inpath_sans_sheet sheetname encoding tempdir raw_values/),
1717             };
1718              
1719 2         27 return $r;
1720             }
1721              
1722             1;
1723             __END__
1724              
1725             =pod
1726              
1727             =head1 NAME
1728              
1729             Spreadsheet::Edit::IO - convert between spreadsheet and csv files
1730              
1731             =head1 SYNOPSIS
1732              
1733             use Spreadsheet::Edit::IO qw/
1734             convert_spreadsheet OpenAsCsv
1735             cx2let let2cx
1736             @sane_CSV_read_options @sane_CSV_write_options/;
1737              
1738             # Open a CSV file or result of converting a sheet from a spreadsheet
1739             my $hash = OpenAsCsv("/path/to/spreadsheet.odt!Sheet1"); # single-arg form
1740             my $hash = OpenAsCsv(inpath => "/path/to/spreadsheet.odt",
1741             sheetname -> "Sheet1");
1742              
1743             print "Reading ",$hash->{csvpath}," with encoding ",$hash->{encoding},"\n";
1744             while (<$hash->{fh}>) { ... }
1745              
1746             # Convert CSV to spreadsheet in temp file (deleted at process exit)
1747             $hash = convert_spreadsheet(inpath => "mycsv.csv", cvt_to => "xlsx");
1748             print "Output is $hash->{outpath}\n"; # e.g. "/tmp/dwYT6qf/mycsv.xlsx"
1749              
1750             # Convert *all* sheets to CSV files in a temp directory
1751             $hash = convert_spreadsheet(inpath => "mywork.xls", cvt_to => "csv",
1752             allsheets => 1);
1753             opendir $dh, $hash->{outpath};
1754             while (readrir($h)) { say "Output csv file is $_" }
1755              
1756             # Transcode a CSV from windows-1252 to UTF-8
1757             convert_spreadsheet(
1758             inpath => "input.csv", input_encoding => 'windows-1252',
1759             outpath => "output.csv", output_encodoutg => 'UTF-8',
1760             );
1761              
1762             # Translate between 0-based column index and letter code (A, B, etc.)
1763             print cx2let(0); # "A"
1764             print let2cx("A"); # 0
1765             print cx2let(26); # "AA"
1766             print let2cx("ABC"); # 730
1767              
1768             # Extract components from "filepath!SHEETNAME" specifiers
1769             my $path = filepath_from_spec("/path/to/spreasheet.xls!Sheet1")
1770             my $sheetname = sheetname_from_spec("/path/to/spreasheet.xls!Sheet1")
1771              
1772             # Parse a csv file with sane options
1773             my $csv = Text::CSV->new({ @sane_CSV_read_options, eol => $hash->{eol} })
1774             or die "ERROR: ".Text::CSV->error_diag ();
1775             my @rows
1776             while (my $F = $csv->getline( $infh )) {
1777             push @rows, $F;
1778             }
1779             close $infh or die "Error reading ", $hash->csvpath(), ": $!";
1780              
1781             # Write a csv file with sane options
1782             my $ocsv = Text::CSV->new({ @sane_CSV_write_options })
1783             or die "ERROR: ".Text::CSV->error_diag ();
1784             open my $outfh, ">:encoding(utf8)", $outpath
1785             or die "$outpath: $!";
1786             foreach (@rows) { $ocsv->print($outfh, $_) }
1787             close $outfh or die "Error writing $outpath: $!";
1788              
1789             =head1 DESCRIPTION
1790              
1791             Convert between CSV and spreadsheet files using external tools, plus some utility functions
1792              
1793             Currently this uses LibreOffice or OpenOffice (whatever is installed). An external tool is not needed when only CSV files are involved.
1794              
1795             =head2 $hash = OpenAsCsv INPUT
1796              
1797             =head2 $hash = OpenAsCsv inpath => INPUT, sheetname => SHEETNAME, ...
1798              
1799             This is a thin wrapper for C<convert_spreadsheet> followed by C<open>
1800              
1801             If a single argument is given it specifies INPUT; otherwise all arguments must
1802             be specified as key => value pairs, and may include any options supported
1803             by C<convert_spreadsheet>.
1804              
1805             INPUT may be a csv or spreadsheet workbook path; if a spreadsheet,
1806             then a single "sheet" is converted, specified by either a !SHEETNAME suffix
1807             in the INPUT path, a separate C<< sheetname => SHEETNAME >> option,
1808             or unspecified to extract the only sheet (croaks if there is more than one).
1809              
1810             The resulting file handle refers to a guaranteed-seekable CSV file;
1811             this will either be a temporary file (auto-removed at process exit),
1812             or the original INPUT if it was already a seekable csv file.
1813              
1814             RETURNS: A ref to a hash containing the following:
1815              
1816             {
1817             fh => the resulting open file handle
1818             csvpath => the path {fh} refers to, which might be a temporary file
1819             sheetname => sheet name if the input was a spreadsheet
1820             }
1821              
1822             =head2 convert_spreadsheet INPUT, cvt_to=>suffix, OPTIONS
1823              
1824             =head2 convert_spreadsheet INPUT, cvt_to=>"csv", allsheets => 1, OPTIONS
1825              
1826             Convert CSV to spreadsheet or vice-versa, or transcode CSV to CSV.
1827              
1828             RETURNS: A ref to a hash containing:
1829              
1830             {
1831             outpath => path to the output file (or directory with 'allsheets')
1832             (a temporary file/dir or as you specified in OPTIONS).
1833              
1834             encoding => the encoding used when writing .csv files
1835             }
1836              
1837             INPUT is the input file path; it may be a separate first argument as
1838             shown above, or else included in OPTIONS as C<< inpath =E<gt> INPUT >>.
1839              
1840             If C<outpath =E<gt> OUTPATH> is specifed then results are I<always> saved
1841             to that path. With C<allsheets> this is a directory, which will be created
1842             if necessary.
1843              
1844             If C<outpath> is NOT specified in OPTIONS then, with one exception,
1845             results are saved to a temporary file or directory and that path is returned
1846             as C<outpath> in the result hash.
1847             The exception is if no conversion is necessary
1848             (i.e. C<cvt_from> is the same as C<cvt_to>), when the
1849             input file itself is returned as C<outpath>.
1850              
1851             In all cases C<outpath> in the result hash points to the results.
1852              
1853             C<cvt_to> or C<cvt_from> are filename suffixes (sans dot)
1854             e.g. "csv", "xlsx", etc., and need not be specified when indicated by
1855             C<outpath> or INPUT parameters.
1856              
1857             OPTIONS may also include:
1858              
1859             =over 4
1860              
1861             =item sheetname => "sheet name"
1862              
1863             The workbook 'sheet' name used when reading or writing a spreadsheet.
1864             An input sheet name may also be specified as "!sheetname" appended to
1865             the INPUT path.
1866              
1867             =item allsheets => BOOL
1868              
1869             B<All> sheets in the input
1870             are converted to separate .csv files named "SHEETNAME.csv" in
1871             the 'outpath' directory. C<< cvt_to =E<gt> 'csv' >> is also requred.
1872              
1873             =item input_encoding => ENCODING
1874              
1875             Specifies the encoding of INPUT if it is a csv file.
1876              
1877             ENCODING may be a comma-separated list of encoding
1878             names which will be tried in the order until one seems to work.
1879             If only one is specified it will be used without trying it first.
1880             The default is "UTF-8,windows-1252".
1881              
1882             =item output_encoding => ENCODING
1883              
1884             Used when writing csv file(s), defaults to 'UTF-8'.
1885              
1886             =item col_formats => [...]
1887              
1888             This specifies how CSV data is imported into a spreadsheet. Each element
1889             of the array may contain:
1890              
1891             undef, "standard" or "" (LibreOffice will auto-detect)
1892             "text" (imported as unmolested text)
1893             "MM/DD/YY",
1894             "DD/MM/YY",
1895             "YY/MM/DD",
1896             "ignore" (do not import this column)
1897              
1898             Elements may also contain the numeric format codes defined by LibreOffice
1899             at L<https://wiki.documentfoundation.org/Documentation/DevGuide/Spreadsheet_Documents#Filter_Options_for_the_CSV_Filter>
1900              
1901             B<Automatic format detection:>
1902             Input CSV data is pre-scanned to auto-detect column formats
1903             as much as possible. This usually works well as long as dates are
1904             represented unambiguously, e.g. "2021-01-01" or "Jan 1, 2023".
1905              
1906             Specifically, this detects leading zeroes such as in U.S. Zip Codes,
1907             and MM/DD/YY or DD/MM/YY dates when a DD happens to be more than 12.
1908              
1909             =item verbose => BOOL
1910              
1911             =back
1912              
1913             =head3 B<'binmode' Argument For Reading result CSVs>
1914              
1915             It is not possible to control the line-ending style in output CSV files,
1916             but the following incantation will correctly read either DOS/Windows (CR,LF)
1917             or *nix (LF) line endings properly, i.e. as a single \n:
1918              
1919             open my $fh, "<", $resulthash->{outpath};
1920             my $enc = $resulthash->{encoding};
1921             binmode($fh, ":raw:encoding($enc):crlf");
1922              
1923             =head2 @sane_CSV_read_options
1924              
1925             =head2 @sane_CSV_write_options
1926              
1927             These contain options you will always want to use with
1928             S<<< C<< Text::CSV->new() >> >>>.
1929             Specifically, quotes and embedded newlines are handled correctly.
1930              
1931             Not exported by default.
1932              
1933             =head2 cx2let COLUMNINDEX
1934              
1935             =head2 let2cx LETTERCODE
1936              
1937             Functions which translate between spreadsheet-column
1938             letter codes ("A", "B", etc.) and 0-based column indicies.
1939             Not exported by default.
1940              
1941             =head2 filepath_from_spec EXPR
1942              
1943             =head2 sheetname_from_spec EXPR
1944              
1945             Functions which decompose strings containing a spreadsheet path and possibly sheetname
1946             suffix, of the form "FILEPATH!SHEETNAME", "FILEPATH|||SHEETNAME", or "FILEPATH[SHEETNAME]".
1947             C<sheetname_from_spec> returns C<undef> if the input does not have a
1948             a sheetname suffix.
1949             Not exported by default.
1950              
1951             =head2 form_spec_with_sheetname(PATH, SHEENAME)
1952              
1953             Composes a combined string in a "preferred" format (currently "PATH!SHEETNAME").
1954             Not exported by default.
1955              
1956             =head1 Testing if LibreOffice etc. is Installed
1957              
1958             =head2 $bool = can_cvt_spreadsheets();
1959              
1960             =head2 $bool = sub can_extract_allsheets();
1961              
1962             =head2 $bool = can_extract_named_sheet();
1963              
1964             These feature-test functions return false if the corresponding operations
1965             are not possible because LibreOffice (or, someday gnumeric) is not installed
1966             or is an older version which does not have needed capabilities.
1967              
1968             =head2 $path = openlibreoffice_path();
1969              
1970             Returns the detected path of I<soffice> (Apache Open Office or Libre Office)
1971             or undef if not found.
1972              
1973             =head1 SEE ALSO
1974              
1975             L<Spreadsheet::Edit> and L<Text::CSV>
1976              
1977             =cut
1978