File Coverage

blib/lib/File/Unpack.pm
Criterion Covered Total %
statement 52 58 89.6
branch n/a
condition n/a
subroutine 18 18 100.0
pod n/a
total 70 76 92.1


line stmt bran cond sub pod time code
1             #
2             # (C) 2010-2014, jnw@cpan.org, all rights reserved.
3             # Distribute under the same license as Perl itself.
4             #
5             #
6             # sudo zypper -v in perl-Compress-Raw-Zlib
7             # -> 'nothing to do'
8             # sudo zypper -v in 'perl-Compress-Raw-Zlib >= 2.027'
9             # -> 'perl' providing 'perl-Compress-Raw-Zlib >= 2.027' is already installed.
10             # sudo zypper -v in --force perl-Compress-Raw-Zlib
11             # -> works,
12             # sudo zypper -v in --from 12 perl-Compress-Raw-Zlib
13             # -> works, if d.l.p is repo #12.
14             #
15             # TODO:
16             # * evaluate File::Extract - Extract Text From Arbitrary File Types
17             # (HTML, PDF, Plain, RTF, Excel)
18             #
19             # * make taint checks really check things, instead of $1 if m{^(.*)$};
20             #
21             # * Implement disk space monitoring.
22             #
23             # * formats:
24             # - use lzmadec/xzdec as fallback to lzcat.
25             # - glest has bzipped tar files named glest-1.0.10-data.tar.bz2.tar;
26             # - Not all suffixes are appended by . e.g. openh323-v1_15_2-src-tar.bz2 is different.
27             # - gzip -dc can unpack old compress .Z, add its mime-type
28             # - java-1_5_0-sun hides zip-files in shell scripts with suffix .bin
29             # - cpio fails on \.delta\.rpm
30             # - rpm files should extract all header info in readable format.
31             # - do we rely on rpm2cpio to handle them all:
32             # rpm -qp --nodigest --nosignature --qf "%{PAYLOADCOMPRESSOR}" $f
33             # - m{\.(otf|ttf|ps|eps)$}i
34             # - application/x-frame # xorg-modular/doc/xorg-docs/specs/XPRINT/xp_libraryTOC.doc
35             #
36             # * blacklisting?
37             # # th_en_US.dat is an 11MB thesaurus in OOo
38             # skip if $from =~ m{(/(ustar|pax)\-big\-\d+g\.tar\.bz2|/th_en_US\.dat|/testtar\.tar|\.html\.(ru|ja|ko\.euc-kr|fr|es|cz))$}
39             #
40             # * use LWP::Simple::getstore() if $archive =~ m{^\w+://}
41             # * application/x-debian-package is a 'application/x-archive' -> (ar xv /dev/stdin) < $qufrom";
42             # * application/x-iso9660 -> "isoinfo -d -i %(src)s"
43             # * PDF improvement: okular says: 'this document contains embedded files.' How can we grab those?
44              
45 1     1   30233 use warnings;
  1         3  
  1         35  
46 1     1   6 use strict;
  1         2  
  1         66  
47              
48             package File::Unpack;
49              
50             BEGIN
51             {
52             # Requires: shared-mime-info
53 1     1   65 eval 'use File::LibMagic;'; # only needed in mime(); mime() dies, if missing
  1     1   474  
  0         0  
  0         0  
54 1     1   54 eval 'use File::MimeInfo::Magic;'; # only needed in mime(); okay, if missing.
  1         6685  
  1         10787  
  1         65  
55             # unless builtin!
56 1     1   68 eval 'use Compress::Raw::Lzma;'; # only needed in mime(); for finding lzma.
  1         562  
  0         0  
  0         0  
57 1     1   61 eval 'use Compress::Raw::Bzip2;'; # only needed in mime(); for finding second level types
  1         1152  
  1         1869  
  1         168  
58 1     1   62 eval 'use Compress::Raw::Zlib;'; # only needed in mime(); for finding second level types
  1         1215  
  1         6458  
  1         312  
59 1     1   69 eval 'use BSD::Resource;'; # setrlimit
  1         1006  
  1         17702  
  1         7  
60 1     1   381 eval 'use Filesys::Statvfs;'; # statvfs();
  1         955  
  1         731  
  1         68  
61             }
62              
63 1     1   7 use Carp;
  1         2  
  1         65  
64 1     1   7 use File::Path;
  1         2  
  1         50  
65 1     1   1287 use File::Temp (); # tempdir() in _run_mime_helper.
  1         28265  
  1         26  
66 1     1   1041 use File::Copy ();
  1         12507  
  1         30  
67 1     1   1779 use File::Compare ();
  1         1795  
  1         22  
68 1     1   1573 use JSON;
  1         27428  
  1         7  
69 1     1   1280 use String::ShellQuote; # used in _prep_configdir
  1         1043  
  1         127  
70 1     1   503 use IPC::Run; # implements File::Unpack::run()
  0            
  0            
71             use Text::Sprintf::Named; # used to parse @builtin_mime_helpers
72             use Cwd 'getcwd'; # run(), moves us there and back.
73             use Data::Dumper;
74             use POSIX ();
75              
76             =head1 NAME
77              
78             File::Unpack - A strong bz2/gz/zip/tar/cpio/rpm/deb/cab/lzma/7z/rar/... archive unpacker, based on mime-types
79              
80             =head1 VERSION
81              
82             Version 0.69
83             =cut
84              
85             # We'll have 1.x versions only after minfree() has a baseline implementation.
86             # Please run perl Makefile.PL after changing the version here.
87             our $VERSION = '0.69';
88              
89             POSIX::setlocale(&POSIX::LC_ALL, 'C');
90             $ENV{PATH} = '/usr/bin:/bin';
91             $ENV{SHELL} = '/bin/sh';
92             delete $ENV{ENV};
93              
94             # what we name the temporary directories, while helpers are working.
95             my $TMPDIR_TEMPL = '_fu_XXXXX';
96              
97             # no longer used by the tick-tick ticker to show where we are.
98             # my $lsof = '/usr/bin/lsof';
99              
100             # Compress::Raw::Bunzip2 needs several 100k of input data, we special case this.
101             # File::LibMagic wants to read ca. 70k of input data, before it says application/vnd.ms-excel
102             # Anything else works with 1024.
103             my $UNCOMP_BUFSZ = 1024;
104              
105             # unpack will give up, after unpacking that many levels. It is more likely we
106             # got into a loop by then, than really have that many levels.
107             my $RECURSION_LIMIT = 200;
108              
109             # Suggested place, where admins should install the helpers bundled with this module.
110             sub _default_helper_dir { $ENV{FILE_UNPACK_HELPER_DIR}||'/usr/share/File-Unpack/helper' }
111              
112             # we use '=' in the mime_name, this expands to '/(x\-|ANY\+)?'
113             ##
114             ## Caution: always use (?: ... ) below for grouping, so that no extra capturing clauses are created.
115              
116             my @builtin_mime_helpers = (
117             # mimetype pattern # suffix_re # command with redirects, as defined with IPC::Run::run
118              
119             # Requires: xz bzip2 gzip unzip lzip
120             [ 'application=x-lzip', qr{(?:lz)}, [qw(/usr/bin/lzip -dc %(src)s)], qw(> %(destfile)s) ],
121             [ 'application=xz', qr{(?:xz|lz(ma)?)}, [qw(/usr/bin/lzcat)], qw(< %(src)s > %(destfile)s) ],
122             [ 'application=xz', qr{(?:xz|lz(ma)?)}, [qw(/usr/bin/xz -dc %(src)s)], qw(> %(destfile)s) ],
123             [ 'application=lzma', qr{(?:xz|lz(ma)?)}, [qw(/usr/bin/lzcat)], qw(< %(src)s > %(destfile)s) ],
124             [ 'application=lzma', qr{(?:xz|lz(ma)?)}, [qw(/usr/bin/xz -dc %(src)s)], qw(> %(destfile)s) ],
125             [ 'application=bzip2', qr{bz2}, [qw(/usr/bin/bunzip2 -dc -f %(src)s)], qw(> %(destfile)s) ],
126             [ 'application=gzip', qr{(?:gz|Z)}, [qw(/usr/bin/gzip -dc -f %(src)s)], qw(> %(destfile)s) ],
127             [ 'application=compress', qr{(?:gz|Z)}, [qw(/usr/bin/gzip -dc -f %(src)s)], qw(> %(destfile)s) ],
128              
129             # Requires: sharutils
130             [ 'text=uuencode', qr{uu}, [qw(/usr/bin/uudecode -o %(destfile)s %(src)s)] ],
131              
132             # Requires: upx
133             [ 'application=upx', qr{(?:upx\.exe|upx)}, [qw(/usr/bin/upx -q -q -q -d -o%(destfile)s %(lsrc)s) ] ],
134              
135             # xml.summary.Mono.Security.Authenticode is twice inside of monodoc-1.0.4.tar.gz/Mono.zip/ -> use -o
136             [ 'application=zip', qr{(?:zip|jar|sar)}, [qw(/usr/bin/unzip -P no_pw -q -o %(src)s)] ],
137              
138             # Requires: unrar
139             [ 'application=rar', qr{rar}, [qw(/usr/bin/unrar x -o- -p- -inul -kb -y %(src)s)] ],
140             # Requires: lha
141             [ 'application=x-lha', qr{lha}, [qw(/usr/bin/lha x -q %(src)s)] ],
142              
143             # Requires: binutils
144             [ 'application=archive', qr{(?:a|ar|deb)}, [qw(/usr/bin/ar x %(src)s)] ],
145             [ 'application=x-deb', qr{deb}, [qw(/usr/bin/ar x %(src)s)] ],
146             [ 'application=x-debian-package', qr{deb}, [qw(/usr/bin/ar x %(src)s)] ],
147              
148             # Requires: cabextract
149             [ 'application/vnd.ms-cab-compressed', qr{cab}, [qw(/usr/bin/cabextract -q %(src)s)] ],
150              
151             # Requires: p7zip
152             [ 'application/x-7z-compressed', qr{7z}, [qw(/usr/bin/7z x -pPass -y %(src)s)] ],
153              
154             # Requires: tar rpm cpio
155             [ 'application=tar', qr{(?:tar|gem)}, [\&_locate_tar, qw(-xf %(src)s)] ],
156             [ 'application=tar+bzip2', qr{(?:tar\.bz2|tbz)}, [\&_locate_tar, qw(-jxf %(src)s)] ],
157             [ 'application=tar+gzip', qr{t(?:ar\.gz|gz)}, [\&_locate_tar, qw(-zxf %(src)s)] ],
158             # [ 'application=tar+gzip', qr{t(?:ar\.gz|gz)}, [qw(/home/testy/src/C/slowcat)], qw(< %(src)s |), [\&_locate_tar, qw(-zxf -)] ],
159             [ 'application=tar+lzma', qr{tar\.(?:xz|lzma|lz)}, [qw(/usr/bin/lzcat)], qw(< %(src)s |), [\&_locate_tar, qw(-xf -)] ],
160             [ 'application=tar+lzma', qr{tar\.(?:xz|lzma|lz)}, [qw(/usr/bin/xz -dc -f %(src)s)], '|', [\&_locate_tar, qw(-xf -)] ],
161             [ 'application=rpm', qr{(?:src\.r|s|r)pm}, [qw(/usr/bin/rpm2cpio %(src)s)], '|', [\&_locate_cpio_i] ],
162             [ 'application=cpio', qr{cpio}, [\&_locate_cpio_i], qw(< %(src)s) ],
163              
164             # Requires: poppler-tools
165             [ 'application=pdf', qr{pdf}, [qw(/usr/bin/pdftotext %(src)s %(destfile)s.txt)], '&', [qw(/usr/bin/pdfimages -j %(src)s pdfimages)] ],
166             );
167              
168             ## CAUTION keep _my_shell_quote in sync with all _locate_* functions.
169             sub _locate_tar
170             {
171             my $self = shift;
172             return @{$self->{_locate_tar}} if defined $self->{_locate_tar};
173              
174             # cannot use tar -C %(destdir)s, we rely on being chdir'ed inside already :-)
175             # E: /bin/tar: /tmp/xxx/_VASn/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_: Cannot chdir: Permission denied
176              
177             my @tar = (-f '/bin/tar' ? '/bin/tar' : '/usr/bin/tar' );
178             ## osc co loves to create directories with : in them.
179             ## Tell tar to accept such directories as directores.
180             push @tar, "--force-local"
181             unless $self->run([@tar, "--force-local", "--help"], { out_err => '/dev/null' });
182             push @tar, "--no-unquote"
183             unless $self->run([@tar, "--no-unquote", "--help"], { out_err => '/dev/null'});
184              
185             $self->{_locate_tar} = \@tar;
186             return @tar;
187             }
188              
189             sub _locate_cpio_i
190             {
191             my $self = shift;
192             return @{$self->{_locate_cpio_i}} if defined $self->{_locate_cpio_i};
193              
194             my @cpio_i = ('/usr/bin/cpio', '-idm');
195             $cpio_i[1] .= 'u'
196             unless run(['/usr/bin/cpio', '-idmu', '--usage'], {out_err => '/dev/null'});
197             push @cpio_i, '--sparse'
198             unless run([@cpio_i, '--sparse', '--usage'], {out_err => '/dev/null'});
199             push @cpio_i, '--no-absolute-filenames'
200             unless run([@cpio_i, '--no-absolute-filenames', '--usage'], {out_err => '/dev/null'});
201             push @cpio_i, '--force-local'
202             unless run([@cpio_i, '--force-local', '--usage'], {out_err => '/dev/null'});
203              
204             @{$self->{_locate_cpio_i}} = \@cpio_i;
205             return @cpio_i;
206             }
207              
208             =head1 SYNOPSIS
209              
210             This perl module comes with an executable script:
211              
212             /usr/bin/file_unpack -h
213              
214             /usr/bin/file_unpack [-1] [-m] ARCHIVE_FILE ...
215              
216              
217             File::Unpack is an unpacker for archives and files
218             (bz2/gz/zip/tar/cpio/iso/rpm/deb/cab/lzma/7z/rar ... pdf/odf) based on
219             MIME types. We call it strong, because it is not fooled by file suffixes, or
220             multiply wrapped packages. It recursively descends into each archive found
221             until it finally exposes all unpackable payload contents.
222              
223             A logfile can be written, precisely describing MIME types and unpack actions.
224              
225             use File::Unpack;
226              
227             my $log;
228             my $u = File::Unpack->new(logfile => \$log);
229              
230             my $m = $u->mime('/etc/init.d/rc');
231             print "$m->[0]; charset=$m->[1]\n";
232             # text/x-shellscript; charset=us-ascii
233              
234             map { print "$_->{name}\n" } @{$u->mime_helper()};
235             # application/%rpm
236             # application/%tar+gzip
237             # application/%tar+bzip2
238             # ...
239              
240             $u->unpack("inputfile.tar.bz2");
241             while ($log =~ m{^\s*"(.*?)":}g) # it's JSON.
242             {
243             print "$1\n"; # report all files unpacked
244             }
245              
246             ...
247              
248             Most of the known archive file formats are supported. Shell-script-style
249             plugins can be added to support additinal formats.
250              
251             Helper shell-scripts can be added to support additional mime-types. Example:
252              
253             F<< $ echo "ar x $1" > /usr/share/File-Unpack/helper/application=x-debian-package >>
254              
255             F<< $ chmod a+x /usr/share/File-Unpack/helper/application=x-debian-package >>
256              
257             This example creates a trivial external equivalent of the builtin MIME helper for *.deb packages.
258             For details see the documentation of the C method.
259              
260             C examines the contents of an archive file or directory using an extensive
261             mime-type analysis. The contents is unpacked recursively to the given destination
262             directory; a listing of the unpacked files is reported through the built in
263             logging facility during unpacking. Most common archive file formats are handled
264             directly; more can easily be added as mime-type helper plugins.
265              
266             =head1 SUBROUTINES/METHODS
267              
268             =head2 new
269              
270             my $u = new(destdir => '.', logfile => \*STDOUT, maxfilesize => '2G', verbose => 1,
271             world_readable => 0, one_shot => 0, no_op => 0, archive_name_as_dir => 0,
272             follow_file_symlinks => 0,
273             log_params => {}, log_type => 'JSON');
274              
275             Creates an unpacker instance. The parameter C must be a writable location; all output
276             files and directories are placed inside this destdir. Subdirectories will be
277             created in an attempt to reflect the structure of the input. Destdir defaults
278             to the current directory; relative paths are resolved immediatly, so that
279             chdir() after calling new is harmless.
280              
281             The parameter C can be a reference to a scalar, a filename, or a filedescriptor.
282             The logfile starts with a JSON formatted prolog, where all lines start
283             with printable characters.
284             For each file unpacked, a one line record is appended, starting with a single
285             whitespace ' ', and terminated by "\n". The format is a JSON-encoded C<< "key":
286             {value},\n >> pair, where key is the filename, and value is a hash including 'mime',
287             'size', and other information.
288             The logfile is terminated by an epilog, where each line starts with a printable character.
289             As part of the epilog, a dummy file named "\" with an empty hash is added to the list.
290             It should be ignored while parsing.
291             Per default, the logfile is sent to STDOUT.
292              
293             The parameter C is a safeguard against compressed sparse files and
294             test-files for archivers. Such files could easily fill up any available disk
295             space when unpacked. Files hitting this limit will be silently truncated.
296             Check the logfile records or epilog to see if this has happened. BSD::Resource
297             is used manipulate RLIMIT_FSIZE.
298              
299             The parameter C can optionally be set to non-zero, to limit unpacking
300             to one step of unpacking. Unpacking of well known compressed archives like
301             e.g. '.tar.bz2' is considered one step only. If uncompressing an archive is
302             considered an extra step before unpacking the archive depends on the configured
303             mime helpers.
304              
305             The parameter C causes unpack() to only print one shell command to
306             STDOUT and exit. This implies one_shot=1.
307              
308             The parameter C causes unpack() change all directories to 0755,
309             and all files to 444. Otherwise 0700 and 0400 (user readable) is asserted.
310              
311             The parameter C causes some or all symlinks to files
312             to be included.
313             A value of 1 follows symlinks that exist in the input directory and point to a file.
314             This has no effect if the input is an archive file. A value of 2 also follows symlinks
315             that were extracted from archives. CAUTION: This may cause unpack() to visit
316             files or archives elsewhere in the local filesystem.
317             Directory symlinks are always excluded.
318              
319             The parameter C causes the unpacker to store all unpacked
320             files inside a directory with the same name as their archive.
321              
322             The default depends on how many files are unpacked from the archive: If exactly one
323             file (or one toplevel directory) is unpacked, then no extra directory is used.
324             E.g. F would unpack to F or
325             F would unpack to F and no files outside this directory.
326             If multiple files (or directories) are unpacked, and the suffix of the archive can
327             be removed with the C of its C, then the
328             shortened name is used as a directory. E.g. F would unpack to
329             F. Otherwise F<._> is appended to the archive name. E.g. F would unpack to
330             F.
331              
332             In any case, the suffix F<._> or F<._B> is used to avoid conflicts with
333             already existing names where B is a numeric value.
334              
335             =head2 exclude
336              
337             exclude(add => ['.svn', '*.orig' ], del => '.svn', force => 1, follow_file_symlinks => 0)
338              
339             Defines the exclude-list for unpacking. This list is advisory for the MIME helpers.
340             The exclude-list items are shell glob patterns, where '*' or '?' never match '/'.
341              
342             You can use force to have any of these removed after unpacking.
343             Use (vcs => 1) to exclude a long list of known version control system directories, use (vcs => 0) to remove them.
344             The default is C<< exclude(empty => 1) >>, which is the same as C<< exclude(empty_file => 1, empty_dir => 1) >> --
345             having the obvious meaning.
346              
347             (re => 1) returns the active exclude-list as a regexp pattern.
348             Otherwise C always returns the list as an array ref.
349              
350             Some symbolic links are included if {follow_file_symlinks} is nonzero. For details see C<>.
351              
352             If exclude patterns were effective, or if symlinks, fifos, sockets, ... were encountered during unpack(),
353             the logfile contains an additional 'skipped' keyword with statistics.
354              
355             =cut
356              
357             sub _glob_list_re
358             {
359             my @re;
360             return unless @_;
361             for my $text (@_)
362             {
363             # Taken from pdb2perl:glob2re() and adapted, to not match slashes in wildcards.
364             # This should be kept compatible with tar --exclude .
365              
366             $text =~ s{([\.\(\)\[\]\{\}])}{\\$1}g; ## protect magic re characters.
367             $text =~ s{\*}{[^/]*}g; ## * -> [^/]*
368             $text =~ s{\?}{[^/]}g; ## ? -> [^/]
369             push @re, $text;
370             }
371             return '(/|^)(' . join('|', @re) . ')(/|$)';
372             }
373              
374             sub _not_excluded
375             {
376             my $self = shift;
377             my ($dir, $file) = @_;
378              
379             return 1 unless my $re = $self->{exclude}{re};
380              
381             $dir ||= '';
382             $dir .= '/' unless $dir =~ m{/$};
383             $file = $dir . $file;
384              
385             return 0 if $file =~ m{$re};
386             return 1;
387             }
388              
389             sub exclude
390             {
391             my $self = shift;
392             my %opt = $#_ ? @_ : (add => $_[0]);
393            
394             # ADD to this list from: https://build.opensuse.org/project/show?project=devel%3Atools%3Ascm
395             my @vcs = qw(SCCS RCS CVS .svn .git .hg .osc);
396              
397             $opt{add} = [ $opt{add} ] unless ref $opt{add};
398             $opt{del} = [ $opt{del} ] unless ref $opt{del};
399              
400             push @{$opt{add}}, @vcs if defined $opt{vcs} and $opt{vcs};
401             push @{$opt{del}}, @vcs if defined $opt{vcs} and !$opt{vcs};
402              
403              
404             for my $a (@{$opt{add}})
405             {
406             $self->{exclude}{list}{$a}++ if defined $a;
407             }
408            
409             for my $a (@{$opt{del}})
410             {
411             delete $self->{exclude}{list}{$a} if defined $a;
412             }
413              
414             my @list = sort keys %{$self->{exclude}{list}};
415             $self->{exclude}{re} = _glob_list_re(@list);
416              
417             $opt{empty_dir} = $opt{empty_file} = $opt{empty} if defined $opt{empty};
418              
419             for my $o (qw(empty_file empty_dir force))
420             {
421             $self->{exclude}{$o} = $opt{$o} if defined $opt{$o};
422             }
423              
424             $self->{follow_file_symlinks} = $opt{follow_file_symlinks}
425             if defined $opt{follow_file_symlinks};
426              
427             return $opt{re} ? $self->{exclude}{re} : \@list;
428             }
429              
430             =begin private
431              
432             =item log, logf, loggable_pathname
433              
434             The C method is used by C to send text to the logfile.
435             The C method takes a filename and a hash, and logs a JSON formatted line.
436             The trailing newline character of a line is delayed; it is printed by the next call to
437             C or C. In case of C, a comma is emitted before the newline
438             from the second call onward.
439              
440             The C shortens a path to be relative to either
441             $self->{destdir} or $self->{input} unless $self->{log_fullpath} is true.
442             If a hash is provided as a second parameter and the path was found to be relative
443             to $self->{input}, then an entry { 'srcdir' => 'input' } is added to this hash.
444              
445             =end private
446              
447             =cut
448             sub log
449             {
450             my ($self, $text) = @_;
451             if (my $fp = $self->{lfp})
452             {
453             my $oldpos = eval { $fp->tell; }; # old perl at SLES11 has no IO::Handle::tell()
454             $fp->write($text) or die "log($self->{logfile}): write failed: $!\n";
455             my $r = eval { $fp->tell - $oldpos; };
456              
457             ## We do not expect any multibyte utf8 issues in here. It is plain 7-bit JSON.
458             ## E.g. /dev/null is not seekable. Be forgiving.
459             die "$oldpos,$r=log($self->{logfile}): write failed: $text\n" if $r and $r != length($text);
460             $self->{lfp_printed}++;
461             }
462             }
463              
464             sub loggable_pathname
465             {
466             my ($self, $file, $hash) = @_;
467              
468             unless ($self->{log_fullpath})
469             {
470             # very frequently, files are inside the destdir
471             unless ($file =~ s{^\Q$self->{destdir}\E/}{})
472             {
473             # less frequently, archives are logged inside the input dir
474             if ($self->{input})
475             {
476             if ($file =~ s{^\Q$self->{input}\E/}{\./input/./})
477             {
478             $hash->{srcdir} = 'input' if ref $hash eq 'HASH';
479             }
480             }
481             }
482             }
483             return $file;
484             }
485              
486             sub logf
487             {
488             my ($self,$file,$hash,$suff) = @_;
489             $suff = "" unless defined $suff;
490             my $json = $self->{json} ||= JSON->new()->ascii(1);
491             $file = $self->loggable_pathname($file, $hash);
492             if (my $fp = $self->{lfp})
493             {
494             if ($self->{log_type} eq 'plain')
495             {
496             my $str = $file . ' (';
497             $str .= $hash->{mime} if defined $hash->{mime};
498             $str .= ')';
499             $str = "# $str -> " . $hash->{unpacked} if $hash->{unpacked};
500             $str .= "\n";
501             $self->log($str);
502             }
503             else
504             {
505             $self->log(qq[{ "oops": "logf used before prolog??",\n"unpacked_files":{\n])
506             unless $self->{lfp_printed}; # sysseek($fp, 0, 1); # }} there is no systell() ...
507             my $str = $json->encode({$file => $hash});
508             $str =~ s{^\{}{}s;
509             $str =~ s{\}$}{}s;
510             my $pre = " ";
511             $pre = ",\n " if $self->{logf_continuation}++;
512             die "logf failed to encode newline char: $str\n" if $str =~ m{(?:\n|\r)};
513             $self->log("$pre$str$suff");
514             }
515             }
516             }
517              
518             $SIG{'XFSZ'} = sub
519             {
520             print STDERR "soft RLIMIT_FSIZE exceeded. SIGXFSZ recieved. Exiting\n";
521             exit;
522             };
523              
524             # if this returns 0, we test again and call it again, possibly.
525             # if this returns nonzero, we just continue.
526             sub _default_fs_warn
527             {
528             carp "Filesystem (@_) is almost full.\n $0 paused for 30 sec.\n";
529             sleep(30);
530             return 0;
531             }
532              
533             ## returns 1, if enough space free.
534             ## returns 0, if warn-method was called, and returned nonzero
535             ## returns -1, if no warn method
536             ## or does not return at all, and rechecks the status
537             ## with at least on second delay, if warn-method returns 0.
538             sub _fs_check
539             {
540             my ($self, $needed_b, $needed_i, $needed_p) = @_;
541             $needed_b = '1M' unless defined $needed_b; # bytes
542             $needed_i = 100 unless defined $needed_i; # inodes
543             $needed_p = 1.0 unless defined $needed_p; # percent
544             $needed_b = _bytes_unit($needed_b);
545              
546             my $DIR;
547             open $DIR, "<", $self->{destdir} or
548             opendir $DIR, $self->{destdir} or return;
549             ## fileno() does not work with opendir() handles.
550             my $fd = fileno($DIR); return unless defined $fd;
551              
552             for (;;)
553             {
554             my $st = eval { [ fstatvfs($fd) ] };
555             my $total_b = $st->[1] * $st->[2]; # f_frsize * f_blocks
556             my $free_b = $st->[0] * $st->[4]; # f_bsize * f_bavail
557             my $free_i = $st->[7]; # f_favail
558             my $perc = 100.0 * ($total_b - $free_b) / ($total_b||1);
559              
560             return 1 if $free_b >= $needed_b &&
561             $free_i >= $needed_i &&
562             (100-$perc > $needed_p);
563            
564             return -1 unless $self->{fs_warn};
565             my $w = $self->{fs_warn}->($self->{destdir}, $perc, $free_b, $free_i);
566             return 0 if $w;
567             sleep 1;
568             }
569             }
570              
571             sub new
572             {
573             my $self = shift;
574             my $class = ref($self) || $self;
575             my %obj = (ref $_[0] eq 'HASH') ? %{$_[0]} : @_;
576              
577             $obj{verbose} = 1 unless defined $obj{verbose};
578             $obj{destdir} ||= '.';
579             $obj{logfile} ||= \*STDOUT;
580             $obj{log_type} ||= 'json'; # or 'plain'
581             $obj{log_type} = lc $obj{log_type};
582             $obj{maxfilesize} = $ENV{'FILE_UNPACK_MAXFILESIZE'}||'2.5G' unless defined $obj{maxfilesize};
583             $obj{maxfilesize} = _bytes_unit($obj{maxfilesize});
584             $ENV{'FILE_UNPACK_MAXFILESIZE'} = $obj{maxfilesize}; # so that children see the same.
585              
586             mkpath($obj{destdir}); # abs_path is unreliable if destdir does not exist
587             $obj{destdir} = Cwd::fast_abs_path($obj{destdir});
588             $obj{destdir} =~ s{(.)/+$}{$1}; # assert no trailing '/'.
589              
590             # used in unpack() to jail mime_helpers deep inside destdir:
591             $obj{dot_dot_safeguard} = 20 unless defined $obj{dot_dot_safeguard};
592             $obj{jail_chmod0} ||= 0;
593             # used in unpack, print only:
594             $obj{no_op} ||= 0;
595             # used in unpack, blocks recursion after archive unpacking:
596             $obj{one_shot} ||= $obj{no_op};
597              
598             # With $self->{within_archives} we know the difference between symlinks found in
599             # the given repository or symlinks that were unpacked from an archive.
600             # Those from an archive are followed only with follow_file_symlinks == 2.
601             $obj{follow_file_symlinks} ||= 0;
602              
603             warn "WARNING: We are running as root: Malicious archives may clobber your filesystem.\n" if $obj{verbose} and !$>;
604              
605             if (ref $obj{logfile} eq 'SCALAR' or !(ref $obj{logfile}))
606             {
607             open $obj{lfp}, ">", $obj{logfile} or croak "open logfile $obj{logfile} failed: $!\n";
608             }
609             else
610             {
611             $obj{lfp} = $obj{logfile};
612             }
613             # make $obj{lfp} unbuffered, so that other processes can read line by line...
614             $obj{lfp}->autoflush(1);
615             $obj{lfp_printed} = 0;
616              
617             $obj{readable_file_modes} = [ 0400 ];
618             $obj{readable_dir_modes} = [ 0700, 0500 ];
619              
620             if ($obj{world_readable})
621             {
622             unshift @{$obj{readable_file_modes}}, 0444;
623             unshift @{$obj{readable_dir_modes}}, 0755;
624             }
625              
626             if ($obj{maxfilesize})
627             {
628             eval
629             {
630             no strict;
631             # helper/application=x-shellscript calls File::Unpack->new(), with defaults...
632             my @have = BSD::Resource::getrlimit(RLIMIT_FSIZE);
633             if ($have[0] == RLIM_INFINITY or $have[0] > $obj{maxfilesize})
634             {
635             # if RLIM_INFINITY is seen as an attempt to increase limits, we would fail. Ignore this.
636             BSD::Resource::setrlimit(RLIMIT_FSIZE, $obj{maxfilesize}, RLIM_INFINITY) or
637             BSD::Resource::setrlimit(RLIMIT_FSIZE, $obj{maxfilesize}, $obj{maxfilesize}) or
638             warn "RLIMIT_FSIZE($obj{maxfilesize}), limit=($have[0],$have[1]) failed\n";
639             }
640             };
641             if ($@)
642             {
643             carp "WARNING maxfilesize=$obj{maxfilesize} ignored:\n $@ $!\n Maybe package perl-BSD-Resource is not installed??\n\n";
644             }
645             }
646              
647             $obj{minfree}{factor} = 10 unless defined $obj{minfree}{factor};
648             $obj{minfree}{bytes} = '1M' unless defined $obj{minfree}{bytes};
649             $obj{minfree}{percent} = '1%' unless defined $obj{minfree}{percent};
650             minfree(\%obj, warning => $obj{fs_warn}||\&_default_fs_warn);
651              
652             $obj{exclude}{empty_dir} = 1 unless defined $obj{exclude}{empty_dir};
653             $obj{exclude}{empty_file} = 1 unless defined $obj{exclude}{empty_file};
654              
655             $self = bless \%obj, $class;
656              
657             for my $h (@builtin_mime_helpers)
658             {
659             $self->mime_helper(@$h);
660             }
661             $obj{helper_dir} = _default_helper_dir unless exists $obj{helper_dir};
662             $self->mime_helper_dir($obj{helper_dir}) if defined $obj{helper_dir} and -d $obj{helper_dir};
663              
664             unless ($ENV{PERL5LIB})
665             {
666             # in case we are using non-standard perl lib dirs, put them into the environment,
667             # so that any helper scripts see them too. They might need them, if written in perl.
668              
669             use Config;
670             my $pat = qr{^(?:\Q$Config{vendorlib}\E|\Q$Config{sitelib}\E|\Q$Config{privlib}\E)\b};
671             my @add; # all dirs, that come before the standard dirs.
672             for my $i (@INC)
673             {
674             last if $i =~ m{$pat};
675             push @add, $i;
676             }
677             $ENV{PERL5LIB} = join ':', @add if @add;
678             }
679              
680             return $self;
681             }
682              
683             sub DESTROY
684             {
685             my $self = shift;
686             # when unpack() processes an input, it should delete {lfp} afterwards.
687             # Added some 'or' cases, as $self->{input} might be empty, although we had processed an input.
688             #
689             # We rather catch an error, than produce incomplete output.
690             # This happens with ksh/ast-base.2012-08-01.tar.bz2 after unpack('.../ast-base.2012-08-01/src/cmd/pax/data/a'): not much file or directory
691             #
692             if (($self->{input} or
693             ($self->{lfp_printed}||0) or
694             ($self->{recursion_level}||0)) and $self->{lfp})
695             {
696             if ($self->{log_type} eq 'plain')
697             {
698             # pass
699             }
700             else
701             {
702             $self->log(sprintf(qq[{"pid":"%d", "unpacked":{], $$)) unless $self->{lfp_printed};
703             }
704             my $r = $self->{recursion_level}||0;
705              
706             # this should never happen.
707             # always delete $self->{lfp} manually, when done.
708             ## {{
709             my $msg = "unexpected destructor seen";
710             $msg = join('; ', @{$self->{error}}) if $self->{error};
711             if ($self->{log_type} eq 'plain')
712             {
713             $self->log("# error: (l=$self->{lfp_printed},r=$r): $msg\n");
714             }
715             else
716             {
717             $self->log(qq[\n}, "error":"(l=$self->{lfp_printed},r=$r): $msg"}\n]);
718             }
719             close $self->{lfp} if $self->{lfp} ne $self->{logfile};
720             delete $self->{lfp};
721             delete $self->{lfp_printed};
722             }
723             if ($self->{configdir})
724             {
725             rmtree($self->{configdir});
726             delete $self->{configdir};
727             }
728             }
729              
730             =head2 unpack
731              
732             $u->unpack($archive, [$destdir])
733              
734             Determines the contents of an archive and recursivly extracts its files.
735             An archive may be the pathname of a file or directory. The extracted contents will be
736             stored in F, where dest_name is the filename
737             component of archive without any leading pathname components, and possibly
738             stripped or added suffix. (Subdir defaults to ''.) If archive is a directory,
739             then dest_name will also be a directory. If archive is a file, the type of
740             dest_name depends on the type of packing: If the archive expands to multiple
741             files, dest_name will be a directory, otherwise it will be a file. If a file of
742             the same name already exists in the destination subdir, an additional subdir
743             component is created to avoid any conflicts.
744              
745             For each extracted file, a record is written to the logfile.
746             When unpacking is finished, the logfile contains one valid JSON structure.
747             Unpack achieves this by writing suitable prolog and epilog lines to the logfile.
748             The logfile can also be parsed line by line. All file records is one line and start
749             with a ' ' whitespace, and end in a ',' comma. Everything else is prolog or epilog.
750              
751             The actual unpacking is dispatched to MIME type specific helpers,
752             selected using C. A MIME helper can either be built-in code, or an
753             external shell-script found in a directory registered with
754             C. The standard place for external helpers is
755             F; it can be changed by the environment variable
756             F or the C parameter C.
757              
758             The naming of helper scripts is described under C.
759              
760             A MIME helper must have executable permission and is called with 6 parameters:
761             source_path, destfile, destination_path, mimetype, description, and config_dir.
762             Note, that destination_path is a freshly created empty working directory, even
763             if the unpacker is expected to unpack only a single file. The unpacker is
764             called after chdir into destination_path, so you usually do not need to
765             evaluate the third parameter.
766              
767             The directory C contains unpack configuration in .sh, .js and possibly
768             other formats. A MIME helper may use this information, but need not.
769             All data passed into C is reflected there, as well as the active exclude-list.
770             Using the config information can help a MIME helper to skip unwanted
771             work or otherwise optimize unpacking.
772              
773             C monitors the available filesystem space in destdir. If there is less space
774             than configured with C, a warning can be printed and unpacking is
775             optionally paused. It also monitors the MIME helpers progress reading the archive
776             at source_path and reports percentages to STDERR (if verbose is 1 or more).
777              
778             After the MIME helper is finished, C examines the files it created.
779             If it created no files in F, an error is reported, and the
780             F may be passed to other unpackers, or finally be added to the log as is.
781              
782             If the MIME helper wants to express that F is already unpacked as far as possible
783             and should be added to the log without any error messages, it creates a symbolic link
784             F pointing to F.
785              
786              
787             The system considers replacing the
788             directory with a file, if all of the following conditions are met:
789              
790             =over
791              
792             =item *
793              
794             There is exactly one file in the directory.
795              
796             =item *
797              
798             The file name is identical with the directory name,
799             except for one changed or removed
800             suffix-word. (*.tar.gz -> *.tar; or *.tgz -> *.tar)
801              
802             =item *
803              
804             The file must not already exist in the parent directory.
805              
806             =back
807              
808             C prepares 20 empty subdirectory levels and chdirs the unpacker
809             in there. This number can be adjusted using C<< new(dot_dot_safeguard => 20) >>.
810             A directory 20 levels up from the current working dir has mode 0 while
811             the MIME helper runs. C can optionally chmod(0) the parent of the subdirectory
812             after it chdirs the unpacker inside. Use C<< new(jail_chmod0 => 1) >> for this, default
813             is off. If enabled, a MIME helper trying to place files outside of the specified
814             destination_path may receive 'permission denied' conditions.
815              
816             These are special hacks to keep badly constructed
817             tar-balls, cpio-, or zip-archives at bay.
818              
819             Please note, that this can help against archives containing relative paths
820             (like starting with '../../../foo'), but will be ineffective with absolute paths
821             (starting with '/foo').
822             It is the responsibility of MIME helpers to not create absolute paths;
823             C should not be run as the root user, to minimize the risk of
824             compromising the root filesystem.
825              
826             A missing MIME helper is skipped, and subsequent helpers may take effect. A
827             MIME helper is expected to return an exit status of 0 upon success. If it runs
828             into a problem, it should print lines
829             starting with the affected filenames to stderr.
830             Such errors are recorded in the log with the unpacked archive, and as far as
831             files were created, also with these files.
832              
833             Symbolic links are ignored while unpacking.
834              
835             Currently you can call C only once.
836              
837             =cut
838              
839             sub unpack
840             {
841             ## as long as $archive is outside $self->{destdir}, we construct our destdir by
842             ## replacing $self->{input_dir} with $self->{destdir}.
843             ## This $self->{input_dir} must be created and kept constant at the earliest
844             ## possible call.
845             ## When the $archive is inside $self->{destdir}, we do not use $self->{input_dir},
846             ## we then use the current $in_dir as destdir.
847             ##
848             ## Whenever an archive path outside $self->{destdir} is found,
849             ## it is first passed through Cwd::fast_abs_path before any other processing occurs.
850             ##
851             my ($self, $archive, $destdir) = @_;
852             $destdir = $self->{destdir} unless defined $destdir;
853              
854             $destdir = $1 if $destdir =~ m{^(.*)$}s; # brute force untaint
855              
856             if (($self->{recursion_level}||0) > $RECURSION_LIMIT)
857             {
858             push @{$self->{error}}, "unpack('$archive','$destdir'): recursion limit $RECURSION_LIMIT";
859             ## this is only an emergency stop.
860             return 1;
861             }
862              
863             if ($archive !~ m{^/} or $archive !~ m{^\Q$self->{destdir}\E/})
864             {
865             # Cwd::fast_abs_path($archive) not only makes nice absolute paths, but it also expands
866             # file symlinks. This is a bad idea for two reasons:
867             # * when we allow {follow_file_symlinks} the link destination gets into the log file,
868             # rather than the (expected) link itself.
869             # * Also, this could easily trigger "path escaped" below .
870             ######
871             if ($self->{follow_file_symlinks} && $archive =~ m{^(.*)/(.*?)$})
872             {
873             # we solve both issues by doing this:
874             # chop off the filename; expand the path; re-add the filename.
875             my ($a_path, $a_file) = ($1,$2);
876             $a_path = Cwd::fast_abs_path($a_path) if -e $a_path;
877             $archive = $a_path . '/' . $a_file;
878             }
879             else
880             {
881             $archive = Cwd::fast_abs_path($archive) if -e $archive;
882             }
883             }
884              
885             my $start_time = time;
886             if ($self->{recursion_level}++ == 0)
887             {
888             print STDERR "unpack: starting...\n" if $self->{verbose} > 1;
889             ## State that needs to be reset when (re)starting goes in here.
890             #
891             # CAUTION: recursion_level decrements again, as we return from unpack()
892             # how do we assert, that this code only runs at the start,
893             # and not once again at the end?
894             $self->{inside_archives} = 0;
895             $self->{json} ||= JSON->new()->ascii(1); # used often, create it unconditionally here and once.
896             $self->{iput} = $archive;
897             $self->{progress_tstamp} = $start_time;
898             ($self->{input_dir}, $self->{input_file}) = ($1, $2) if $archive =~ m{^(.*)/([^/]*)$};
899              
900             if ($self->{log_type} eq 'plain')
901             {
902             # pass
903             }
904             else
905             {
906             # logfile prolog
907             my $prolog = {destdir=>$self->{destdir}, fu=>$VERSION, pid=>$$, input => $archive, start => scalar localtime};
908             $prolog->{params} = $self->{log_params} if keys %{$self->{log_params}};
909             my $s = $self->{json}->encode($prolog);
910             $s =~ s@}$@, "unpacked":{\n@;
911             $self->log($s);
912             }
913             }
914              
915             unless (-e $archive)
916             {
917             # contstucted $archive wrongly
918             # e.g. we have 'pax/data/a/' instead of 'pax/data/_fu_3CEuA/a/'
919             push @{$self->{error}}, "unpack('$archive'): not much file or directory; ";
920             return 1;
921             }
922              
923             unless ($self->{input_dir})
924             {
925             push @{$self->{error}}, "unpack('$archive'); internal error: no {input_dir}";
926             return 1;
927             }
928              
929             my ($in_dir, $in_file) = ('/', '');
930             ($in_dir, $in_file) = ($1, $2) if $archive =~ m{^(.*/)([^/]*)$};
931              
932             my $inside_destdir = 1;
933             my $subdir = $in_dir; # remainder after stripping $orig_archive_prefix / $self->{destdir}
934             unless ($subdir =~ s{^\Q$self->{destdir}\E/+}{})
935             {
936             $inside_destdir = 0;
937             die "$archive path escaped. Neither inside original $self->{input_dir} nor inside destdir='$self->{destdir}'\n"
938             unless $subdir =~ s{^\Q$self->{input_dir}\E/+}{};
939             }
940              
941             print STDERR "unpack: r=$self->{recursion_level} in_dir=$in_dir, in_file=$in_file, destdir=$destdir\n" if $self->{verbose} > 1;
942              
943             my @missing_unpacker;
944              
945             if ($self->{progress_tstamp} + 10 < $start_time)
946             {
947             printf "T: %d files ...\n", $self->{file_count}||0;
948             $self->{progress_tstamp} = $start_time;
949             }
950              
951             if (-d $archive)
952             {
953             $self->_chmod_add($archive, @{$self->{readable_dir_modes}});
954             if (opendir DIR, $archive)
955             {
956             my @f = sort grep { $_ ne '.' && $_ ne '..' } readdir DIR;
957             closedir DIR;
958             print STDERR "dir = @f\n" if $self->{verbose} > 1;
959             for my $f (@f)
960             {
961             if ($self->{exclude}{re} && $f =~ m{$self->{exclude}{re}})
962             {
963             $self->{skipped}{exclude}++;
964             }
965             my $new_in = "$archive/$f";
966             ## if $archive is $inside_destdir, then $archive is normally indentical to $destdir.
967             ## ($inside_destdir means inside $self->{destdir}, actually)
968             my $new_destdir = $destdir; $new_destdir .= "/$f" if -d $new_in;
969             my $symlink_to_skip = -l $new_in;
970             my $dangeous_symlink = $self->{inside_archives} ? 1 : 0;
971             if ($symlink_to_skip and ($self->{follow_file_symlinks} > $dangeous_symlink))
972             {
973             $symlink_to_skip = 0 if -f $new_in;
974             # directory and dead symlinks we always skip.
975             # directory symlinks could cause us to recurse out of the current tree.
976             }
977              
978             if ($symlink_to_skip)
979             {
980             # test -l first, as -f could be also true here...
981             print STDERR "symlink $new_in: skipped\n" if $self->{verbose} > 1;
982             $self->{skipped}{symlink}++;
983             }
984             elsif (-f $new_in or -d _)
985             {
986             $self->unpack($new_in, $new_destdir);
987             }
988             else
989             {
990             print STDERR "special file $new_in: skipped\n" if $self->{verbose} > 1;
991             $self->{skipped}{device_node}++;
992             }
993             $self->{progress_tstamp} = time;
994             }
995             }
996             else
997             {
998             push @{$self->{error}}, "unpack dir ($archive) failed: $!";
999             }
1000             }
1001             elsif (-f $archive)
1002             {
1003             if ($self->_not_excluded($subdir, $in_file) and
1004             !defined($self->{done}{$archive}))
1005             {
1006             $self->_chmod_add($archive, @{$self->{readable_file_modes}});
1007              
1008             my $m = $self->mime($archive);
1009             my ($h, $more) = $self->find_mime_helper($m);
1010             my $data = { mime => $m->[0] };
1011             if ($more)
1012             {
1013             $data->{found} = $more;
1014             push @missing_unpacker, @{$more->{missing}} if $more->{missing};
1015             }
1016              
1017             if ($m->[0] eq 'text/plain' or !$h)
1018             {
1019             # not really an archive.
1020             unless ($archive =~ m{^\Q$self->{destdir}\E/})
1021             {
1022             mkpath($destdir) unless $self->{no_op};
1023             my $destdir_in_file;
1024             $destdir_in_file = $1 if "$destdir/$in_file" =~ m{^(.*)$}s; # brute force untaint
1025              
1026             if (-e "$destdir_in_file")
1027             {
1028             print STDERR "unpack copy in: $destdir_in_file already exists, " if $self->{verbose};
1029             $destdir = File::Temp::tempdir($TMPDIR_TEMPL, DIR => $destdir);
1030             $destdir_in_file = $1 if "$destdir/$in_file" =~ m{^(.*)$}s; # brute force untaint
1031             print STDERR "using $destdir_in_file instead.\n" if $self->{verbose};
1032             }
1033             $data->{error} = "copy($archive): $!" unless File::Copy::copy($archive, $destdir_in_file);
1034             $self->logf($destdir_in_file => $data);
1035             }
1036             else
1037             {
1038             $self->logf($archive => $data);
1039             }
1040             $self->{file_count}++;
1041             }
1042             else
1043             {
1044             # really an archive.
1045             if ($self->{archive_name_as_dir})
1046             {
1047             print STDERR "archive_name_as_dir: expanding destdir $destdir\n" if $self->{verbose};
1048             $destdir = _unused_pathname($destdir, $in_file);
1049             print STDERR "archive_name_as_dir: to $destdir\n" if $self->{verbose};
1050             }
1051             mkpath($destdir) unless $self->{no_op};
1052             $self->{configdir} = $self->_prep_configdir() unless exists $self->{configdir};
1053              
1054             ## new_name is a suggestion for the mime_helper only.
1055             my $new_name = $in_file;
1056            
1057             # Either shorten the name from e.g. foo.txt.bz2 to foo.txt or append
1058             # something: foo.pdf to foo.pdf._;
1059             # Normally a suffix is appended by '.', but we also see '-' or '_' in real life.
1060             unless ($h->{suffix_re} and $new_name =~ s{[\._-]$h->{suffix_re}(?:\._\d*)?$}{}i)
1061             {
1062             # avoid unary notation of recursion couning. There may be a 256 char limit per
1063             # directory entry. Start counting in decimal, if two or more.
1064             # Hmm, the /e modifier is not mentioned in perlre, but it works. Is it deprecated??
1065             $new_name .= "._";
1066             $new_name =~ s{\._\._$}{\._2};
1067             $new_name =~ s{\._(\d+)\._$}{ "._".($1+1) }e;
1068             }
1069              
1070             ## if consumer of logf wants to do progress indication himself,
1071             ## then tell him what we do before we start. (Our timer tick code may be an alternative...)
1072             #
1073             # if ($archive =~ m{^\Q$self->{destdir}\E})
1074             # {
1075             # $self->logf($archive => { unpacking => $h->{fmt_p} });
1076             # }
1077            
1078             my ($unpacked, $diag) =
1079             $self->_run_mime_helper($h, $archive, $new_name, $destdir,
1080             $m->[0], $m->[2], $self->{configdir});
1081              
1082             # die Dumper "_run_mime_helper: $archive, $new_name, $destdir", readlink($unpacked), $unpacked;
1083              
1084             unless (ref $unpacked or -e $unpacked)
1085             {
1086             warn("archive=$archive, new_name=$new_name\n");
1087             die("assert -e '$unpacked'")
1088             }
1089              
1090             return 0 if $self->{no_op};
1091             if (ref $unpacked)
1092             {
1093             # a ref here means, something went wrong.
1094             $data->{failed} = $h->{fmt_p};
1095             $data->{error} = $unpacked->{error};
1096             $data->{stderr} = $unpacked->{stderr} if defined $unpacked->{stderr};
1097             $self->logf($archive => $data);
1098             $self->{file_count}++;
1099             $self->{helper_errors}++;
1100             }
1101             elsif (readlink($unpacked)||'' eq $archive)
1102             {
1103             # a symlink backwards means, there is nothing to unpack here. take it as is.
1104             unlink $unpacked;
1105             rmdir $destdir if $self->{archive_name_as_dir}; # now an empty dir.
1106             $data->{passed} = $h->{name};
1107             $data->{input} = $self->loggable_pathname($archive);
1108             $data->{cmd} = $h->{fmt_p};
1109             {
1110             local $Data::Dumper::Terse = 1;
1111             local $Data::Dumper::Indent = 0;
1112             $data->{diag} = Dumper $diag if $diag;
1113             }
1114              
1115             if ($archive =~ m{^\Q$self->{destdir}\E})
1116             {
1117             # if inside, we just flag it done and log it.
1118             $self->{done}{$archive} = $archive;
1119             $self->logf($archive => $data);
1120             }
1121             else
1122             {
1123             # if the archive itself was outside destdir,
1124             # we copy it in, flag it done there, and log it here.
1125             if (File::Copy::copy($archive, $unpacked))
1126             {
1127             $self->{done}{$archive} = $unpacked;
1128             $self->logf($unpacked => $data);
1129             }
1130             else
1131             {
1132             $data->{error} = "copy($archive, $unpacked): $!";
1133             $self->logf($archive => $data);
1134             }
1135             }
1136             $self->{file_count}++;
1137             }
1138             else
1139             {
1140             # normal case: mime helper placed all
1141             # in a directory (or file) called $unpacked
1142              
1143              
1144             if ($archive =~ m{^\Q$self->{destdir}\E})
1145             {
1146             # to delete it, we should know if it was created during unpack.
1147             }
1148             $self->{done}{$archive} = $unpacked;
1149             $data->{cmd} = $h->{fmt_p};
1150             $data->{unpacked} = $self->loggable_pathname($unpacked);
1151             $self->logf($archive => $data);
1152             $self->{file_count}++;
1153             $self->{inside_archives}++;
1154              
1155             my $newdestdir = $unpacked;
1156             $newdestdir =~ s{/+[^/]+}{} unless -d $newdestdir; # make sure it is a directory
1157             $newdestdir = $destdir unless $newdestdir =~ m{^\Q$self->{destdir}\E/}; # make sure it does not escape
1158             if ($self->{one_shot})
1159             {
1160             local $self->{mime_orcish};
1161             local $self->{mime_helper};
1162              
1163             $self->unpack($unpacked, $newdestdir);
1164             }
1165             else
1166             {
1167             $self->unpack($unpacked, $newdestdir);
1168             }
1169             $self->{progress_tstamp} = time;
1170             $self->{inside_archives}--;
1171             }
1172             }
1173             }
1174             }
1175             else
1176             {
1177             $self->logf($archive => { "skipped" => "special file"});
1178             $self->{file_count}++;
1179             }
1180              
1181             if (--$self->{recursion_level} == 0)
1182             {
1183             if ($self->{log_type} eq 'plain')
1184             {
1185             for my $m (@missing_unpacker)
1186             {
1187             $self->log("# missing unpacker: $m\n");
1188             }
1189             for my $s (sort keys %{$self->{skipped}})
1190             {
1191             $self->log("# skipped: $s: $self->{skipped}{$s}\n");
1192             }
1193             $self->log("# error: ".join('; ', @{$self->{error}})."\n") if $self->{error};
1194             }
1195             else
1196             {
1197             my $epilog = {end => scalar localtime, sec => time-$start_time };
1198             $epilog->{skipped} = $self->{skipped} if $self->{skipped};
1199             $epilog->{error}{msg} = $self->{error} if $self->{error}; # just in case some errors were non-fatal.
1200             $epilog->{error}{helper} = $self->{helper_errors} if $self->{helper_errors}; # counting
1201             $epilog->{missing_unpacker} = \@missing_unpacker if @missing_unpacker;
1202             my $s = $self->{json}->encode($epilog);
1203              
1204             $s =~ s@^{@\n},@;
1205             $self->log($s . "\n");
1206             }
1207              
1208             if ($self->{lfp} ne $self->{logfile})
1209             {
1210             close $self->{lfp} or carp "logfile write ($self->{logfile}) failed: $!\n";
1211             }
1212             delete $self->{lfp};
1213             delete $self->{lfp_printed};
1214             }
1215              
1216             # FIXME: should return nonzero if we had any unrecoverable errors.
1217             return $self->{error} ? 1 : 0;
1218             }
1219              
1220             # Try a few modes to add to the current permission bits.
1221             # The first mode that succeeds ends the list.
1222             sub _chmod_add
1223             {
1224             my ($self, $file, @modes) = @_;
1225             $file = $1 if $file =~ m{^(.*)$}m;
1226             my $perm = (stat $file)[2] & 07777;
1227             for my $m (@modes)
1228             {
1229             last if chmod($perm|$m, $file); # may or may not succeed. Harmless here.
1230             }
1231             }
1232              
1233             =head2 run
1234              
1235             $u->run([argv0, ...], @redir, ... { init => sub ..., in, out, err, watch, every, prog, ... })
1236              
1237             A general purpose fork-exec wrapper, based on IPC::Run. STDIN is closed, unless you specify
1238             an C<< in => >> as described in IPC::Run. STDERR and STDOUT are both printed to
1239             STDOUT, prefixed with 'E: ' and 'O: ' respectively, unless you specify C<< out => >>,
1240             C<< err => >>, or C<< out_err => >> ... for both.
1241              
1242             Using redirection operators in @redir takes precedence over the above in/out/err
1243             redirections. See also L. If you use the options in/out/err, you should
1244             restrict your redirection operators to the forms '<', '0<', '1>', '2>', or '>&' due
1245             to limitations in the precedence logic. Piping via '|' is properly recognized,
1246             but background execution '&' may confuse the precedence logic.
1247              
1248             This C method is completly independent of the rest of File::Unpack. It works both
1249             as a static function and as a method call.
1250             It is used internally by C, but is exported to be of use elsewhere.
1251              
1252             Init is run after construction of redirects. Calling chdir() in init thus has no
1253             effect on redirects with relative paths.
1254              
1255             Return value in scalar context is the first nonzero result code, if any. In list context
1256             all return values are returned.
1257             =cut
1258              
1259             sub run
1260             {
1261             shift if ref $_[0] ne 'ARRAY'; # toss $self object handle.
1262             my (@cmd) = @_;
1263             my $opt;
1264             $opt = pop @cmd if ref $cmd[-1] eq 'HASH';
1265              
1266             my $cmdname = $cmd[0][0]; $cmdname =~ s{^.*/}{};
1267              
1268             # run the command with
1269             # - STDIN closed, unless you specify an { in => ... }
1270             # - STDERR and STDOUT printed prefixed with 'E: ', 'O: ' to STDOUT,
1271             # unless you specify out =>, err =>, or out_err => ... for both.
1272             $opt->{in} ||= \undef;
1273             $opt->{out} ||= $opt->{out_err};
1274             $opt->{err} ||= $opt->{out_err};
1275             $opt->{out} ||= sub { print "O: ($cmdname) @_\n"; };
1276             $opt->{err} ||= sub { print "E: ($cmdname) @_\n"; };
1277              
1278             my $has_i_redir = 0;
1279             my $has_o_redir = 0;
1280             my $has_e_redir = 0;
1281              
1282             ## The ugly truth is, there might be multiple commands with pipes.
1283             ## We need to provide all of them with the proper redirects.
1284             ## A command that pipes somewhere else, has_o_redir outbound through the pipe.
1285             ## A command that is piped into, has_i_redir inbound from the pipe.
1286             my @run = ();
1287              
1288              
1289             for my $c (@cmd)
1290             {
1291             if (ref $c)
1292             {
1293             push @run, $c;
1294              
1295             # put init early, so that it is run, before any IO redirects access relative paths.
1296             push @run, init => $opt->{init} if $opt->{init};
1297             next; # don't look into argvs, but
1298             }
1299             # look only into redirection operators
1300             $has_i_redir++ if $c =~ m{^0?<};
1301             $has_o_redir++ if $c =~ m{^1?>};
1302             $has_e_redir++ if $c =~ m{^(?:2>|>&$)};
1303             if ($c eq '|')
1304             {
1305             push @run, '0<', $opt->{in} unless $has_i_redir;
1306             $has_i_redir = 'piped';
1307             push @run, "2>", $opt->{err} unless $has_e_redir;
1308             $has_e_redir = $has_o_redir = 0;
1309             }
1310             push @run, $c; # $1 if $c =~ m{^(.*)$}s; # brute force untaint
1311             }
1312              
1313             push @run, '0<', $opt->{in} unless $has_i_redir;
1314             push @run, "1>", $opt->{out} unless $has_o_redir;
1315             push @run, "2>", $opt->{err} unless $has_e_redir;
1316              
1317             # die Dumper \@run if $cmd[0][0] eq '/usr/bin/rpm2cpio';
1318              
1319             my $t;
1320             $t = IPC::Run::timer($opt->{every}-0.6) if $opt->{every};
1321             push @run, $t if $t;
1322              
1323             $run[0][0] = $1 if $run[0][0] =~ m{^(.*)$}s;
1324             push @run, debug => $opt->{debug} if $opt->{debug};
1325             my $h = eval { IPC::Run::start @run; };
1326             return wantarray ? (undef, $@) : undef unless $h;
1327              
1328             while ($h->pumpable)
1329             {
1330             # eval {} guards against 'process ended prematurely' errors.
1331             # This happens on very fast commands, despite pumpable().
1332             eval { $h->pump };
1333             if ($t && $t->is_expired)
1334             {
1335             $t->{has_fired}++;
1336             $opt->{prog}->($h, $opt);
1337             $t->start($opt->{every});
1338             }
1339             }
1340             $h->finish;
1341             $opt->{finished} = 1;
1342              
1343             ## call it once more, to get the 100% printout, or somthing else...
1344             $opt->{prog}->($h, $opt) if $t->{has_fired};
1345              
1346             return wantarray ? $h->full_results : $h->result;
1347             }
1348              
1349             =head2 fmt_run_shellcmd
1350              
1351             File::Unpack::fmt_run_shellcmd( $m->{argvv} )
1352              
1353             Static function to pretty print the return value $m of method find_mime_helper();
1354             It formats a command array used with run() as a properly escaped shell command string.
1355              
1356             =cut
1357              
1358             sub _my_shell_quote
1359             {
1360             my @a = @_;
1361             my $sub;
1362             if (@a and defined $a[0])
1363             {
1364             $sub = '\\&_locate_tar' if $a[0] eq \&_locate_tar;
1365             $sub = '\\&_locate_cpio_i' if $a[0] eq \&_locate_cpio_i;
1366             }
1367              
1368             if ($sub)
1369             {
1370             shift @a;
1371             return "$sub " . shell_quote(@a);
1372             }
1373             return shell_quote(@a);
1374             }
1375              
1376             sub fmt_run_shellcmd
1377             {
1378             my @a = @_;
1379             @a = @{$a[0]{argvv}} if ref $a[0] eq 'HASH';
1380             my @r = ();
1381             for my $a (@a)
1382             {
1383             push @r, ref($a) ? '('._my_shell_quote(@$a).')' : _my_shell_quote($a);
1384             }
1385             my $r = join ' ', @r;
1386             $r =~ s{^\((.*)\)$}{$1} unless $#a; # parenthesis around a single cmd are unneeded.
1387             return $r;
1388             }
1389              
1390             ## not a method, officially.
1391             #
1392             ## Chdir in and out of a jail is done here, as IPC::Run::run({init}->())
1393             ## has bad timing for our purposes.
1394             #
1395             ## fastjar extracts happily to ../../..
1396             ## this happens in cups-1.2.1/scripting/java/cups.jar
1397             #
1398             ## FIXME:
1399             # "/tmp/xxxx/cups-1.2.4-11.5.1.el5/cups-1.2.4/scripting/java/cups.jar":
1400             # {"cmd":"/usr/bin/unzip -P no_pw -q -o '%(src)s'",
1401             # "unpacked":"/tmp/xxxx/cups-1.2.4-11.5.1.el5/cups-1.2.4/_Knw_"}
1402             # Two issues:
1403             # a) _run_mime_helper in /tmp/xxxx/cups-1.2.4-11.5.1.el5/cups-1.2.4
1404             # should be /tmp/xxxx/cups-1.2.4-11.5.1.el5/cups-1.2.4/scripting/java
1405             # b) _Knw_ should never appear in the end result ...
1406             #
1407              
1408             sub _run_mime_helper
1409             {
1410             my ($self, $h, @argv) = @_;
1411              
1412             for my $i (0..$#argv)
1413             {
1414             $argv[$i] = $1 if $argv[$i] =~ m{^(.*)$}s; # brute force untaint
1415             }
1416              
1417             my $destdir = $argv[2];
1418             my $dot_dot_safeguard = $self->{dot_dot_safeguard}||0;
1419             $dot_dot_safeguard = 2 if $dot_dot_safeguard < 2;
1420              
1421             my $jail_base = '/dev/null';
1422             my $jail = $jail_base;
1423             unless ($self->{no_op})
1424             {
1425             mkpath($destdir);
1426             $jail_base = File::Temp::tempdir($TMPDIR_TEMPL, DIR => $destdir);
1427             $jail = $jail_base . ("/_" x $dot_dot_safeguard);
1428             mkpath($jail);
1429             }
1430              
1431             my $args =
1432             {
1433             src => $argv[0], # abs_path() - but not symlink resolved, so that the unpacker sees 'our' name
1434             destfile => $argv[1], # filename() - a suggested name, simply based on src, in case the unpacker needs it.
1435             destdir => $jail, # abs_path() - for now...
1436             mime => $argv[3],
1437             descr => $argv[4], # mime_descr
1438             configdir => $argv[5] # abs_path()
1439             };
1440             $args->{lsrc} = Cwd::realpath($args->{src}); # symlinks resolved; use this with a stupid unpacker like 'upx'
1441             die "src must be an abs_path." unless $args->{src} =~ m{^/};
1442            
1443             my @cmd;
1444             for my $a (@{$h->{argvv}})
1445             {
1446             if (ref $a)
1447             {
1448             my @c = ();
1449             for my $b (@$a)
1450             {
1451             push @c, _subst_args($b, $args);
1452             }
1453             push @cmd, [@c];
1454             }
1455             else
1456             {
1457             push @cmd, _subst_args($a, $args);
1458             }
1459             }
1460              
1461             if ($self->{no_op})
1462             {
1463             print fmt_run_shellcmd(@cmd) . "\n";
1464             return undef;
1465             }
1466              
1467             print STDERR "_run_mime_helper in $destdir: " . fmt_run_shellcmd(@cmd) . "\n" if $self->{verbose} > 1;
1468              
1469             my $cwd = getcwd() or carp "cannot fetch initial working directory, getcwd: $!";
1470             $cwd = $1 if $cwd =~ m{^(.*)$}s; # brute force untaint. Whereever you go, there you are.
1471             chdir $jail or die "chdir '$jail'";
1472             chmod 0, $jail_base if $self->{jail_chmod0};
1473             # Now have fully initialzed in the parent before forking.
1474             # This is needed, as all redirect operators are executed in the parent before forking.
1475             # init => sub { ... } is no longer needed. sigh, I really wanted to the init sub for the chdir.
1476             # But hey, mkpath() and rmtree() change the cwd so often, and restore it, so why shouldn't we?
1477              
1478              
1479             my $run_error = undef; # we capture the first error line for the logfile.
1480             my @r = $self->run(@cmd,
1481             {
1482             debug => ($self->{verbose} > 2) ? $self->{verbose} - 2 : 0,
1483             watch => $args->{src}, every => 5, fu_obj => $self, mime_helper => $h,
1484             err => sub { print "E: @_\n" if $self->{verbose}; $run_error = "@_" unless length $run_error },
1485             prog => sub
1486             {
1487             $_[1]{tick}++;
1488             my $name = $_[1]{watch}; $name =~ s{.*/}{};
1489             if ($_[1]{finished})
1490             {
1491             printf "T: %s (%s, done)\n", $name, _unit_bytes(-s $_[1]{watch},1)
1492             if $self->{verbose};
1493             }
1494             elsif (my $p = _children_fuser($_[1]{watch}, POSIX::getpid()))
1495             {
1496             _fuser_offset($p);
1497             # we may get muliple process with multiple filedescriptors.
1498             # select the one that moves fastest.
1499             my $largest_diff = -1;
1500             for my $pid (keys %$p)
1501             {
1502             for my $fd (keys %{$p->{$pid}{fd}})
1503             {
1504             my $diff = ($p->{$pid}{fd}{$fd}{pos}||0) - ($_[1]{fuser}{$pid}{fd}{$fd}{pos}||0);
1505             if ($diff > $largest_diff)
1506             {
1507             $largest_diff = $diff;
1508             $p->{fastest_fd} = $p->{$pid}{fd}{$fd};
1509             }
1510             }
1511             }
1512             # Stick with the one we had before, if none moves.
1513             $p->{fastest_fd} = $_[1]{fuser}{fastest_fd} if $largest_diff <= 0;
1514             $_[1]{fuser} = $p;
1515             my $off = $p->{fastest_fd}{pos}||0;
1516             my $tot = $p->{fastest_fd}{size}||(-s $_[1]{watch})||1;
1517             printf "T: %s (%s, %.1f%%)\n", $name, _unit_bytes($off,1), ($off*100)/$tot
1518             if $self->{verbose};
1519             }
1520             else
1521             {
1522             print "T: $name tick_tick $_[1]{tick}\n"
1523             if $self->{verbose};
1524             }
1525             },
1526             });
1527            
1528             # system("ls -la $jail_base/..; find $jail_base");
1529             # print STDERR Dumper \@r;
1530              
1531             chmod 0700, $jail_base if $self->{jail_chmod0};
1532             chdir $cwd or die "cannot chdir back to cwd: chdir($cwd): $!";
1533             my @nonzero = grep { $_ } @r;
1534              
1535             # TODO: handle failure
1536             # - remove all,
1537             # - retry with a fallback helper , if any.
1538             printf STDERR "Non-Zero return value: $nonzero[0]: %s\n", fmt_run_shellcmd(@cmd)
1539             if $nonzero[0] and $self->{verbose};
1540              
1541             # FIXME: fallback helper not implemented
1542             # t/data/pdftxt-a.txt is really plain/text altthough it begins with "PDF-1.4..." and
1543             # thus fools the mime-type tests.
1544             # should run other helpers, and finally 'strings -' as a trivial fallback.
1545             if ($nonzero[0])
1546             {
1547             rmtree($jail_base); # empty or has unusable contents now.
1548             ## FIXME: we should at least copy in the original file as is...
1549             return { error => "nonzero retval:\n " . Dumper(\@r), stderr => $run_error };
1550             }
1551              
1552             # loop through all _: if it only contains one item , replace it with this item,
1553             # be it a file or dir. This uses $jail_tmp, an unused pathname.
1554             my $jail_tmp = File::Temp::tempdir($TMPDIR_TEMPL, DIR => $destdir);
1555             rmdir $jail_tmp;
1556              
1557             # if only one file in $jail, move it up, and return
1558             # the filename instead of the dirname here.
1559             # (We don't search for $args->{destfile}, it is the unpackers choice to use it or not.)
1560             my $wanted_name;
1561             for (my $i = 0; $i <= $dot_dot_safeguard; $i++)
1562             {
1563             opendir DIR, $jail_base or last;
1564             my @found = grep { $_ ne '.' and $_ ne '..' } readdir DIR;
1565             closedir DIR;
1566             my $found0;
1567             $found0 = $1 if defined($found[0]) and $found[0] =~ m{^(.*)$}s; # brute force untaint
1568             print STDERR "dot_dot_safeguard=$dot_dot_safeguard, i=$i, found=$found0\n" if $self->{verbose} > 2;
1569             unless (@found)
1570             {
1571             rmdir $jail_base;
1572             my $name;
1573             $name = $1 if $args->{src} =~ m{/([^/]+)$};
1574             print STDERR "oops(i=$i): nothing unpacked?? Adding $name as is.\n" if $self->{verbose};
1575             return { error => "nothing unpacked" };
1576             }
1577             last if scalar @found != 1;
1578             $wanted_name = $found0 if $i == $dot_dot_safeguard;
1579             last unless -d $jail_base . "/" . $found0;
1580             # assert writable dirs. needed for ksh/ast-base.2012-08-01.tar.bz2/src/cmd/pax/data/ro.dat
1581             chmod(0755, $jail_base . "/" . $found0);
1582             rename $jail_base, $jail_tmp or die("4:$i rename($jail_base, $jail_tmp) failed: $!");
1583             rename $jail_tmp . "/" . $found0, $jail_base or die("5:$i rename($jail_tmp .'/'. $found0, $jail_base) failed: $!");
1584             rmdir $jail_tmp or last;
1585             }
1586              
1587             ## this message is broken.
1588             # print STDERR "Hmmm, unpacker did not use destname: $args->{destfile}\n" if $self->{verbose} and !defined $wanted_name;
1589              
1590             # say nothing, if $args->{destfile} is equal to or a prefix of $wanted_name.
1591             print STDERR "Hmmm, unpacker saw destname: $args->{destfile}, but used destname: $wanted_name\n"
1592             if $self->{verbose} > 1 and defined($wanted_name) and $wanted_name !~ m{^\Q$args->{destfile}};
1593              
1594             $wanted_name = $args->{destfile} unless defined $wanted_name;
1595             my $wanted_path;
1596             $wanted_path = _unused_pathname($destdir, $wanted_name) if defined $wanted_name;
1597             my $unpacked = $jail_base;
1598             if (defined($wanted_name) and !-e $wanted_path)
1599             {
1600             if (-d $jail_base)
1601             {
1602             ## find out, if the unpacker created exactly one file or one directory,
1603             ## in this case we can move one level further.
1604             opendir DIR, $jail_base;
1605             my @found = grep { $_ ne '.' and $_ ne '..' } readdir DIR;
1606             closedir DIR;
1607             my $found0;
1608             $found0 = $1 if defined($found[0]) and $found[0] =~ m{^(.*)$}s; # brute force untaint
1609              
1610             if ($#found == 0 and $found0 eq $wanted_name)
1611             {
1612             rename "$jail_base/$found0", $wanted_path or die "1 rename($jail_base/$found0, $wanted_path) failed: $!";
1613             rmdir $jail_base;
1614             }
1615             else
1616             {
1617             rename $jail_base, $wanted_path or die "2 rename($jail_base, $wanted_path) failed: $!";
1618             }
1619             }
1620             else
1621             {
1622             rename $jail_base, $wanted_path or die "3 rename($jail_base, $wanted_path) failed: $!";
1623             }
1624             $unpacked = $wanted_path;
1625             }
1626              
1627             # catch some patholigical cases.
1628             if (-f $unpacked and !-l $unpacked)
1629             {
1630             if (!-s $unpacked)
1631             {
1632             print STDERR "Ooops, only one empty file -> symlink back\n" if $self->{verbose};
1633             unlink $unpacked;
1634             symlink $args->{src}, $unpacked;
1635             }
1636             elsif (-s $unpacked eq (my $s = -s $args->{src}))
1637             {
1638             print STDERR "Hmm, same size ($s bytes) after unpacking???\n" if $self->{verbose};
1639             ## xz -dc -f behaves like cat, if called on an unknown file.
1640             ## Compare the files. If they are identical, stop this:
1641             if (File::Compare::cmp($args->{src}, $unpacked) == 0)
1642             {
1643             print STDERR "Oops, identical -> symlink back\n" if $self->{verbose};
1644             unlink $unpacked;
1645             symlink $args->{src}, $unpacked;
1646             }
1647             }
1648             }
1649              
1650             my $diag = undef;
1651             $diag->{stderr} = $run_error if defined $run_error;
1652             return ($unpacked, $diag);
1653             }
1654              
1655             sub _unused_pathname
1656             {
1657             my ($destdir, $wanted_name) = @_;
1658             my $wanted_path = $destdir . "/" . $wanted_name;
1659              
1660             if (-e $wanted_path)
1661             {
1662             ## try to come up with a very similar name, just different suffix.
1663             ## be compatible with path name shortening in unpack()
1664             my $test_path = $wanted_path . '._';
1665             for my $i ('', 1..999)
1666             {
1667             # All our mime detectors work on file contents, rather than on suffixes.
1668             # Thus messing with the suffix should be okay here.
1669             unless (-e $test_path.$i)
1670             {
1671             $wanted_path = $test_path.$i;
1672             last;
1673             }
1674             }
1675             }
1676             die "_unused_pathname failed: last attempt $wanted_path\n" if -e $wanted_path;
1677             return $wanted_path;
1678             }
1679              
1680              
1681             sub _children_fuser
1682             {
1683             my ($file, $ppid) = @_;
1684             $ppid ||= 1;
1685             $file = Cwd::abs_path($file);
1686              
1687             opendir DIR, "/proc" or die "opendir /proc failed: $!\n";
1688             my %p = map { $_ => {} } grep { /^\d+$/ } readdir DIR;
1689             closedir DIR;
1690              
1691             # get all procs, and their parent pids
1692             for my $p (keys %p)
1693             {
1694             if (open IN, "<", "/proc/$p/stat")
1695             {
1696             # don't care if open fails. the process may have exited.
1697             my $text = join '', ;
1698             close IN;
1699             if ($text =~ m{\((.*)\)\s+(\w)\s+(\d+)}s)
1700             {
1701             $p{$p}{cmd} = $1;
1702             $p{$p}{state} = $2;
1703             $p{$p}{ppid} = $3;
1704             }
1705             }
1706             }
1707              
1708             # Weed out those who are not in our family
1709             if ($ppid > 1)
1710             {
1711             for my $p (keys %p)
1712             {
1713             my $family = 0;
1714             my $pid = $p;
1715             while ($pid)
1716             {
1717             # Those that have ppid==1 may also belong to our family.
1718             # We never know.
1719             if ($pid == $ppid or $pid == 1)
1720             {
1721             $family = 1;
1722             last;
1723             }
1724             last unless $p{$pid};
1725             $pid = $p{$pid}{ppid};
1726             }
1727             delete $p{$p} unless $family;
1728             }
1729             }
1730              
1731             my %o; # matching open files are recorded here
1732              
1733             # see what files they have open
1734             for my $p (keys %p)
1735             {
1736             if (opendir DIR, "/proc/$p/fd")
1737             {
1738             my @l = grep { /^\d+$/ } readdir DIR;
1739             closedir DIR;
1740             for my $l (@l)
1741             {
1742             my $r = readlink("/proc/$p/fd/$l");
1743             next unless defined $r;
1744             # warn "$p, $l, $r\n";
1745             if ($r eq $file)
1746             {
1747             $o{$p}{cmd} ||= $p{$p}{cmd};
1748             $o{$p}{fd}{$l} = { file => $file };
1749             }
1750             }
1751             }
1752             }
1753             return \%o;
1754             }
1755              
1756             # see if we can read the file offset of a file descriptor, and the size of its file.
1757             sub _fuser_offset
1758             {
1759             my ($p) = @_;
1760             for my $pid (keys %$p)
1761             {
1762             for my $fd (keys %{$p->{$pid}{fd}})
1763             {
1764             if (open IN, "/proc/$pid/fdinfo/$fd")
1765             {
1766             while (defined (my $line = ))
1767             {
1768             chomp $line;
1769             $p->{$pid}{fd}{$fd}{$1} = $2 if $line =~ m{^(\w+):\s+(.*)\b};
1770             }
1771             }
1772             close IN;
1773             $p->{$pid}{fd}{$fd}{size} = -s $p->{$pid}{fd}{$fd}{file};
1774             }
1775             }
1776             }
1777              
1778              
1779             sub _prep_configdir
1780             {
1781             my ($self) = @_;
1782             my $dir = "/tmp/file_unpack_$$/";
1783             mkpath($dir);
1784             my $j = $self->{json}->allow_nonref();
1785              
1786             open my $SH, ">", "$dir/config.sh";
1787             open my $JS, ">", "$dir/config.js";
1788              
1789             print $JS "{\n";
1790              
1791             for my $group ('', 'minfree', 'exclude')
1792             {
1793             my $h_ref = ($group eq '') ? $self : $self->{$group};
1794             for my $k (sort keys %$h_ref)
1795             {
1796             my $val = $h_ref->{$k};
1797             next if $k eq 'recursion_level';
1798             next if ref $val; # we only take scalars.
1799             my $name = ($group eq '') ? $k : "${group}_$k";
1800             printf $SH "%s=%s\n", shell_quote(uc "fu_$name"), shell_quote($val);
1801             printf $JS "%s:%s,\n", $j->encode($name), $j->encode($val);
1802             }
1803             }
1804              
1805             print $SH "FU_VERSION=$VERSION\n";
1806             print $JS qq["fu_version":"$VERSION"\n}\n];
1807              
1808             close $SH;
1809             close $JS;
1810             return $dir;
1811             }
1812              
1813              
1814             =head2 mime_helper_dir mime_helper
1815              
1816             $u->mime_helper_dir($dir, ...)
1817             $u->mime_helper($mime_name, $suffix_regexp, \@argv, @redir, ...)
1818              
1819             Registers one or more directories where external MIME helper programs are found.
1820             Helpers plugins are shellscripts that server as specialized MIME type handlers for unpacking.
1821             A list of helpers comes builtin interfacing most well-known archivers. This list can be appended to using the mime_helper_dir() or mime_helper() methods.
1822             Multiple directories can be registered, They are searched in reverse order, i.e.
1823             last added takes precedence. Any external MIME helper takes precedence over built-in code.
1824              
1825             The suffix_regexp is used to derive the destination name from the source name.
1826             It is not used for selecting helpers.
1827              
1828             When collecting external helper scripts via C, there is no C. Instead,
1829             external helper scripts can explicitly create a toplevel directory with the desired name.
1830              
1831             Helpers are mapped to MIME types by their mime_name. The name can be constructed
1832             from the MIME type by replacing the '/' with a '=' character, and by using the
1833             word 'ANY' as a wildcard component. The '=' character is interpreted as an
1834             implicit '=ANY+' if needed.
1835              
1836             Examples:
1837              
1838             Mimetype helper names tried from top to bottom
1839             -----------------------------------------------------------------
1840             image/png image=png
1841             image=ANY
1842             image
1843             ANY=png
1844             ANY=ANY
1845             ANY
1846              
1847             application/vnd.oasis+zip application=vnd.oasis+zip
1848             application=ANY+zip
1849             application=ANYzip
1850             application=zip
1851             application=ANY
1852             ...
1853            
1854             A trailing '=ANY' is implicit, as shown by these examples.
1855             The rules for precedence are this:
1856              
1857             =over
1858              
1859             =item *
1860              
1861             Search in the latest directory is exhaused first, then the previously added directory is considered in turn,
1862             up to all directories have been traversed, or until a matching helper is found.
1863            
1864             =item *
1865              
1866             A matching name with wildcards has lower precedence than a matching name without.
1867              
1868             =item *
1869              
1870             A wildcard before the '=' sign lowers precedence more than one after it.
1871              
1872             =back
1873              
1874             The mapping takes place when C is called. Adding helper scripts to a directory
1875             afterwards has no effect. C does not do any implicit expansions. Call it
1876             multiple times with the same helper command and different names if needed.
1877             The default argument list is "%(src)s %(destfile)s %(destdir)s %(mime)s %(descr)s %(configdir)s" --
1878             this is applied, if no args are given and no redirections are given. See also C for more semantics and how a helper should behave.
1879              
1880             Both methods return an ARRAY-ref of HASHes describing all known (old and newly added) mime helpers.
1881              
1882             =cut
1883             my @def_mime_helper_fmt = qw(%(src)s %(destfile)s %(destdir)s %(mime)s %(descr)s %(configdir)s);
1884              
1885             sub _subst_args
1886             {
1887             my $f = Text::Sprintf::Named->new({fmt => $_[0]});
1888             return $f->format({args => $_[1]});
1889             }
1890              
1891             sub mime_helper
1892             {
1893             my ($self, $name, $suffix_re, @args) = @_;
1894             @args = ($name) unless @args;
1895             @args = ([@args]) unless ref $args[0];
1896             push @{$args[0]}, @def_mime_helper_fmt unless $#{$args[0]} or defined $args[1];
1897              
1898             # cut away the path prefix from name. And use / instead of = in the mime name.
1899             $name =~ s{(.*/)?(.*?)=(.*?)$}{$2=$3};
1900              
1901             unless ($name =~ m{[/=]})
1902             {
1903             print STDERR "mime_helper '$name' needs a '=' or '/'.\n" if $self->{verbose};
1904             return $self->{mime_helper};
1905             }
1906              
1907             my $pat = "^\Q$name\E\$";
1908             $pat =~ s{\\=}{/(?:x-|ANY\\+)?};
1909             $pat =~ s{\\%}{ANY}g;
1910             $pat =~ s{^\^ANY}{};
1911             $pat =~ s{ANY\$$}{};
1912             $pat =~ s{ANY}{\\b\[\^\/\]+\\b}g;
1913             unshift @{$self->{mime_helper}},
1914             {
1915             name => $name, pat => $pat, suffix_re => $suffix_re,
1916             fmt_p => fmt_run_shellcmd(@args), argvv => \@args
1917             };
1918              
1919             delete $self->{mime_orcish}; # to be rebuilt in find_mime_helper()
1920              
1921             return $self->{mime_helper};
1922             }
1923              
1924             =head2 list
1925              
1926             Returns an ARRAY of preformatted patterns and MIME helpers.
1927              
1928             Example:
1929              
1930             printf @$_ for $u->list();
1931              
1932             =cut
1933              
1934             sub list
1935             {
1936             my ($self) = @_;
1937              
1938             my $width = 10;
1939             for my $m (@{$self->{mime_helper}})
1940             {
1941             $width = length($m->{pat}) if length($m->{pat}) > $width;
1942             }
1943              
1944             my @r;
1945             for my $m (@{$self->{mime_helper}})
1946             {
1947             push @r, [ "%-${width}s %s\n", $m->{pat}, $m->{fmt_p} ];
1948             }
1949             return @r;
1950             }
1951              
1952             sub mime_helper_dir
1953             {
1954             my ($self, @dirs) = @_;
1955              
1956             for my $d (@dirs)
1957             {
1958             my %h;
1959             if (opendir DIR, $d)
1960             {
1961             %h = map { $_ => { a => "$d/$_" } } grep { -f "$d/$_" } readdir DIR;
1962             closedir DIR;
1963             }
1964             else
1965             {
1966             carp "Cannot opendir $d: $!, skipped\n";
1967             }
1968              
1969             # add =ANY suffix, if missing
1970             for my $h (keys %h)
1971             {
1972             if ($h !~ m{[/=]})
1973             {
1974             my $h2 = $h . "=ANY";
1975             $h{$h2} = { %{$h{$h}} } unless defined $h{$h2};
1976             }
1977             }
1978              
1979             # not needed, this is implicit in mime_helper()/$pat
1980             #
1981             # # add expansion of = to =ANY+, if missing
1982             # for my $h (keys %h)
1983             # {
1984             # next if $h =~ m{=ANY+};
1985             # my $h2 = $h; $h2 =~ s{=}{=ANY+};
1986             # $h{$h2} = $h{$h} unless defined $h{$h2};
1987             # }
1988              
1989             # calculate priorities
1990             for my $h (keys %h)
1991             {
1992             my $n = 1000000;
1993             my $p = 1000;
1994             while ($h =~ m{(ANY|=)}g)
1995             {
1996             if ($1 eq '=')
1997             {
1998             $n = 1000;
1999             }
2000             else
2001             {
2002             $p += $n;
2003             }
2004             }
2005             # longer length has prio over shorter length. Hmm, this is ineffective, isnt it?
2006             $h{$h}{p} = $p - length($h);
2007             }
2008              
2009             # Now push them, sorted by prio.
2010             # Smaller prio_number is better. Later addition is prefered.
2011             for my $h (sort { $h{$b}{p} <=> $h{$a}{p} } keys %h)
2012             {
2013             # do not ruin the original name by resolving symlinks and such.
2014             $self->mime_helper($h, undef, [Cwd::fast_abs_path($h{$h}{a})]);
2015             }
2016             }
2017             return $self->{mime_helper};
2018             }
2019              
2020             =head2 find_mime_helper
2021              
2022             $u->find_mime_helper($mimetype)
2023              
2024             Returns a MIME helper suitable for unpacking the given $mimetype.
2025             If called in list context, a second return value indicates which
2026             mime helpers would be suitable, but could not be found in the system.
2027              
2028             =cut
2029              
2030             sub find_mime_helper
2031             {
2032             my ($self, $mimetype) = @_;
2033             $mimetype = $mimetype->[0] if ref $mimetype eq 'ARRAY';
2034              
2035             return $self->{mime_orcish}{$mimetype}
2036             if defined $self->{mime_orcish}{$mimetype} and
2037             -f $self->{mime_orcish}{$mimetype}{argvv}[0][0];
2038            
2039             my $r = undef;
2040             for my $h (@{$self->{mime_helper}})
2041             {
2042             if ($mimetype =~ m{$h->{pat}})
2043             {
2044             $self->_finalize_argvv($h);
2045             unless (-f $h->{argvv}[0][0])
2046             {
2047             push @{$r->{missing}}, $h->{argvv}[0][0];
2048             next;
2049             }
2050             $self->{mime_orcish}{$mimetype} = $h;
2051             return wantarray ? ($h, $r) : $h;
2052             }
2053             }
2054             return wantarray ? (undef, $r) : undef;
2055             }
2056              
2057             #
2058             # _finalize_argvv() executes a sub in 3 places:
2059             # The argvv ptr itself can be a sub:
2060             # this should return an array, where the
2061             # first element is the command (as an array-ref) and subsequent elements are
2062             # redirects. See run() for details.
2063             # One of the argvv elements is a sub:
2064             # this should return the command as an array-ref, if it is argvv[0],
2065             # or return one or more redirects.
2066             # One element of argvv[0] is a sub:
2067             # this should return one or more command names, options, arguments,
2068             #
2069             # Tricky part of the implementation is the in-place array expansion while iterating.
2070             #
2071             sub _finalize_argvv
2072             {
2073             my ($self, $h) = @_;
2074              
2075             my $update_fmt_p = 0;
2076             if (ref $h->{argvv} eq 'CODE')
2077             {
2078             $h->{argvv} = [ $h->{argvv}->($self) ];
2079             $update_fmt_p++;
2080             }
2081              
2082             # If any part of LIST is an array, "foreach" will get very confused if you add or
2083             # remove elements within the loop body, for example with "splice". So don't do
2084             # that.
2085             # Sigh, we want do do exactly that, a sub may replace itself by any number of elements. Use booring C-style loop.
2086             my $last = $#{$h->{argvv}};
2087             for (my $idx = 0; $idx <= $last; $idx++)
2088             {
2089             if (ref $h->{argvv}[$idx] eq 'CODE')
2090             {
2091             my @r = $h->{argvv}[$idx]($self);
2092             splice @{$h->{argvv}}, $idx, 1, @r;
2093             $idx += $#r;
2094             $last +=$#r;
2095             $update_fmt_p++;
2096             }
2097             }
2098             $last = $#{$h->{argvv}};
2099             for (my $idx = 0; $idx <= $last; $idx++)
2100             {
2101             next unless ref $h->{argvv}[$idx] eq 'ARRAY';
2102             my $last1 = $#{$h->{argvv}[$idx]};
2103             for (my $idx1 = 0; $idx1 <= $last1; $idx1++)
2104             {
2105             if (ref $h->{argvv}[$idx][$idx1] eq 'CODE')
2106             {
2107             my @r = $h->{argvv}[$idx][$idx1]->($self);
2108             splice @{$h->{argvv}[$idx]}, $idx1, 1, @r;
2109             $idx1 += $#r;
2110             $last1 +=$#r;
2111             $update_fmt_p++;
2112             }
2113             }
2114             }
2115              
2116             $h->{fmt_p} = fmt_run_shellcmd($h) if $update_fmt_p;
2117             }
2118              
2119             =head2 minfree
2120              
2121             $u->minfree(factor => 10, bytes => '100M', percent => '3%', warning => sub { .. })
2122              
2123             THESE TESTS ARE TO BE IMPLEMENTED.
2124              
2125             Guard the filesystem (destdir) against becoming full during C.
2126             Before unpacking each source archive, the free space is measured and compared against three conditions:
2127              
2128             =over
2129              
2130             =item *
2131              
2132             The archive size multiplied with the given factor must fit into the filesystem.
2133              
2134             =item *
2135              
2136             The given number of bytes (in optional K, M, G, or T units) must be free.
2137              
2138             =item *
2139              
2140             The filesystem must have at least the given free percentage. The '%' character is optional.
2141            
2142             =back
2143              
2144             The warning method is called if any of the above conditions fail. Its signature is:
2145             &warning->($pathname, $full_percentage, $free_bytes, $free_inodes);
2146             It is expected to print an appropriate warning message, and delay a few seconds.
2147             It should return 0 to cause a retry. It should return nonzero to continue unpacking.
2148             The default warning method prints a message to STDERR, waits 30 seconds, and returns 0.
2149              
2150             The filesystem may still become full and unpacking may fail, if e.g. factor was chosen lower than
2151             the average compression ratio of the archives.
2152              
2153             =cut
2154              
2155             sub _bytes_unit
2156             {
2157             my ($text) = @_;
2158             return int($1*1024) if $text =~ m{([\d\.]+)k}i;
2159             return int($1*1024*1024) if $text =~ m{([\d\.]+)m}i;
2160             return int($1*1024*1024*1024) if $text =~ m{([\d\.]+)g}i;
2161             return int($1*1024*1024*1024*1024) if $text =~ m{([\d\.]+)t}i;
2162             return int($text);
2163             }
2164              
2165             sub _unit_bytes
2166             {
2167             my ($number, $dec_places) = @_;
2168             $dec_places = 2 unless defined $dec_places;
2169             my $div = 1;
2170             my $unit = '';
2171             my $neg = '';
2172             if ($number < 0)
2173             {
2174             $neg = '-'; $number = -$number;
2175             }
2176             if ($number > $div * 1024)
2177             {
2178             $div *= 1024; $unit = 'k';
2179             if ($number > $div * 1024)
2180             {
2181             $div *= 1024; $unit = 'm';
2182             if ($number > $div * 1024)
2183             {
2184             $div *= 1024; $unit = 'g';
2185             if ($number > $div * 1024)
2186             {
2187             $div *= 1024; $unit = 't';
2188             }
2189             }
2190             }
2191             }
2192             return sprintf "%s%.*f%s", $neg, $dec_places, ($number / $div), $unit;
2193             }
2194              
2195             # see fs.pm/check_fs_health()
2196              
2197             sub minfree
2198             {
2199             my $self = shift;
2200             my %opt = @_;
2201              
2202             for my $i (qw(factor bytes percent))
2203             {
2204             $self->{minfree}{$i} = $opt{$i} if defined $opt{$i};
2205             $self->{minfree}{$i} ||= 0;
2206             }
2207             $self->{minfree}{bytes} = _bytes_unit($self->{minfree}{bytes});
2208             $self->{minfree}{percent} =~ s{%$}{};
2209             $self->{fs_warn} = $opt{warning} if ref $opt{warning};
2210             }
2211              
2212             =head2 mime
2213              
2214             $u->mime($filename)
2215              
2216             $u->mime(file => $filename)
2217              
2218             $u->mime(buf => "#!/bin ...", file => "what-was-read")
2219              
2220             $u->mime(fd => \*STDIN, file => "what-was-opened")
2221              
2222             Determines the MIME type (and optionally additional information) of a file.
2223             The file can be specified by filename, by a provided buffer or an opened file descriptor.
2224             For the latter two cases, specifying a filename is optional, and used only for diagnostics.
2225              
2226             C uses libmagic by Christos Zoulas exposed via File::LibMagic and also uses
2227             the shared-mime-info database from freedesktop.org exposed via
2228             File::MimeInfo::Magic, if available. Either one is sufficient, but having both
2229             is better. LibMagic sometimes says 'text/x-pascal', although we have a F<.desktop>
2230             file, or says 'text/plain', but has contradicting details in its description.
2231              
2232             C is consulted where the libmagic output is dubious. E.g. when
2233             the desciption says something interesting like 'Debian binary package (format 2.0)' but the
2234             mimetype says 'application/octet-stream'. The combination of both libraries gives us
2235             excellent reliability in the critical field of MIME type recognition.
2236              
2237             This implementation also features multi-level MIME type recognition for efficient unpacking.
2238             When e.g. unpacking a large bzipped tar archive, this saves us from creating a
2239             huge temporary tar-file which C would extract in a second step. The multi-level recognition
2240             returns 'application/x-tar+bzip2' in this case, and allows for a MIME helper
2241             to e.g. pipe the bzip2 contents into tar (which is exactly what 'tar jxvf'
2242             does, making a very simple and efficient MIME helper).
2243              
2244             C returns a 3 or 4 element arrayref with mimetype, charset, description, diff;
2245             where diff is only present when the libfile and shared-mime-info methods disagree.
2246              
2247             In case of 'text/plain', an additional rule based on file name suffix is used to allow
2248             recognition of well known plain text pack formats.
2249             We return 'text/x-suffix-XX+plain', where XX is one of the recognized suffixes
2250             (in all lower case and without the dot). E.g. a plain mmencoded file has no
2251             header and looks like 'plain/text' to all the known magic libraries. We
2252             recognize the suffixes .mm, .b64, and .base64 for this (case insignificant).
2253             A similar rule exitst for 'application/octect-stream'. It may trigger e.g. for
2254             LZMA compressed files which fail to provide a magic number.
2255              
2256             Examples:
2257            
2258             [ 'text/x-perl', 'us-ascii', 'a /usr/bin/perl -w script text']
2259              
2260             [ 'text/x-mpegurl', 'utf-8', 'M3U playlist text',
2261             [ 'text/plain', 'application/x-mpegurl']]
2262              
2263             [ 'application/x-tar+bzip2, 'binary',
2264             "bzip2 compressed data, block size = 900k\nPOSIX tar archive (GNU)", ...]
2265              
2266             =cut
2267              
2268             sub mime
2269             {
2270             my ($self, @in) = @_;
2271              
2272             my %in;
2273             %in = %{$in[0]} if !$#in and ref $in[0] eq 'HASH';
2274             unshift @in, 'file' if !$#in and !ref $in[0];
2275             %in = @in if $#in > 0;
2276              
2277             my $flm = $self->{flm} ||= File::LibMagic->new();
2278              
2279             unless (defined $in{buf})
2280             {
2281             my $fd = $in{fd};
2282             unless ($fd)
2283             {
2284             open $fd, "<", $in{file} or
2285             return [ 'x-system/x-error', undef, "cannot open '$in{file}': $!" ];
2286             }
2287              
2288             my $f = $in{file}||'-';
2289             $in{buf} = '';
2290             my $pos = tell $fd;
2291             ##bzip2 below needs a long buffer, or it returns 0.
2292             my $len = read $fd, $in{buf}, $UNCOMP_BUFSZ;
2293             return [ 'x-system/x-error', undef, "read '$f' failed: $!" ] unless defined $len;
2294             return [ 'x-system/x-error', undef, "read '$f' failed: $len: $!" ] if $len < 0;
2295             return [ 'text/x-empty', undef, 'empty' ] if $len == 0;
2296             seek $fd, $pos, 0;
2297              
2298             close $fd unless $in{fd};
2299             }
2300              
2301              
2302             ## flm can say 'cannot open \'IP\' (No such file or directory)'
2303             ## flm can say 'CDF V2 Document, corrupt: Can\'t read SAT' (application/vnd.ms-excel)
2304             my $mime1 = $flm->checktype_contents($in{buf});
2305             if ($mime1 =~ m{, corrupt: } or $mime1 =~ m{^application/octet-stream\b})
2306             {
2307             # application/x-iso9660-image is reported as application/octet-stream if the buffer is short.
2308             # iso images usually start with 0x8000 bytes of all '\0'.
2309             print STDERR "mime: readahead buffer $UNCOMP_BUFSZ too short\n" if $self->{verbose} > 2;
2310             if (defined $in{file} and -f $in{file})
2311             {
2312             print STDERR "mime: reopening $in{file}\n" if $self->{verbose} > 1;
2313             $mime1 = $flm->checktype_filename($in{file});
2314             }
2315             }
2316             print STDERR "flm->checktype_contents: $mime1\n" if $self->{verbose} > 1;
2317             $in{file} = '-' unless defined $in{file};
2318            
2319             return [ 'x-system/x-error', undef, $mime1 ] if $mime1 =~ m{^cannot open};
2320              
2321             # in SLES11 we get 'text/plain charset=utf-8' without semicolon.
2322             my $enc; ($mime1, $enc) = ($1,$2) if $mime1 =~ m{^(.*?);\s*(.*)$} or
2323             $mime1 =~ m{^(.*?)\s+(.*)$};
2324             $enc =~ s{^charset=}{} if defined $enc;
2325             my @r = ($mime1, $enc, $flm->describe_contents($in{buf}) );
2326             my $mime2;
2327              
2328            
2329             if ($mime1 =~ m{^application/xml})
2330             {
2331             # This is horrible from a greedy text cruncher perspective:
2332             # although xml is a plain text syntax, it is reported by flm to be
2333             # outside text/*
2334             $r[0] = "text/x-application-xml";
2335             }
2336              
2337             if ($mime1 =~ m{^text/x-(?:pascal|fortran)$})
2338             {
2339             # xterm.desktop
2340             # ['text/x-pascal; charset=utf-8','UTF-8 Unicode Pascal program text']
2341             # 'application/x-desktop'
2342             #
2343             # Times-Roman.afm
2344             # ['text/x-fortran; charset=us-ascii','ASCII font metrics']
2345             # 'application/x-font-afm'
2346             #
2347             # debian/rules
2348             # ['text/x-pascal; charset=us-ascii','a /usr/bin/make -f script text']
2349             # 'text/x-makefile'
2350             if ($mime2 ||= eval { open my $fd,'<',\$in{buf}; File::MimeInfo::Magic::magic($fd); })
2351             {
2352             $r[0] = "text/$1" if $mime2 =~ m{/(\S+)};
2353             }
2354             }
2355             elsif (($mime1 eq 'text/plain' and $r[2] =~ m{(?:PostScript|font)}i)
2356             or ($mime1 eq 'application/postscript'))
2357             {
2358             # 11.3 says:
2359             # IPA.pfa
2360             # ['text/plain; charset=us-ascii','PostScript Type 1 font text (OmegaSerifIPA 001.000)']
2361             # sles11 says:
2362             # IPA.pfa
2363             # ['application/postscript', undef, 'PostScript document text']
2364             #
2365             # mime2 = 'application/x-font-type1'
2366             # $mime2 = eval { File::MimeInfo::Magic::mimetype($in{file}); };
2367             $mime2 ||= eval { open my $fd,'<',\$in{buf}; File::MimeInfo::Magic::magic($fd); };
2368             if ($mime2 and $mime2 =~ m{^(.*)/(.*)$})
2369             {
2370             my ($a,$b) = ($1,$2);
2371             $a = 'text' if $r[2] =~ m{\btext\b}i;
2372             $r[0] = "$a/$b";
2373             }
2374             }
2375              
2376             if ($r[0] eq 'text/plain' or
2377             $r[0] eq 'application/octet-stream')
2378             {
2379             # hmm, are we sure? No, if the description contradicts:
2380             #
2381             $r[0] = "text/x-uuencode" if $r[2] eq 'uuencoded or xxencoded text';
2382              
2383             # bin/floor
2384             # ['text/x-pascal; charset=us-ascii','a /usr/bin/tclsh script text']
2385             # 'text/plain'
2386             $r[0] = "text/x-$2" if $r[2] =~ m{^a (\S*/)?([^/\s]+) .*script text$}i;
2387             if ($r[2] =~ m{\bimage\b})
2388             {
2389             # ./opengl/test.tga
2390             # ['application/octet-stream; charset=binary','Targa image data - RGB 128 x 128']
2391             # 'image/x-tga'
2392             $mime2 ||= eval { open my $fd,'<',\$in{buf}; File::MimeInfo::Magic::magic($fd); };
2393             $r[0] = $mime2 if $mime2 and $mime2 =~ m{^image/};
2394             }
2395             }
2396              
2397             if ($r[0] eq 'application/octet-stream')
2398             {
2399             # it can't get much worse, can it?
2400             ##
2401             # dotdot.tar.lzma
2402             # {'File::MimeInfo::Magic' => 'application/x-lzma-compressed-tar'} -- no, that was suffix based!
2403             # {'File::LibMagic' => ['application/octet-stream; charset=binary','data']}
2404             $mime2 ||= eval { open my $fd,'<',\$in{buf}; File::MimeInfo::Magic::magic($fd); };
2405             #
2406             # File::LibMagic misreads monotone-0.99.1/monotone.info-1 as app/bin
2407             # File::MimeInfo::Magic::magic() returns undef for that one.
2408             # But perl itself does not agree:
2409             $mime2 ||= 'application/x-text-mixed' if -T $in{file};
2410              
2411             $r[0] = $mime2 if $mime2;
2412             }
2413              
2414             if ($r[0] eq 'application/octet-stream')
2415             {
2416             if ($r[2] =~ m{\bcpio\s+archive\b}i)
2417             {
2418             # Mac pax files are gzipped cpio: 'ASCII cpio archive (pre-SVR4 or odc)'
2419             $r[0] = 'application/x-cpio';
2420             }
2421             }
2422              
2423             my $uncomp_buf = '';
2424              
2425             if ($r[0] eq 'application/octet-stream')
2426             {
2427             ## lzma is an extremly bad format. It has no magic.
2428             #
2429             # WARNING from Compress::unLZMA
2430             # "This version only implements in-memory decompression (patches are welcomed).
2431             # There is no way to recognize a valid LZMA encoded file with the SDK.
2432             # So, in some cases, you can crash your script if you try to uncompress a
2433             # non valid LZMA encoded file."
2434             # Does this also apply to us?
2435             #
2436             # -- hmm, maybe we better leave it at calling lzcat.
2437             # Trade in "always a bit expensive" versus "sometimes crashing"...
2438             #
2439             # my $lztest = `sh -c "/usr/bin/lzcat < $in{file} | head -c 1k > /dev/null" 2>&1`;
2440             # # -> /usr/bin/lzcat: (stdin): File format not recognized
2441             # if ($lztest !~ m{(not recognized|error)}i)
2442             # {
2443             # $r[0] = 'application/x-lzma';
2444             # }
2445              
2446             if (10 < length $in{buf})
2447             {
2448             no strict 'subs'; # Compress::Raw::Lzma::AloneDecoder, LZMA_OK, LZMA_STREAM_END
2449              
2450             my $saved_input = $in{buf};
2451             my ($lz, $stat) = eval { Compress::Raw::Lzma::AloneDecoder->new(-Bufsize => $UNCOMP_BUFSZ, -LimitOutput => 1); };
2452             if ($lz)
2453             {
2454             $stat = $lz->code($in{buf}, $uncomp_buf);
2455             if (($stat == LZMA_OK or $stat == LZMA_STREAM_END)
2456             and
2457             (length($uncomp_buf) > length($saved_input)))
2458             {
2459             $r[0] = "application/x-lzma";
2460             $r[2] = "LZMA compressed data, no magic";
2461             }
2462             # This decompressor consumes the input.
2463             $in{buf} = $saved_input;
2464             }
2465             }
2466             }
2467             # printf STDERR "in-buf = %d bytes\n", length($in{buf});
2468              
2469             if ($r[0] =~ m{^application/(?:x-)?gzip$})
2470             {
2471             my ($gz, $stat) = eval { new Compress::Raw::Zlib::Inflate( -WindowBits => WANT_GZIP() ); };
2472             if ($gz)
2473             {
2474             my $stat = $gz->inflate($in{buf}, $uncomp_buf);
2475             # printf STDERR "stat=%s, uncomp=%d bytes \n", $stat, length($uncomp_buf);
2476             }
2477             }
2478              
2479             ## bzip2 is not nice for stacked mime checking.
2480             ## It needs a huge input buffer that we do not normally provide.
2481             ## We only support it at the top of a stack, where we acquire enough additional
2482             ## input until bzip2 is happy.
2483             if ($r[0] =~ m{^application/(?:x-)?bzip2$} && !$in{recursion})
2484             {
2485             my $limitOutput = 1;
2486             my ($bz, $stat) = eval { new Compress::Raw::Bunzip2 0, 0, 0, 0, $limitOutput; };
2487             if ($bz)
2488             {
2489             ## this only works if this is a first level call.
2490             open my $IN, "<", $in{file} unless $in{file} eq '-';
2491             seek $IN, length($in{buf}), 0;
2492             while (!length $uncomp_buf)
2493             {
2494             my $stat = $bz->bzinflate($in{buf}, $uncomp_buf);
2495             # $bz->bzflush($uncomp_buf); # wishful thinking....
2496             last if length($in{buf}); # did not consume, strange.
2497             last if length $stat; # something wrong, or file ends.
2498             last unless read $IN, $in{buf}, 10*1024, length($in{buf}); # try to get more data
2499             }
2500             my $slurped = tell $IN; # likely to get ca. 800k yacc!
2501             close $IN;
2502             # use Data::Dumper; warn Dumper $stat, length($in{buf}), length($uncomp_buf), "slurped=$slurped";
2503             }
2504             }
2505              
2506             ## try to get at the second level mime type, for some well known linear compressors.
2507             while (length $uncomp_buf && $r[0] =~ m{^application/(x-)?([+\w]+)$})
2508             {
2509             my $compname = $2;
2510             my $next_uncomp_buf = '';
2511              
2512             # use Data::Dumper; printf STDERR "calling mime with buf=%d bytes, compname=$compname\n", length($uncomp_buf);
2513              
2514             #########
2515             ## FIXME: adding +$compname to the filename prevents reopening in mime, if needed.
2516             ## Why did I do this in the first place?
2517             # my $m2 = $self->mime(buf => $uncomp_buf, file => "$in{file}+$compname", uncomp => \$next_uncomp_buf, recursion => 1);
2518             #########
2519              
2520             my $m2 = $self->mime(buf => $uncomp_buf, file => $in{file}, uncomp => \$next_uncomp_buf, recursion => 1);
2521             my ($a,$xminus,$b) = ($m2->[0] =~ m{^(.*)/(x-)?(.*)$});
2522             if ($a eq 'application')
2523             {
2524             $r[0] = "application/x-$b+$compname"
2525             }
2526             else
2527             {
2528             $r[0] = "application/x-$a-$b+$compname"
2529             }
2530             $r[2] .= "\n" . $m2->[2];
2531             $uncomp_buf = $next_uncomp_buf;
2532             # print Dumper "new: ", \@r, $m2, $compname, length($uncomp_buf);
2533             }
2534              
2535             # use Data::Dumper;
2536             # die Dumper \@r, "--------------------";
2537              
2538             if ($r[0] eq 'application/unknown+zip' and $r[2] =~ m{\btext\b}i)
2539             {
2540             # empty.odt
2541             # ['application/unknown+zip; charset=binary','Zip archive data, at least v2.0 to extract, mime type application/vnd OpenDocument Text']
2542             # application/vnd.oasis.opendocument.text
2543             if ($mime2 ||= eval { open my $fd,'<',\$in{buf}; File::MimeInfo::Magic::magic($fd); })
2544             {
2545             $mime2 .= '+zip' unless $mime2 =~ m{\+zip}i;
2546             $r[0] = $mime2 if $mime2 =~ m{^application/};
2547             }
2548             }
2549             $r[0] .= '+zip' if $r[0] =~ m{^application/vnd\.oasis\.opendocument\.text$};
2550              
2551             if ($r[0] eq 'text/plain' and $in{file} =~ m{\.(mm|b64|base64)$}i)
2552             {
2553             my $suf = lc $1;
2554             $r[0] = "text/x-suffix-$suf+plain";
2555             }
2556              
2557             if ($r[0] eq 'application/octet-stream' and $in{file} =~ m{\.(lzma|zx|lz)$}i)
2558             {
2559             my $suf = lc $1;
2560             $r[0] = "application/x-suffix-$suf+octet-stream";
2561             }
2562              
2563             if ($r[0] =~ m{^application/x-(ms-dos-|)executable$})
2564             {
2565             if (-x '/usr/bin/upx')
2566             {
2567             # upx refuses to read symlinks. Work around this.
2568             my $in_file = $in{file};
2569             $in_file = readlink($in{file}) if -l $in{file};
2570             $r[0] .= '+upx' unless run(['/usr/bin/upx', '-q', '-q', '-t', $in_file]);
2571             }
2572             }
2573              
2574             ${$in{uncomp}} = $uncomp_buf if ref $in{uncomp} eq 'SCALAR';
2575             $r[3] = [ $mime1, $mime2 ] if $mime1 ne $r[0] or ($mime2 and $mime2 ne $mime1);
2576              
2577             return \@r;
2578             }
2579              
2580             =head1 AUTHOR
2581              
2582             Juergen Weigert, C<< >>
2583              
2584             =head1 BUGS
2585              
2586             The implementation of C is an ugly hack. We suffer from the existence of
2587             multiple file magic databases, and multiple conflicting implementations. With
2588             Perl we have at least 5 modules for this; here we use two.
2589              
2590             The builtin list of MIME helpers is incomplete. Please submit your handler code.
2591              
2592             Please report any bugs or feature requests to C, or through
2593             the web interface at L. I will be notified, and then you'll
2594             automatically be notified of progress on your bug as I make changes.
2595              
2596              
2597             =head1 RELATED MODULES
2598              
2599             While designing File::Unpack, a range of other perl modules were examined. Many modules provide valuable service to File::Unpack and became dependencies or are recommended.
2600             Others exposed drawbacks during closer examination and may find some of their
2601             wheels re-invented here.
2602              
2603             =head2 Used Modules
2604              
2605             =over
2606              
2607             =item File::LibMagic
2608              
2609             This is the prefered mimetype engine. It disregards the suffix, recognizes more
2610             types than any of the alternatives, and uses exactly the same engine as
2611             /usr/bin/file in openSUSE systems. It also returns charset and description
2612             information. We crossreference the description with the mimetype to detect
2613             weaknesses, and consult File::MimeInfo::Magic and some own logic, for e.g.
2614             detecting LZMA compression which fails to provide any recognizable magic.
2615             Required if you use C; otherwise not a hard requirement.
2616              
2617             =item File::MimeInfo::Magic
2618              
2619             Uses both magic information and file suffixes to determine the mimetype. Its
2620             magic() function is used in a few cases, where File::LibMagic fails. E.g. as
2621             of June 2010, libmagic does not recognize 'image/x-targa'.
2622             File::MimeInfo::Magic may be slower, but it features the shared-mime-info
2623             database from freedesktop.org . Recommended if you use C.
2624              
2625             =item String::ShellQuote
2626              
2627             Used to call external MIME helpers. Required.
2628              
2629             =item BSD::Resource
2630              
2631             Used to reliably restrict the maximum file size. Recommended.
2632              
2633             =item File::Path
2634              
2635             mkpath(). Required.
2636              
2637             =item Cwd
2638              
2639             fast_abs_path(). Required.
2640              
2641             =item JSON
2642              
2643             Used for formatting the logfile. Required.
2644              
2645             =back
2646              
2647             =head2 Modules Not Used
2648              
2649             =over
2650              
2651             =item Archive::Extract
2652              
2653             Archive::Extract tries first to determine what type of archive you are passing
2654             it, by inspecting its suffix. 'Maybe this module should use something like
2655             "File::Type" to determine the type, rather than blindly trust the suffix'.
2656             [quoted from perldoc]
2657              
2658             Set $Archive::Extract::PREFER_BIN to 1, which will prefer the use of command
2659             line programs and won't consume so much memory. Default: use "Archive::Tar".
2660              
2661             =item Archive::Zip
2662              
2663             If you are just going to be extracting zips (and/or other archives) you are
2664             recommended to look at using Archive::Extract . [quoted from perldoc]
2665             It is pure perl, so it's a lot slower then your '/usr/bin/zip'.
2666              
2667             =item Archive::Tar
2668              
2669             It is pure Perl, so it's a lot slower then your "/bin/tar".
2670             It is heavy on memory, all will be read into memory. [quoted from perldoc]
2671              
2672             =item File::MMagic, File::MMagic::XS, File::Type
2673              
2674             Compared to File::LibMagic and File::MimeInfo::Magic, these three are inferior.
2675             They often say 'text/plain' or 'application/octet-stream' where the latter two report
2676             useful mimetypes.
2677              
2678             =back
2679              
2680             =head1 SUPPORT
2681              
2682             You can find documentation for this module with the perldoc command.
2683              
2684             perldoc File::Unpack
2685              
2686              
2687             You can also look for information at:
2688              
2689             =over 4
2690              
2691             =item * RT: CPAN's request tracker
2692              
2693             L
2694              
2695             =item * AnnoCPAN: Annotated CPAN documentation
2696              
2697             L
2698              
2699             =item * CPAN Ratings
2700              
2701             L
2702              
2703             =item * Search CPAN
2704              
2705             L
2706              
2707             =back
2708              
2709             =head1 SOURCE REPOSITORY
2710              
2711             L
2712              
2713             L
2714              
2715             git clone L
2716              
2717              
2718             =head1 ACKNOWLEDGEMENTS
2719              
2720             MIME type recognition relies heavily on libmagic by Christos Zoulas. I had long
2721             hesitated implementing File::Unpack, but set to work, when I dicovered
2722             that File::LibMagic brings your library to perl. Thanks Christos. And thanks
2723             for tcsh too.
2724              
2725             =head1 LICENSE AND COPYRIGHT
2726              
2727             Copyright 2010,2011,2012,2013 Juergen Weigert.
2728              
2729             This program is free software; you can redistribute it and/or modify it
2730             under the terms of either: the GNU General Public License as published
2731             by the Free Software Foundation; or the Artistic License.
2732              
2733             See http://dev.perl.org/licenses/ for more information.
2734              
2735              
2736             =cut
2737              
2738             1; # End of File::Unpack