File Coverage

blib/lib/Treex/PML/IO.pm
Criterion Covered Total %
statement 190 431 44.0
branch 55 276 19.9
condition 28 180 15.5
subroutine 37 60 61.6
pod 25 26 96.1
total 335 973 34.4


line stmt bran cond sub pod time code
1             # -*- cperl -*-
2              
3             =head1 NAME
4              
5             Treex::PML::IO - I/O support functions used by Treex::PML
6              
7             =head1 DESCRIPTION
8              
9             This module implements various I/O and filesystem related functions
10             used by L.
11              
12             The current implementation supports the following protocols for
13             reading:
14              
15             http, https, ftp, gopher, news - reading (POSIX and Windows)
16              
17             ssh, fish, sftp - reading/writing on POSIX systems via secure shell copy
18             or the kioclient from KDE.
19              
20             The module attempts to handle GNU Zip-compressed files (suffix .gz)
21             transparently.
22              
23             =head1 FUNCTIONS
24              
25             =cut
26              
27             package Treex::PML::IO;
28 6     6   43 use Exporter;
  6         11  
  6         266  
29 6     6   3839 use File::Temp 0.14 qw();
  6         66384  
  6         165  
30 6     6   2456 use IO::File;
  6         5167  
  6         600  
31 6     6   2425 use IO::Pipe;
  6         6191  
  6         182  
32 6     6   38 use strict;
  6         14  
  6         109  
33 6     6   29 use URI;
  6         14  
  6         107  
34 6     6   2547 use URI::file;
  6         26255  
  6         194  
35 6     6   43 use URI::Escape;
  6         13  
  6         314  
36 6     6   35 use Scalar::Util qw(blessed);
  6         11  
  6         243  
37 6     6   34 use UNIVERSAL::DOES;
  6         13  
  6         164  
38 6     6   32 use Carp;
  6         13  
  6         269  
39 6     6   3673 use LWP::UserAgent;
  6         197163  
  6         258  
40 6     6   59 use File::Spec;
  6         15  
  6         139  
41 6     6   30 use Fcntl qw(SEEK_SET);
  6         14  
  6         406  
42              
43 6     6   40 use Cwd qw(getcwd);
  6         15  
  6         309  
44              
45 6         1085 use vars qw(@ISA $VERSION @EXPORT @EXPORT_OK
46             %UNLINK_ON_CLOSE
47             $Debug
48             $kioclient $kioclient_opts
49             $ssh $ssh_opts
50             $curl $curl_opts
51             $gzip $gzip_opts
52             $zcat $zcat_opts
53             $reject_proto
54             $lwp_user_agent
55 6     6   37 );
  6         10  
56              
57             sub DOES {
58 0     0 0 0 my ($self,$role)=@_;
59 0 0 0     0 if ($role eq 'IO' or $role eq __PACKAGE__) { # backward compatibility
60 0         0 return 1;
61             } else {
62 0         0 return $self->SUPER::DOES($role);
63             }
64             }
65              
66             {
67             package Treex::PML::IO::UserAgent;
68 6     6   53 use base qw(LWP::UserAgent);
  6         13  
  6         1748  
69             }
70              
71             #$Debug=0;
72             my %input_protocol_handler;
73              
74             BEGIN {
75             *_find_exe = eval {
76             require File::Which;
77             \&File::Which::which
78 6   50 6   24 } || sub {};
79              
80 6         16 $VERSION = '2.24'; # version template
81 6         74 @ISA=qw(Exporter);
82 6         34 @EXPORT_OK = qw($kioclient $kioclient_opts
83             $ssh $ssh_opts
84             $curl $curl_opts
85             $gzip $gzip_opts
86             $zcat $zcat_opts
87             &set_encoding
88             &open_backend &open_uri &close_backend &close_uri
89             &get_protocol "e_filename
90             &rename_uri);
91              
92 6   33     49 $zcat ||= _find_exe('zcat');
93 6   33     1307 $gzip ||= _find_exe('gzip');
94 6   33     1016 $kioclient ||= _find_exe('kioclient');
95 6   33     1058 $ssh ||= _find_exe('ssh');
96 6   33     879 $curl ||= _find_exe('curl');
97 6   50     1064 $ssh_opts ||= '-C';
98 6   50     35 $reject_proto ||= '^(pop3?s?|imaps?)\$';
99 6         66 $lwp_user_agent = Treex::PML::IO::UserAgent->new(keep_alive=>1);
100 6         25045 $lwp_user_agent->agent("Treex::PML_IO/$VERSION");
101             };
102              
103              
104             =over 4
105              
106             =item C
107              
108             Returns directory part of a given path (including volume).
109              
110             =cut
111              
112             sub DirPart {
113 8     8 1 497 return File::Spec->catpath(
114             (File::Spec->splitpath($_[0]))[0,1],''
115             );
116             }
117              
118             =item C
119              
120             If called without an argument, returns the directory of the perl
121             module or macro-file that invoked this macro.
122              
123             If a relative path is given as an argument, a respective absolute path
124             is computed based on the caller's directory and returned.
125              
126             =cut
127              
128             sub CallerDir {
129             return
130 8 50   8 1 88 @_>0
131             ? File::Spec->rel2abs($_[0], DirPart( (caller)[1] ))
132             : DirPart( (caller)[1] );
133             }
134              
135             =item C
136              
137             Register a callback to fetch URIs of a given protocol. C<$scheme> is
138             the URI scheme of the protocol (i.e. the first part of an URI
139             preceding the comma, e.g. 'ftp' or 'https'). <$callback> is either a
140             CODE reference or an ARRAY reference whose first element is a CODE
141             reference and the other elements are additional arguments to be passed
142             to the callback prior to the standard arguments.
143              
144             When the library attempts to fetch a resource from an URI matching
145             given scheme, the callback is invoked with the (optional) user
146             parameters followed by the URI.
147              
148             The callback function must either return a new URI (typically a
149             file:// URI pointing to a temporary file) and a boolean flag
150             indicating whether the library should attempt to delete the
151             returned file after it finished reading.
152              
153             If the callback returns the same or another URI with the C<$scheme>,
154             the callback is not reinvoked, but passed on to further processing
155             (i.e. by Treex::PML I/O backends).
156              
157             =cut
158              
159             sub register_input_protocol_handler {
160 0     0 1 0 my ($proto,$handler)=@_;
161 0 0 0     0 if (ref($handler) eq 'CODE' or ref($handler) eq 'ARRAY') {
162 0 0       0 if (exists($input_protocol_handler{$proto})) {
163 0         0 carp(__PACKAGE__."::register_input_protocol_handler: WARNING: redefining protocol handler for '$proto'");
164             }
165 0         0 $input_protocol_handler{$proto}=$handler;
166             } else {
167 0         0 croak("Wrong arguments. Usage: Treex::PML::IO::register_input_protocol_handler(protocol=>callback)");
168             }
169             }
170              
171             =item unregister_input_protocol_handler($scheme)
172              
173             Unregister a handler for a given URI scheme.
174              
175             =cut
176              
177             sub unregister_input_protocol_handler {
178 0     0 1 0 my ($proto)=@_;
179 0         0 return delete $input_protocol_handler{$proto};
180             }
181              
182             =item get_input_protocol_handler($scheme)
183              
184             Returns the user-defined handler registered for a given URI scheme; if
185             none, undef is returned.
186              
187             =cut
188              
189             sub get_input_protocol_handler {
190 0     0 1 0 my ($proto)=@_;
191 0         0 return $input_protocol_handler{$proto};
192             }
193              
194             =item set_encoding($filehandle, $encoding)
195              
196             Safely resets Perl I/O-layer on a given filehandle to decode or encode
197             from/to a given encoding. This is equivalent to:
198              
199             binmode($filehandle,":raw:perlio:encoding($encoding)");
200              
201             except that errors are turned into warnings.
202              
203             =cut
204              
205             sub set_encoding {
206 323     323 1 785 my ($fh,$encoding) = @_;
207 6     6   4585 no integer;
  6         87  
  6         30  
208 323 50 33     1443 if (defined($fh) and defined($encoding) and ($]>=5.008)) {
      33        
209 0         0 eval {
210 0         0 binmode($fh,":raw:perlio:encoding($encoding)");
211             };
212 0 0       0 warn $@ if $@;
213             }
214 323         1227 return $fh;
215             }
216              
217             =item get_protocol($filename_or_URI)
218              
219             If the argument is a filename, returns 'file'; if the argument is an
220             URI, returns the URI's scheme. Note: unless the argument is an URI
221             object, a heuristic is used to determine the scheme. To avoid
222             reporting Windows drive names as URI schemes, only URI schemes
223             consisting of at least two characters are supported, i.e. C:foo is
224             considered a file name wheres CC:foo would be an URI with the scheme
225             'CC'.
226              
227             =cut
228              
229             # to avoid collision with Win32 drive-names, we only support protocols
230             # with at least two letters
231             sub get_protocol {
232 416     416 1 838 my ($uri) = @_;
233 416 100 66     1933 if (blessed($uri) and $uri->isa('URI')) {
234 95   50     330 return $uri->scheme || 'file';
235             }
236 321 100       989 if ($uri =~ m{^\s*([[:alnum:]][[:alnum:]]+):}) {
237 20         77 return $1;
238             } else {
239 301         1167 return 'file';
240             }
241             }
242              
243             =item quote_filename($string)
244              
245             Returns given string in shell-quotes with special characters (\, $, ")
246             escaped.
247              
248             =cut
249              
250             sub quote_filename {
251 0     0 1 0 my ($uri)=@_;
252 0         0 $uri =~ s{\\}{\\\\}g;
253 0         0 $uri =~ s{\$}{\\\$}g;
254 0         0 $uri =~ s{"}{\\"}g;
255 0         0 return '"'.$uri.'"';
256             }
257              
258             =item get_filename($URI_or_filename)
259              
260             Upgrades given string to an URI and if the resulting URI is in the
261             'file' scheme (e.g. file:///bar/baz), returns the file-name portion of
262             the URI (e.g. /bar/baz). Otherwise returns nothing.
263              
264             =cut
265              
266             sub get_filename {
267 464     464 1 965 my ($uri)=@_;
268 464         918 $uri=make_URI($uri); # cast to URI or make a copy
269 464 100       15055 $uri->scheme('file') if !$uri->scheme;
270 464 50       14812 if ($uri->scheme eq 'file') {
271 464         6503 return $uri->file;
272             }
273 0         0 return;
274             }
275              
276             =item make_abs_URI($URL_or_filename)
277              
278             Upgrades a given string (URL or filename) into an URI object with
279             absolute path (relative URIs are resolved using the current working
280             directory obtained via Cwd::getcwd())
281              
282             =cut
283              
284             sub make_abs_URI {
285 215     215 1 605 my ($url)=@_;
286 215         432 my $uri = make_URI($url);
287 215         12935 my $cwd = getcwd();
288 215 50       1041 $cwd = VMS::Filespec::unixpath($cwd) if $^O eq 'VMS';
289 215         797 $cwd = URI::file->new($cwd);
290 215 50       23304 $cwd .= "/" unless substr($cwd, -1, 1) eq "/";
291 215         2787 return $uri->abs($cwd);
292             }
293              
294             =item make_URI($URL_or_filename)
295              
296             Upgrades a given string (URL or filename) into an URI object.
297              
298             =cut
299              
300             sub make_URI {
301 1040     1040 1 1702 my ($url)=@_;
302 1040         3244 my $uri = URI->new($url);
303 1040 100 66     53442 return $uri if blessed($url) and $url->isa('URI'); # return a copy if was URI already
304 349 100 33     2586 if (($uri eq $url or URI::Escape::uri_unescape($uri) eq $url)
      66        
305             and $url =~ m(^\s*[[:alnum:]]+://)) { # looks like it is URL already
306 22         248 return $uri;
307             } else {
308 327         4148 return URI::file->new($url);
309             }
310             }
311              
312             =item make_relative_URI($URL,$baseURI)
313              
314             Returns a relative URI based in a given base URI. The arguments
315             are automatically upgraded using make_URI() if necessary.
316              
317             =cut
318              
319             sub make_relative_URI {
320 28     28 1 1771 my ($href,$base)=@_;
321             # if (Treex::PML::_is_url($href)) {
322 28 50 33     326 $href = URI->new(make_URI($href)) unless blessed($href) and $href->isa('URI');
323 28         99 $base = make_URI($base);
324             ### $href = $href->abs($base)->rel($base);
325 28         3404 $href = $href->rel($base);
326             }
327              
328             =item strip_protocol($URI)
329              
330             Returns the scheme-specific part of the URI (everything between the
331             scheme and the fragment). If the scheme of the URI was 'file', returns
332             the URI as a file name.
333              
334             =cut
335              
336             sub strip_protocol {
337 0     0 1 0 my ($uri)=@_;
338 0         0 $uri=make_URI($uri); # make a copy
339 0 0       0 $uri->scheme('file') if !$uri->scheme;
340 0 0       0 if ($uri->scheme eq 'file') {
341 0         0 return $uri->file;
342             }
343 0         0 return $uri->opaque;
344             }
345              
346             # =item is_gzip($filename)
347              
348             # Auxiliary:
349             # Returns true if the filename ends with the suffix .gz or .gz~.
350              
351             # =cut
352              
353             sub _is_gzip {
354 373 100   373   1389 ($_[0] =~/.gz~?$/) ? 1 : 0;
355             }
356              
357             =item is_same_filename($URI_1,$URI_2)
358              
359             Checks if $URI_1 and $URI_2 point to the same resource. For filenames
360             and URIs in the 'file' scheme checks that the referred files (if
361             exist) are the same using is_same_file(); for other schemes simply
362             checks for string equality on canonical versions of the URIs (see
363             URI->canonical).
364              
365             =cut
366              
367             sub is_same_filename {
368 0     0 1 0 my ($f1,$f2)=@_;
369 0 0       0 return 1 if $f1 eq $f2;
370 0 0 0     0 my $u1 = (blessed($f1) and $f1->isa('URI')) ? $f1 : make_URI($f1);
371 0 0 0     0 my $u2 = (blessed($f2) and $f2->isa('URI')) ? $f2 : make_URI($f2);
372 0 0       0 return 1 if $u1 eq $u2;
373 0 0       0 return 1 if $u1->canonical eq $u2->canonical;
374 0 0 0     0 if (!ref($f1) and !ref($f2) and $^O ne 'MSWin32' and -f $f1 and -f $f2) {
      0        
      0        
      0        
375 0         0 return is_same_file($f1,$f2);
376             }
377 0         0 return 0;
378             }
379              
380             =item is_same_file($filename_1,$filename_2)
381              
382             Uses device and i-node numbers (reported by stat()) to check if the
383             two filenames point to the same file on the filesystem. Returns 1 if
384             yes, 0 otherwise.
385              
386             =cut
387              
388             sub is_same_file {
389 0     0 1 0 my ($f1,$f2) = @_;
390 0 0       0 return 1 if $f1 eq $f2;
391 0         0 my ($d1,$i1)=stat($f1);
392 0         0 my ($d2,$i2)=stat($f2);
393 0 0 0     0 return ($d1==$d2 and $i1!=0 and $i1==$i2) ? 1 : 0;
394             }
395              
396             =item open_pipe($filename,$mode,$command)
397              
398             Returns a filehandle of a newly open pipe in a given mode.
399              
400             In write mode ($mode = 'w'), opens a writing pipe to a given
401             command redirecting the standard output of the command to a given
402             file. Moreover, if the last suffix of the $filename is '.gz' or
403             '.gz~', the output of the command is gzipped before saving to
404             $filename.
405              
406             In read mode ($mode = 'r'), opens a reading pipe to a given
407             command redirecting the content of the given file to the standard
408             input of the command. Moreover, if the last suffix of the $filename is
409             '.gz' or '.gz~', the output of the command is un-gzipped before it is passed
410             to the command.
411              
412             =cut
413              
414             sub open_pipe {
415 0     0 1 0 my ($file,$rw,$pipe) = @_;
416 0         0 my $fh;
417 0 0       0 if (_is_gzip($file)) {
418 0 0 0     0 if (-x $gzip && -x $zcat) {
419 0 0       0 if ($rw eq 'w') {
420 0   0     0 open $fh, "| $pipe | $gzip $gzip_opts > ".quote_filename($file) || undef $fh;
421             } else {
422 0   0     0 open $fh, "$zcat $zcat_opts < ".quote_filename($file)." | $pipe |" || undef $fh;
423             }
424             } else {
425 0         0 warn "Need a functional gzip and zcat to open this file\n";
426             }
427             } else {
428 0 0       0 if ($rw eq 'w') {
429 0   0     0 open $fh, "| $pipe > ".quote_filename($file) || undef $fh;
430             } else {
431 0   0     0 open $fh, "$pipe < ".quote_filename($file)." |" || undef $fh;
432             }
433             }
434 0         0 return $fh;
435             }
436              
437             # _open_file_zcat:
438             #
439             # Note: This function represents the original strategy used on POSIX
440             # systems. It turns out, however, that the calls to zcat/gzip cause
441             # serious penalty on btred when loading large amount of files and also
442             # cause the process' priority to lessen. It also turns out that we
443             # cannot use IO::Zlib filehandles directly with some backends, such as
444             # StorableBackend.
445             #
446             # I'm leaving the function here, but it is not used anymore.
447              
448             sub _open_file_zcat {
449 0     0   0 my ($file,$rw) = @_;
450 0         0 my $fh;
451 0 0       0 if (_is_gzip($file)) {
452 0 0       0 if (-x $gzip) {
453 0         0 $fh = new IO::Pipe();
454 0 0       0 if ($rw eq 'w') {
455 0 0       0 $fh->writer("$gzip $gzip_opts > ".quote_filename($file)) || undef $fh;
456             } else {
457 0 0       0 $fh->reader("$zcat $zcat_opts < ".quote_filename($file)) || undef $fh;
458             }
459             }
460 0 0       0 unless ($fh) {
461 0 0       0 eval {
462 0         0 require IO::Zlib;
463 0         0 $fh = new IO::Zlib;
464             } || return;
465 0 0       0 $fh->open($file,$rw."b") || undef $fh;
466             }
467             } else {
468 0         0 $fh = new IO::File();
469 0 0       0 $fh->open($file,$rw) || undef $fh;
470             }
471 0         0 return $fh;
472             }
473              
474             =item open_file($filename,$mode)
475              
476             Opens a given file for reading ($mode = 'r') or writing ($mode =
477             'w'). If the last suffix of the filename is '.gz' or '.gz~', the data
478             are transparently un-gzipped (when reading) or gzipped (when writing).
479              
480             =cut
481              
482             sub open_file {
483 201     201 1 505 my ($file,$rw) = @_;
484 201         470 my $fh;
485 201 100       421 if (_is_gzip($file)) {
486 5         10 eval {
487 5         63 $fh = File::Temp->new(UNLINK => 1);
488             };
489 5 50       5018 die if $@;
490 5 50       28 return unless $fh;
491 5 50       66 if ($rw eq 'w') {
492 5 50       21 print STDERR __PACKAGE__.": Storing ZIPTOFILE: $rw\n" if $Debug;
493 5         14 ${*$fh}{'ZIPTOFILE'}=$file;
  5         23  
494             } else {
495 0         0 my $tmp;
496 0 0 0     0 eval {
497 0         0 require IO::Zlib;
498 0         0 $tmp = new IO::Zlib();
499             } && $tmp || return;
500 0 0       0 $tmp->open($file,"rb") || return;
501 0         0 my $buffer;
502 0         0 my $length = 1024*1024;
503 0         0 while (read($tmp,$buffer,$length)) {
504 0         0 $fh->print($buffer);
505             }
506 0         0 $tmp->close();
507 0         0 seek($fh,0,SEEK_SET);
508             }
509 5         26 return $fh;
510             } else {
511 196         1372 $fh = new IO::File();
512 196 50       7487 $fh->open($file,$rw) || return;
513             }
514 196         16234 return $fh;
515             }
516              
517             sub _callback {
518 0     0   0 my $callback = shift;
519 0 0       0 if (ref($callback) eq 'CODE') {
    0          
520 0         0 return $callback->(@_);
521             } elsif (ref($callback) eq 'ARRAY') {
522 0         0 my ($cb,@args)=@{$callback};
  0         0  
523 0         0 $cb->(@args,@_);
524             }
525             }
526              
527             sub _fetch_file {
528 173     173   334 my ($uri) = @_;
529 173         2529 my $proto = get_protocol($uri);
530 173 50 0     1914 if ($proto eq 'file') {
    0          
    0          
531 173         488 my $file = get_filename($uri);
532 173 50       21848 print STDERR __PACKAGE__.": _fetch_file: $file\n" if $Debug;
533 173 100       3888 die("File does not exist: $file\n") unless -e $file;
534 172 50       2472 die("File is not readable: $file\n") unless -r $file;
535 172 50       1951 die("File is empty: $file\n") if -z $file;
536 172         973 return ($file,0);
537             } elsif ($proto eq 'ntred' or $proto =~ /$reject_proto/) {
538 0         0 return ($uri,0);
539             } elsif (exists($input_protocol_handler{$proto})) {
540 0         0 my ($new_uri,$unlink) = _callback($input_protocol_handler{$proto},$uri);
541 0         0 my $new_proto = get_protocol($new_uri);
542 0 0       0 if ($new_proto ne $proto) {
543 0         0 return _fetch_file($new_uri);
544             } else {
545 0         0 return ($new_uri,$unlink);
546             }
547             } else {
548 0 0       0 if ($^O eq 'MSWin32') {
549 0         0 return _fetch_file_win32($uri,$proto);
550             } else {
551 0         0 return _fetch_file_posix($uri,$proto);
552             }
553             }
554             }
555              
556              
557             =item fetch_file($uri)
558              
559             Fetches a resource from a given URI and returns a path to a local file
560             with the content of the resource and a boolean unlink flag. If the
561             unlink flag is true, the caller is responsible for removing the local
562             file when finished using it. Otherwise, the caller should not remove
563             the file (usually when it points to the original resource). The
564             caller may assume that the resource is already un-gzipped if the URI
565             had the '.gz' or '.gz~' suffix.
566              
567             =cut
568              
569             sub fetch_file {
570 173     173 1 357 my ($uri) = @_;
571 173         485 my ($file,$unlink) = &_fetch_file;
572 172 100 66     571 if (get_protocol($file) eq 'file' and _is_gzip($uri)) {
573 15         550 my ($fh,$ungzfile) = File::Temp::tempfile("tredgzioXXXXXX",
574             DIR => File::Spec->tmpdir(),
575             UNLINK => 0,
576             );
577 15 50       6160 die "Cannot create temporary file: $!" unless $fh;
578 15         38 my $tmp;
579 15 50 33     46 eval {
580 15         596 require IO::Zlib;
581 15         48414 $tmp = new IO::Zlib();
582             } && $tmp || die "Cannot load IO::Zlib: $@";
583 15 50       854 $tmp->open($file,"rb") || die "Cannot read $uri ($file)";
584 15         40158 my $buffer;
585 15         33 my $length = 1024*1024;
586 15         83 while (read($tmp,$buffer,$length)) {
587 15         53600 $fh->print($buffer);
588             }
589 15         7132 $tmp->close();
590 15         2799 $fh->close;
591 15 50       529 unlink $file if $unlink;
592 15         166 return ($ungzfile,1);
593             } else {
594 157         1073 return ($file,$unlink);
595             }
596             }
597              
598              
599             sub _fetch_cmd {
600 0     0   0 my ($cmd, $filename)=@_;
601 0 0       0 print STDERR __PACKAGE__.": _fetch_cmd: $cmd\n" if $Debug;
602 0 0       0 if (system($cmd." > ".$filename)==0) {
603 0         0 return ($filename,1);
604             } else {
605 0         0 warn "$cmd > $filename failed (code $?): $!\n";
606 0         0 return $filename,0;
607             }
608             }
609              
610             sub _fetch_with_lwp {
611 0     0   0 my ($uri,$fh,$filename)=@_;
612 0         0 my $status = $lwp_user_agent->get($uri, ':content_file' => $filename);
613 0 0 0     0 if ($status and $status->is_error and $status->code == 401) {
      0        
614             # unauthorized
615             # Got authorization error 401, maybe the nonce is stale, let's try again...
616 0         0 $status = $lwp_user_agent->get($uri, ':content_file' => $filename);
617             }
618 0 0       0 if ($status->is_success()) {
619 0         0 close $fh;
620 0         0 return ($filename,1);
621             } else {
622 0         0 unlink $fh;
623 0         0 close $fh;
624 0         0 die "Error occured while fetching URL $uri $@\n".
625             $status->status_line()."\n";
626             }
627             }
628              
629             sub _fetch_file_win32 {
630 0     0   0 my ($uri,$proto)=@_;
631 0 0       0 my ($fh,$filename) = File::Temp::tempfile("tredioXXXXXX",
632             DIR => File::Spec->tmpdir(),
633             SUFFIX => (_is_gzip($uri) ? ".gz" : ""),
634             UNLINK => 0,
635             );
636 0 0       0 print STDERR __PACKAGE__.": fetching URI $uri as proto $proto to $filename\n" if $Debug;
637 0 0       0 if ($proto=~m(^https?|ftp|gopher|news)) {
638 0         0 return _fetch_with_lwp($uri,$fh,$filename);
639             }
640 0         0 return($uri,0);
641             }
642              
643             sub _fetch_file_posix {
644 0     0   0 my ($uri,$proto)=@_;
645 0 0       0 print STDERR __PACKAGE__.": fetching file using protocol $proto ($uri)\n" if $Debug;
646 0 0       0 my ($fh,$tempfile) = File::Temp::tempfile("tredioXXXXXX",
647             DIR => File::Spec->tmpdir(),
648             SUFFIX => (_is_gzip($uri) ? ".gz" : ""),
649             UNLINK => 0,
650             );
651 0 0       0 print STDERR __PACKAGE__.": tempfile: $tempfile\n" if $Debug;
652 0 0       0 if ($proto=~m(^https?|ftp|gopher|news)) {
653 0         0 return _fetch_with_lwp($uri,$fh,$tempfile);
654             }
655 0         0 close($fh);
656 0 0 0     0 if ($ssh and -x $ssh and $proto =~ /^(ssh|fish|sftp)$/) {
      0        
657 0 0       0 print STDERR __PACKAGE__.": using plain ssh\n" if $Debug;
658 0 0       0 if ($uri =~ m{^\s*(?:ssh|sftp|fish):(?://)?([^-/][^/]*)(/.*)$}) {
659 0         0 my ($host,$file) = ($1,$2);
660 0 0       0 print STDERR __PACKAGE__.": tempfile: $tempfile\n" if $Debug;
661             return
662 0         0 _fetch_cmd($ssh." ".$ssh_opts." ".quote_filename($host).
663             " /bin/cat ".quote_filename(quote_filename($file)),$tempfile);
664             } else {
665 0         0 die "failed to parse URI for ssh $uri\n";
666             }
667             }
668 0 0 0     0 if ($kioclient and -x $kioclient) {
669 0 0       0 print STDERR __PACKAGE__.": using kioclient\n" if $Debug;
670             # translate ssh protocol to fish protocol
671 0 0       0 if ($proto eq 'ssh') {
672 0         0 ($uri =~ s{^\s*ssh:(?://)?([/:]*)[:/]}{fish://$1/});
673             }
674 0         0 return _fetch_cmd($kioclient." ".$kioclient_opts.
675             " cat ".quote_filename($uri),$tempfile);
676             }
677 0 0 0     0 if ($curl and -x $curl and $proto =~ /^(?:https?|ftps?|gopher)$/) {
      0        
678 0         0 return _fetch_cmd($curl." ".$curl_opts." ".quote_filename($uri),$tempfile);
679             }
680 0         0 warn "No handlers for protocol $proto\n";
681 0         0 return ($uri,0);
682             }
683              
684             sub _open_upload_pipe {
685 0     0   0 my ($need_gzip,$user_pipe,$upload_pipe)=@_;
686 0         0 my $fh;
687 0 0 0     0 $user_pipe="| ".$user_pipe if defined($user_pipe) and $user_pipe !~ /^\|/;
688 0         0 $user_pipe.=" ";
689 0         0 my $cmd;
690 0 0       0 if ($need_gzip) {
691 0 0       0 if (-x $gzip) {
692 0         0 $cmd = $user_pipe."| $gzip $gzip_opts | $upload_pipe ";
693             } else {
694 0         0 die "Need a functional gzip and zcat to open this file\n";
695             }
696             } else {
697 0         0 $cmd = $user_pipe."| $upload_pipe ";
698             }
699 0 0       0 print STDERR __PACKAGE__.": upload: $cmd\n" if $Debug;
700 0   0     0 open $fh, $cmd || undef $fh;
701 0         0 return $fh;
702             }
703              
704             sub _get_upload_fh_win32 {
705 0     0   0 my ($uri,$proto,$userpipe)=@_;
706 0         0 die "Can't save files using protocol $proto on Windows\n";
707             }
708              
709             sub _get_upload_fh_posix {
710 0     0   0 my ($uri,$proto,$userpipe)=@_;
711 0 0       0 print STDERR __PACKAGE__.": uploading file using protocol $proto ($uri)\n" if $Debug;
712 0 0 0     0 return if $proto eq 'http' or $proto eq 'https';
713 0 0 0     0 if ($ssh and -x $ssh and $proto =~ /^(ssh|fish|sftp)$/) {
      0        
714 0 0       0 print STDERR __PACKAGE__.": using plain ssh\n" if $Debug;
715 0 0       0 if ($uri =~ m{^\s*(?:ssh|sftp|fish):(?://)?([^-/][^/]*)(/.*)$}) {
716 0         0 my ($host,$file) = ($1,$2);
717 0         0 return _open_upload_pipe(_is_gzip($uri), $userpipe, "$ssh $ssh_opts ".
718             quote_filename($host)." /bin/cat \\> ".
719             quote_filename(quote_filename($file)));
720             } else {
721 0         0 die "failed to parse URI for ssh $uri\n";
722             }
723             }
724 0 0 0     0 if ($kioclient and -x $kioclient) {
725 0 0       0 print STDERR __PACKAGE__.": using kioclient\n" if $Debug;
726             # translate ssh protocol to fish protocol
727 0 0       0 if ($proto eq 'ssh') {
728 0         0 $uri =~ s{^\s*ssh:(?://)?([/:]*)[:/]}{fish://$1/};
729             }
730 0         0 return _open_upload_pipe(_is_gzip($uri),$userpipe,
731             "$kioclient $kioclient_opts put ".quote_filename($uri));
732             }
733 0 0 0     0 if ($curl and -x $curl and $proto =~ /^(?:ftps?)$/) {
      0        
734 0         0 return _open_upload_pipe("$curl --upload-file - $curl_opts ".quote_filename($uri));
735             }
736 0         0 die "No handlers for protocol $proto\n";
737             }
738              
739             =item get_store_fh ($uri, $command?)
740              
741             If $command is provided, returns a writable filehandle for a pipe to a given
742             command whose output is redirected to an uploader to the given $URI
743             (for file $URIs this simply redirects the output of the command to the
744             given file (gzipping the data first if the $URI ends with the '.gz' or
745             '.gz~' suffix).
746              
747             If $command is not given, simly retuns a writable file handle to a
748             given file (possibly performing gzip if the file name ends with the
749             '.gz' or '.gz~' suffix).
750              
751             =cut
752              
753             sub get_store_fh {
754 31     31 1 95 my ($uri,$user_pipe) = @_;
755 31         113 my $proto = get_protocol($uri);
756 31 50 0     112 if ($proto eq 'file') {
    0          
757 31         98 $uri = get_filename($uri);
758 31 50       3350 if ($user_pipe) {
759 0         0 return open_pipe($uri,'w',$user_pipe);
760             } else {
761 31         149 return open_file($uri,'w');
762             }
763             } elsif ($proto eq 'ntred' or $proto =~ /$reject_proto/) {
764 0         0 return $uri;
765             } else {
766 0 0       0 if ($^O eq 'MSWin32') {
767 0         0 return _get_upload_fh_win32($uri,$proto,$user_pipe);
768             } else {
769 0         0 return _get_upload_fh_posix($uri,$proto,$user_pipe);
770             }
771             }
772             }
773              
774             =item unlink_uri($URI)
775              
776             Delete the resource point to by a given URI (if supported by the
777             corresponding protocol handler).
778              
779             =cut
780              
781             sub unlink_uri {
782 0 0   0 1 0 ($^O eq 'MSWin32') ? &_unlink_uri_win32 : &_unlink_uri_posix;
783             }
784              
785             sub _unlink_uri_win32 {
786 0     0   0 my ($uri) = @_;
787 0         0 my $proto = get_protocol($uri);
788 0 0       0 if ($proto eq 'file') {
789 0         0 unlink get_filename($uri);
790             } else {
791 0         0 die "Can't unlink file $uri\n";
792             }
793             }
794              
795             sub _unlink_uri_posix {
796 0     0   0 my ($uri)=@_;
797 0         0 my $proto = get_protocol($uri);
798 0 0       0 if ($proto eq 'file') {
799 0         0 return unlink get_filename($uri);
800             }
801 0 0       0 print STDERR __PACKAGE__.": unlinking file $uri using protocol $proto\n" if $Debug;
802 0 0 0     0 if ($ssh and -x $ssh and $proto =~ /^(ssh|fish|sftp)$/) {
      0        
803 0 0       0 print STDERR __PACKAGE__.": using plain ssh\n" if $Debug;
804 0 0       0 if ($uri =~ m{^\s*(?:ssh|sftp|fish):(?://)?([^-/][^/]*)(/.*)$}) {
805 0         0 my ($host,$file) = ($1,$2);
806 0 0       0 return (system("$ssh $ssh_opts ".quote_filename($host)." /bin/rm ".
807             quote_filename(quote_filename($file)))==0) ? 1 : 0;
808             } else {
809 0         0 die "failed to parse URI for ssh $uri\n";
810             }
811             }
812 0 0 0     0 if ($kioclient and -x $kioclient) {
813             # translate ssh protocol to fish protocol
814 0 0       0 if ($proto eq 'ssh') {
815 0         0 $uri =~ s{^\s*ssh:(?://)?([/:]*)[:/]}{fish://$1/};
816             }
817 0 0       0 return (system("$kioclient $kioclient_opts rm ".quote_filename($uri))==0 ? 1 : 0);
818             }
819 0         0 die "No handlers for protocol $proto\n";
820             }
821              
822             =item rename_uri($URI_1,$URI_2)
823              
824             Rename the resource point to by $URI_1 to $URI_2 (if supported by the
825             corresponding protocol handlers). The URIs must point to the same
826             physical storage.
827              
828             =cut
829              
830             sub rename_uri {
831 20 50   20 1 231 print STDERR __PACKAGE__.": rename @_\n" if $Debug;
832 20 50       136 ($^O eq 'MSWin32') ? &_rename_uri_win32 : &_rename_uri_posix;
833             }
834              
835              
836             sub _rename_uri_win32 {
837 0     0   0 my ($uri1,$uri2) = @_;
838 0         0 my $proto1 = get_protocol($uri1);
839 0         0 my $proto2 = get_protocol($uri2);
840 0 0 0     0 if ($proto1 eq 'file' and $proto2 eq 'file') {
841 0         0 my $uri1 = get_filename($uri1);
842 0 0       0 return unless -f $uri1;
843 0         0 rename $uri1, get_filename($uri2);
844             } else {
845 0         0 die "Can't rename file $uri1 to $uri2\n";
846             }
847             }
848              
849             sub _rename_uri_posix {
850 20     20   82 my ($uri1,$uri2) = @_;
851 20         81 my $proto = get_protocol($uri1);
852 20         271 my $proto2 = get_protocol($uri2);
853 20 50       72 if ($proto ne $proto2) {
854 0         0 die "Can't rename file $uri1 to $uri2\n";
855             }
856 20 50       84 if ($proto eq 'file') {
857 20         77 my $uri1 = get_filename($uri1);
858 20 50       3201 return unless -f $uri1;
859 20         138 return rename $uri1, get_filename($uri2);
860             }
861 0 0       0 print STDERR __PACKAGE__.": rename file $uri1 to $uri2 using protocol $proto\n" if $Debug;
862 0 0 0     0 if ($ssh and -x $ssh and $proto =~ /^(ssh|fish|sftp)$/) {
      0        
863 0 0       0 print STDERR __PACKAGE__.": using plain ssh\n" if $Debug;
864 0 0       0 if ($uri1 =~ m{^\s*(?:ssh|sftp|fish):(?://)?([^-/][^/]*)(/.*)$}) {
865 0         0 my ($host,$file) = ($1,$2);
866 0 0 0     0 if ($uri2 =~ m{^\s*(?:ssh|sftp|fish):(?://)?([^-/][^/]*)(/.*)$} and $1 eq $host) {
867 0         0 my $file2 = $2;
868 0 0       0 return (system("$ssh $ssh_opts ".quote_filename($host)." /bin/mv ".
869             quote_filename(quote_filename($file))." ".
870             quote_filename(quote_filename($file2)))==0) ? 1 : 0;
871             } else {
872 0         0 die "failed to parse URI for ssh $uri2\n";
873             }
874             } else {
875 0         0 die "failed to parse URI for ssh $uri1\n";
876             }
877             }
878 0 0 0     0 if ($kioclient and -x $kioclient) {
879             # translate ssh protocol to fish protocol
880 0 0       0 if ($proto eq 'ssh') {
881 0         0 $uri1 =~ s{^\s*ssh:(?://)?([/:]*)[:/]}{fish://$1/};
882 0         0 $uri2 =~ s{^\s*ssh:(?://)?([/:]*)[:/]}{fish://$1/};
883             }
884 0 0       0 return (system("$kioclient $kioclient_opts mv ".quote_filename($uri1).
885             " ".quote_filename($uri2))==0 ? 1 : 0);
886             }
887 0         0 die "No handlers for protocol $proto\n";
888             }
889              
890              
891              
892             =item open_backend (filename,mode,encoding?)
893              
894             Open given file for reading or writing (depending on mode which may be
895             one of "r" or "w"); Return the corresponding object based on
896             File::Handle class. Only files the filename of which ends with '.gz'
897             are considered to be gz-commpressed. All other files are opened using
898             IO::File.
899              
900             Optionally, in perl ver. >= 5.8, you may also specify file character
901             encoding.
902              
903             =cut
904              
905              
906             sub open_backend {
907 201     201 1 544 my ($filename, $rw,$encoding)=@_;
908 201         2427 $filename =~ s/^\s*|\s*$//g;
909 201 100       870 if ($rw eq 'r') {
    50          
910 170   50     546 return set_encoding(open_file($filename,$rw)||undef,$encoding);
911             } elsif ($rw eq 'w') {
912 31   50     175 return set_encoding(get_store_fh($filename)||undef,$encoding);
913             } else {
914 0         0 croak "2nd argument to open_backend must be 'r' or 'w'!";
915             }
916 0         0 return;
917             }
918              
919             =pod
920              
921             =item close_backend (filehandle)
922              
923             Close given filehandle opened by previous call to C
924              
925             =cut
926              
927             sub close_backend {
928 201     201 1 440 my ($fh)=@_;
929             # Win32 hack:
930 201 100       731 if (ref($fh) eq 'File::Temp') {
931 5         9 my $filename = ${*$fh}{'ZIPTOFILE'};
  5         23  
932 5 50       23 if ($filename ne "") {
933 5 50       17 print STDERR __PACKAGE__.": Doing the real save to $filename\n" if $Debug;
934 5         117 seek($fh,0,SEEK_SET);
935 5         71 require IO::Zlib;
936 5         56 my $tmp = new IO::Zlib();
937 5 50       396 $tmp->open($filename,"wb") || die "Cannot write to $filename: $!\n";
938             # probably bug in Perl 5.8.9? - using just :raw here is not enough
939 5         10559 binmode $fh, ':raw:perlio:bytes';
940 5         31 local $/;
941 5         871 $tmp->print(<$fh>);
942 5         12000 $tmp->close;
943             }
944             }
945 201         2888 my $ret;
946 201 50 33     2590 if ((blessed($fh) and $fh->isa('IO::Zlib'))) {
947 0         0 $ret = 1;
948             } else {
949 201   33     1254 $ret = ref($fh) && $fh->close();
950             }
951 201         7407 my $unlink = delete $UNLINK_ON_CLOSE{ $fh };
952 201 50       614 if ($unlink) {
953 0         0 unlink $unlink;
954             }
955 201         1095 return $ret;
956             }
957              
958              
959             =item open_uri (URI,encoding?)
960              
961             Open given URL for reading, returning an object based on File::Handle
962             class. Since for some types of URLs this function first copies the
963             data into a temporary file, use close_uri($fh) on the resulting
964             filehandle to close it and clean up the temporary file.
965              
966             Optionally, in perl ver. >= 5.8, you may also specify file character
967             encoding.
968              
969             =cut
970              
971             sub open_uri {
972 122     122 1 319 my ($uri,$encoding) = @_;
973 122         642 my ($local_file, $is_temporary) = fetch_file( $uri );
974 122   50     434 my $fh = open_backend($local_file,'r') || return;
975 122 50 33     438 if ($is_temporary and $local_file ne $uri ) {
976 0 0       0 if (!unlink($local_file)) {
977 0         0 $UNLINK_ON_CLOSE{ $fh } = $local_file;
978             }
979             }
980 122         282 return set_encoding($fh,$encoding);
981             }
982              
983             *close_uri = \&close_backend;
984              
985             =item close_uri (filehandle)
986              
987             Close given filehandle opened by previous call to C.
988              
989             =cut
990              
991              
992             =item copy_uri ($URI_1,$URI_2)
993              
994             Copy the resource pointed to by the URI $URI_1 to $URI_2. The type of
995             $URI_2 must be writable.
996              
997             =cut
998              
999             sub copy_uri {
1000 0     0 1   my ($src_uri,$target_uri)=@_;
1001 0 0         my $in = open_uri($src_uri)
1002             or die "Cannot open source $src_uri: $!\n";
1003 0 0         my $out = open_backend($target_uri,'w')
1004             or die "Cannot open target $target_uri: $!\n";
1005 0           my $L=1024*100;
1006 0           my $buffer;
1007 0           while(read($in,$buffer,$L)>0) {
1008 0           print $out ($buffer);
1009             }
1010 0           close_backend($in);
1011 0           close_backend($out);
1012             }
1013              
1014             =back
1015              
1016             =head1 COPYRIGHT AND LICENSE
1017              
1018             Copyright (C) 2006-2010 by Petr Pajas
1019              
1020             This library is free software; you can redistribute it and/or modify
1021             it under the same terms as Perl itself, either Perl version 5.8.2 or,
1022             at your option, any later version of Perl 5 you may have available.
1023              
1024             =cut