File Coverage

blib/lib/CPAN/Mini/Extract.pm
Criterion Covered Total %
statement 124 219 56.6
branch 19 74 25.6
condition 4 23 17.3
subroutine 25 32 78.1
pod 4 6 66.6
total 176 354 49.7


line stmt bran cond sub pod time code
1             package CPAN::Mini::Extract;
2              
3             =pod
4              
5             =head1 NAME
6              
7             CPAN::Mini::Extract - Create CPAN::Mini mirrors with the archives extracted
8              
9             =head1 SYNOPSIS
10              
11             # Create a CPAN extractor
12             my $cpan = CPAN::Mini::Extract->new(
13             remote => 'http://mirrors.kernel.org/cpan/',
14             local => '/home/adam/.minicpan',
15             trace => 1,
16             extract => '/home/adam/.cpanextracted',
17             extract_filter => sub { /\.pm$/ and ! /\b(inc|t)\b/ },
18             extract_check => 1,
19             );
20            
21             # Run the minicpan process
22             my $changes = $cpan->run;
23              
24             =head1 DESCRIPTION
25              
26             C provides a base for implementing systems that
27             download "all" of CPAN, extract the dists and then process the files
28             within.
29              
30             It provides the same syncronisation functionality as L except
31             that it also maintains a parallel directory tree that contains a directory
32             located at an identical path to each archive file, with a controllable
33             subset of the files in the archive extracted below.
34              
35             =head2 How does it work
36              
37             C starts with a L local mirror, which it
38             will optionally update before each run. Once the L directory
39             is current, it will scan both directory trees, extracting any new archives
40             and removing any extracted archives no longer in the minicpan mirror.
41              
42             =head1 EXTENDING
43              
44             This class is relatively straight forward, but may evolve over time.
45              
46             If you wish to write an extension, please stay in contact with the
47             maintainer while doing so.
48              
49             =head1 METHODS
50              
51             =cut
52              
53 3     3   52452 use 5.006;
  3         10  
  3         114  
54 3     3   14 use strict;
  3         5  
  3         76  
55 3     3   24 use Carp ();
  3         5  
  3         43  
56 3     3   13 use File::Basename ();
  3         5  
  3         60  
57 3     3   14 use File::Path ();
  3         10  
  3         66  
58 3     3   14 use File::Spec 0.80 ();
  3         376  
  3         96  
59 3     3   2849 use File::Remove 0.34 ();
  3         6808  
  3         74  
60 3     3   20 use List::Util 1.15 ();
  3         67  
  3         56  
61 3     3   9228 use File::HomeDir 0.88 ();
  3         21706  
  3         93  
62 3     3   4361 use File::Temp 0.21 ();
  3         84308  
  3         99  
63 3     3   2882 use URI 1.37 ();
  3         33698  
  3         94  
64 3     3   3228 use URI::file ();
  3         20803  
  3         93  
65 3     3   3633 use IO::File 1.14 ();
  3         3518  
  3         80  
66 3     3   3619 use IO::Uncompress::Gunzip 2.017 ();
  3         150627  
  3         110  
67 3     3   4352 use Archive::Tar 1.22 ();
  3         231377  
  3         101  
68 3     3   3401 use Params::Util 1.00 ();
  3         18020  
  3         96  
69 3     3   3693 use LWP::Online 0.03 ();
  3         226215  
  3         109  
70 3     3   2949 use File::Find::Rule 0.30 ();
  3         28193  
  3         124  
71 3     3   3521 use CPAN::Mini 1.111004 ();
  3         26525  
  3         100  
72              
73 3     3   107 use vars qw{$VERSION @ISA};
  3         8  
  3         257  
74             BEGIN {
75 3     3   7 $VERSION = '1.23';
76 3         6426 @ISA = 'CPAN::Mini';
77             }
78              
79              
80              
81              
82              
83             #####################################################################
84             # Constructor and Accessors
85              
86             =pod
87              
88             =head2 new
89              
90             The C constructor is used to create and configure a new CPAN
91             Processor. It takes a set of named params something like the following.
92              
93             # Create a CPAN processor
94             my $Object = CPAN::Mini::Extract->new(
95             # The normal CPAN::Mini params
96             remote => 'ftp://cpan.pair.com/pub/CPAN/',
97             local => '/home/adam/.minicpan',
98             trace => 1,
99            
100             # Additional params
101             extract => '/home/adam/explosion',
102             extract_filter => sub { /\.pm$/ and ! /\b(inc|t)\b/ },
103             extract_check => 1,
104             );
105              
106             =over
107              
108             =item minicpan args
109              
110             C inherits from L, so all of the arguments
111             that can be used with L will also work with
112             C.
113              
114             Please note that C applies some additional defaults
115             beyond the normal ones, like turning C on.
116              
117             =item offline
118              
119             Although useless with L itself, the C flag will
120             cause the CPAN synchronisation step to be skipped, and only any
121             extraction tasks to be done. (False by default)
122              
123             =item extract
124              
125             Provides the directory (which must exist and be writable, or be creatable)
126             that the tarball dists should be extracted to.
127              
128             =item extract_filter
129              
130             C allows you to specify a filter controlling which
131             types of files are extracted from the Archive. Please note that ONLY
132             normal files are ever considered for extraction from an archive, with
133             any directories needed created automatically.
134              
135             Although by default C only extract files of type .pm,
136             .t and .pl from the archives, you can add a list of additional things you
137             do not want to be extracted.
138              
139             The filter should be provided as a subroutine reference. This sub will
140             be called with $_ set to the path of the file. The subroutine should
141             return true if the file is to be extracted, or false if not.
142              
143             # Extract all .pm files, except those in an include directory
144             extract_filter => sub { /\.pm$/ and ! /\binc\b/ },
145              
146             =item extract_check
147              
148             The main extraction process is done as each new archive is downloaded,
149             but occasionally in a process this long-running something may go wrong
150             and you can end up with archives not extracted.
151              
152             In addition, sometimes the processing of the extracted archives is
153             destructive and will result in them being deleted each run.
154              
155             Once the mirror update has been completed, the C keyword
156             forces the processor to go back over every tarball in the mirror and
157             double check that it has a corrosponding extracted directory.
158              
159             =item extract_force
160              
161             For cases in which the filter has been changed, the C
162             boolean flag can be used to forcefully delete and re-extract every
163             extracted directory.
164              
165             =back
166              
167             Returns a new C object, or dies on error.
168              
169             =cut
170              
171             sub new {
172 3     3 1 531563 my $class = shift;
173              
174             # Use the CPAN::Mini settings as defaults, and add any
175             # additional explicit params.
176 3         39 my %config = ( CPAN::Mini->read_config, @_ );
177              
178             # Unless provided auto-detect offline mode
179 3 100       613 unless ( defined $config{offline} ) {
180 1         6 $config{offline} = LWP::Online::offline();
181             }
182              
183             # Fake a remote URI if CPAN::Mini can't handle offline mode
184 3         222286 my %fake = ();
185 3 50 66     65 if ( $config{offline} and $CPAN::Mini::VERSION < 0.570 ) {
186 0         0 my $tempdir = File::Temp::tempdir();
187 0         0 my $tempuri = URI::file->new( $tempdir )->as_string;
188 0         0 $fake{remote} = $tempuri;
189             }
190              
191             # Use a default local path if none provided
192 3 50       14 unless ( defined $config{local} ) {
193 0         0 my $local = File::Spec->catdir(
194             File::HomeDir->my_data, 'minicpan',
195             );
196             }
197              
198             # Call our superclass to create the object
199 3         71 my $self = $class->SUPER::new( %config, %fake );
200              
201             # Check the extract param
202 3 50       412546 $self->{extract} or Carp::croak(
203             "Did not provide an 'extract' path"
204             );
205 3 50       77 if ( -e $self->{extract} ) {
206 0 0 0     0 unless ( -d _ and -w _ ) {
207 0         0 Carp::croak(
208             "The 'extract' path is not a writable directory"
209             );
210             }
211             } else {
212 3 50       898 File::Path::mkpath( $self->{extract}, $self->{trace}, $self->{dirmode} )
213             or Carp::croak("The 'extract' path could not be created");
214             }
215              
216             # Set defaults and apply rules
217 3 50       104 unless ( defined $self->{extract_check} ) {
218 3         15 $self->{extract_check} = 1;
219             }
220 3 100       16 if ( $self->{extract_force} ) {
221 1         13 $self->{extract_check} = 1;
222             }
223              
224             # Compile file_filters if needed
225 3         26 $self->_compile_filter('extract_filter');
226              
227             # We'll need a temp directory for expansions
228 3         31 $self->{tempdir} = File::Temp::tempdir( CLEANUP => 1 );
229              
230 3         5555 $self;
231             }
232              
233              
234              
235              
236              
237             #####################################################################
238             # Main Methods
239              
240             =pod
241              
242             =head2 run
243              
244             The C methods starts the main process, updating the minicpan mirror
245             and extracted version, and then launching the PPI Processor to process the
246             files in the source directory.
247              
248             Returns the number of changes made to the local minicpan and extracted
249             directories, or dies on error.
250              
251             =cut
252              
253             sub run {
254 0     0 1 0 my $self = shift;
255              
256             # Prepare to start
257 0         0 local $| = 1;
258 0         0 my $changes;
259 0         0 $self->{added} = {};
260 0         0 $self->{cleaned} = {};
261              
262             # If we want to force re-expansion,
263             # remove all current expansion dirs.
264 0 0       0 if ( $self->{extract_force} ) {
265 0         0 $self->log("Flushing all expansion directories (extract_force enabled)\n");
266 0         0 my $authors_dir = File::Spec->catfile( $self->{extract}, 'authors' );
267 0 0       0 if ( -e $authors_dir ) {
268 0         0 $self->log("Removing $authors_dir...");
269 0 0       0 File::Remove::remove( \1, $authors_dir ) or Carp::croak(
270             "Failed to remove previous expansion directory '$authors_dir'"
271             );
272 0         0 $self->log(" removed\n");
273             }
274             }
275              
276             # Update the CPAN::Mini local mirror
277 0 0 0     0 if ( $self->{offline} and $CPAN::Mini::VERSION < 0.570 ) {
278 0         0 $self->log("Skipping minicpan update (offline mode enabled)\n");
279             } else {
280 0         0 $self->log("Updating minicpan local mirror...\n");
281 0         0 $self->update_mirror;
282             }
283              
284 0   0     0 $changes ||= 0;
285 0 0 0     0 if ( $self->{extract_check} or $self->{extract_force} ) {
286             # Expansion checking is enabled, and we didn't do a normal
287             # forced check, so find the full list of files to check.
288 0         0 $self->log("Tarball expansion checking enabled\n");
289 0         0 my @files = File::Find::Rule->new
290             ->name('*.tar.gz')
291             ->file
292             ->relative
293             ->in( $self->{local} );
294              
295             # Filter to just those we need to extract
296 0         0 $self->log("Checking " . scalar(@files) . " tarballs\n");
297 0         0 @files = grep { ! -d File::Spec->catfile( $self->{extract}, $_ ) } @files;
  0         0  
298 0 0       0 if ( @files ) {
299 0         0 $self->log("Scheduling " . scalar(@files) . " tarballs for expansion\n");
300             } else {
301 0         0 $self->log("No tarballs need to be extracted\n");
302             }
303              
304             # Expand each of the tarballs
305 0         0 foreach my $file ( sort @files ) {
306 0         0 $self->mirror_extract( $file );
307 0         0 $changes++;
308             }
309             }
310              
311 0         0 $self->log("Completed minicpan extraction\n");
312 0         0 $changes;
313             }
314              
315              
316              
317              
318              
319             #####################################################################
320             # CPAN::Mini Methods
321              
322             # Track what we have added
323             sub mirror_file {
324 0     0 1 0 my $self = shift;
325 0         0 my $file = shift;
326              
327             # Do the normal stuff
328 0         0 my $rv = $self->SUPER::mirror_file($file, @_);
329              
330             # Expand the tarball if needed
331 0 0       0 unless ( -d File::Spec->catfile( $self->{extract}, $file ) ) {
332 0         0 $self->{current_file} = $file;
333 0 0       0 $self->mirror_extract( $file ) or return undef;
334 0         0 delete $self->{current_file};
335             }
336              
337 0         0 $self->{added}->{$file} = 1;
338 0         0 delete $self->{current_file};
339 0         0 $rv;
340             }
341              
342             sub mirror_extract {
343 0     0 0 0 my ($self, $file) = @_;
344              
345             # Don't try to extract anything other than normal tarballs for now.
346 0 0       0 return 1 unless $file =~ /\.t(ar\.)?gz$/;
347              
348             # Extract the new file to the matching directory in
349             # the processor source directory.
350 0         0 my $local_file = File::Spec->catfile( $self->{local}, $file );
351 0         0 my $extract_dir = File::Spec->catfile( $self->{extract}, $file );
352              
353             # Do the actual extraction
354 0         0 $self->_extract_archive( $local_file, $extract_dir );
355             }
356              
357             # Also remove any processing directory.
358             # And track what we have removed.
359             sub clean_file {
360 0     0 1 0 my $self = shift;
361 0         0 my $file = shift; # Absolute path
362              
363             # Convert to relative path, and clear the expansion directory
364 0         0 my $relative = File::Spec->abs2rel( $file, $self->{local} );
365 0         0 $self->clean_extract( $relative );
366              
367             # We are doing this in the reverse order to when we created it.
368 0         0 my $rv = $self->SUPER::clean_file($file, @_);
369              
370 0         0 $self->{cleaned}->{$file} = 1;
371 0         0 $rv;
372             }
373              
374             # Remove a processing directory
375             sub clean_extract {
376 0     0 0 0 my ($self, $file) = @_;
377              
378             # Remove the source directory, if it exists
379 0         0 my $source_path = File::Spec->catfile( $self->{extract}, $file );
380 0 0       0 if ( -e $source_path ) {
381 0 0       0 File::Remove::remove( \1, $source_path ) or Carp::carp(
382             "Cannot remove $source_path $!"
383             );
384             }
385              
386 0         0 1;
387             }
388              
389              
390              
391              
392              
393             #####################################################################
394             # Support Methods and Error Handling
395              
396             # Compile a set of filters
397             sub _compile_filter {
398 3     3   7 my $self = shift;
399 3         12 my $name = shift;
400              
401             # Shortcut for "no filters"
402 3 100       15 return 1 unless $self->{$name};
403              
404             # If the filter is already a code ref, shortcut
405 2 50       132 return 1 if Params::Util::_CODELIKE($self->{$name});
406              
407             # Allow a single Regexp object for the filter
408 0 0       0 if ( Params::Util::_INSTANCE($self->{$name}, 'Regexp') ) {
409 0         0 $self->{$name} = [ $self->{$name} ];
410             }
411              
412             # Check for bad cases
413 0 0       0 Params::Util::_ARRAY0($self->{$name}) or Carp::croak(
414             "$name is not an ARRAY reference"
415             );
416 0 0       0 unless ( @{$self->{$name}} ) {
  0         0  
417 0         0 delete $self->{$name};
418 0         0 return 1;
419             }
420              
421             # Check we only got Regexp objects
422 0         0 my @filters = @{$self->{$name}};
  0         0  
423 0 0       0 if ( scalar grep { ! Params::Util::_INSTANCE($_, 'Regexp') } @filters ) {
  0         0  
424 0         0 return $self->_error("$name can only contains Regexp filters");
425             }
426              
427             # Build the anonymous sub
428             $self->{$name} = sub {
429 0     0   0 foreach my $regexp ( @filters ) {
430 0 0       0 return 1 if $_ =~ $regexp;
431             }
432 0         0 return '';
433 0         0 };
434              
435 0         0 1;
436             }
437              
438             # Encapsulate the actual extraction mechanism
439             sub _extract_archive {
440 1     1   1057 my ($self, $gz, $to) = @_;
441              
442             # Do a one-shot separate decompression because for some reason
443             # the default on-the-fly decompression is horridly memory
444             # innefficientm, allocating and freeing massive blocks of memory
445             # for every single block that gets read in.
446 1         6 my $archive = $self->_extract_gz( $gz );
447              
448             # IO::Zlib::tell will cause problems and Archive::Tar
449             # tries to use it by default, so invoke it with a
450             # file handle to MAKE it do the right thing.
451 1 50       11 my $io = IO::File->new( $archive, "r" )
452             or die "Failed to open $archive";
453              
454             # Some hints to Archive::Tar to make it behave to make it
455             # work better on Win32, and to ignore the ownership crap
456             # that we don't care about.
457 1         137 local $Archive::Tar::WARN = 0;
458 1         4 local $Archive::Tar::CHOWN = 0;
459 1         3 local $Archive::Tar::CHMOD = 0;
460              
461             # Load the archive
462 1         3 my $tar = eval {
463 1         12 Archive::Tar->new( $io );
464             };
465 1 50 33     2400 if ( $@ or ! $tar ) {
466 0         0 return $self->_tar_error("Loading of $archive failed");
467             }
468              
469             # Get the complete list of files
470 1         4 my @files = eval {
471 1         8 $tar->list_files( [ 'name', 'size' ] )
472             };
473 1 50       137 return $self->_tar_error("Loading of $archive failed") if $@;
474              
475             # Filter to get just the ones we want
476 1         4 @files = map { $_->{name} } grep { $_->{size} } @files;
  2         10  
  5         11  
477 1 50       7 if ( $self->{extract_filter} ) {
478 1         4 @files = grep &{$self->{extract_filter}}, @files;
  2         25  
479             }
480              
481             # Iterate and extract each file
482 1         292 File::Path::mkpath( $to, $self->{trace}, $self->{dirmode} );
483 1         5 foreach my $wanted ( sort @files ) {
484             # Where to extract to
485 1         28 my $to_file = File::Spec->catfile( $to, $wanted );
486 1         40 my $to_dir = File::Basename::dirname( $to_file );
487 1         179 File::Path::mkpath( $to_dir, $self->{trace}, $self->{dirmode} );
488 1         24 $self->log("write $to_file\n");
489              
490 1         142 my $rv;
491             SCOPE: {
492 1         5 $rv = eval {
  1         2  
493 1         8 $tar->extract_file( $wanted, $to_file );
494             };
495             }
496 1 50 33     6947 if ( $@ or ! $rv ) {
497             # There was an error during the extraction
498 0         0 $self->_tar_error( " ... failed" );
499 0 0       0 if ( -e $to_file ) {
500             # Remove any partial file left behind
501 0         0 File::Remove::remove( $to_file );
502             }
503 0         0 return 1;
504             }
505             }
506              
507             # Clean up
508 1         27 $tar->clear;
509 1         139 undef $tar;
510 1         54 $io->close;
511 1         35 File::Remove::remove( $archive );
512              
513 1         340 return 1;
514             }
515              
516             # Extract a gz-compressed file to a temp file
517             my $counter = 0;
518             sub _extract_gz {
519 1     1   2 my $self = shift;
520 1         3 my $gz = shift;
521 1         4 my $tar = ++$counter . '.tar';
522 1         13 my $file = File::Spec->catfile(
523             $self->{tempdir}, $tar,
524             );
525 1 50       10 IO::Uncompress::Gunzip::gunzip( $gz => $file )
526             or die "Failed to uncompress $gz";
527 1         4923 return $file;
528             }
529              
530             sub _tar_error {
531 0     0     my $self = shift;
532              
533             # Get and clean up the message
534 0           my $message = shift;
535 0 0 0       if ( ! $message and $self->{current_file} ) {
536 0           $message = "Expansion of $self->{current_file} failed";
537             }
538 0 0         if ( ! $message ) {
539 0           $message = "Expansion of file failed";
540             }
541 0 0         $message .= " (Archive::Tar warning)" if $@ =~ /Archive::Tar warning/;
542 0           $message .= "\n";
543              
544 0           $self->log($message);
545             }
546              
547             1;
548              
549             =pod
550              
551             =head1 SUPPORT
552              
553             Bugs should always be submitted via the CPAN bug tracker
554              
555             L
556              
557             For other issues, contact the maintainer
558              
559             =head1 AUTHOR
560              
561             Adam Kennedy Eadamk@cpan.orgE
562              
563             =head1 SEE ALSO
564              
565             L
566              
567             =head1 COPYRIGHT
568              
569             Funding provided by The Perl Foundation.
570              
571             Copyright 2005 - 2012 Adam Kennedy.
572              
573             This program is free software; you can redistribute
574             it and/or modify it under the same terms as Perl itself.
575              
576             The full text of the license can be found in the
577             LICENSE file included with this module.
578              
579             =cut