File Coverage

blib/lib/CPAN/Visitor.pm
Criterion Covered Total %
statement 39 134 29.1
branch 0 52 0.0
condition 0 15 0.0
subroutine 13 27 48.1
pod 2 3 66.6
total 54 231 23.3


line stmt bran cond sub pod time code
1 1     1   713 use 5.006;
  1         2  
  1         27  
2 1     1   4 use strict;
  1         1  
  1         26  
3 1     1   2 use warnings;
  1         1  
  1         44  
4             package CPAN::Visitor;
5             # ABSTRACT: Generic traversal of distributions in a CPAN repository
6              
7             our $VERSION = '0.004';
8              
9 1     1   1810 use autodie;
  1         12429  
  1         9  
10              
11 1     1   5468 use Archive::Extract 0.34 ();
  1         110819  
  1         27  
12 1     1   7 use File::Find ();
  1         1  
  1         14  
13 1     1   415 use File::pushd 1.00 ();
  1         9070  
  1         25  
14 1     1   7 use File::Temp 0.20 ();
  1         15  
  1         14  
15 1     1   372 use Path::Class 0.17 ();
  1         16836  
  1         23  
16 1     1   554 use Parallel::ForkManager 0.007005 ();
  1         4334  
  1         21  
17              
18 1     1   542 use Moose 0.93 ;
  1         288736  
  1         6  
19 1     1   5169 use MooseX::Params::Validate 0.13;
  1         56694  
  1         7  
20 1     1   785 use namespace::autoclean 0.09 ;
  1         915  
  1         4  
21              
22             has 'cpan' => ( is => 'ro', required => 1 );
23             has 'quiet' => ( is => 'rw', default => 1 );
24             has 'stash' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
25             has 'files' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
26              
27             sub BUILD {
28 0     0 0   my $self = shift;
29 0 0 0       unless (
30             -d $self->cpan &&
31             -d Path::Class::dir($self->cpan, 'authors', 'id')
32             ) {
33 0           die "'cpan' parameter must be the root of a CPAN repository";
34             }
35             }
36              
37             #--------------------------------------------------------------------------#
38             # selection methods
39             #--------------------------------------------------------------------------#
40              
41             my $archive_re = qr{\.(?:tar\.(?:bz2|gz|Z)|t(?:gz|bz)|zip)$}i;
42              
43             sub select {
44 0     0 1   my ($self, %params) = validated_hash( \@_,
45             match => { isa => 'RegexpRef | ArrayRef[RegexpRef]', default => [qr/./] },
46             exclude => { isa => 'RegexpRef | ArrayRef[RegexpRef]', default => [] },
47             subtrees => { isa => 'Str | ArrayRef[Str]', default => [] },
48             all_files => { isa => 'Bool', default => 0 },
49             append => { isa => 'Bool', default => 0 },
50             );
51              
52             # normalize to arrayrefs
53 0           for my $k ( qw/match exclude subtrees/ ) {
54 0 0         next unless exists $params{$k};
55 0 0 0       next if ref $params{$k} && ref $params{$k} eq 'ARRAY';
56 0           $params{$k} = [ $params{$k} ];
57             }
58              
59             # determine search dirs
60 0           my $id_dir = Path::Class::dir($self->cpan, qw/authors id/);
61 0           my @search_dirs = map { $id_dir->subdir($_)->stringify } @{$params{subtrees}};
  0            
  0            
62 0 0         @search_dirs = $id_dir->stringify if ! @search_dirs;
63              
64             # perform search
65 0           my @found;
66             File::Find::find(
67             {
68             no_chdir => 1,
69             follow => 0,
70 0     0     preprocess => sub { my @files = sort @_; return @files },
  0            
71             wanted => sub {
72 0 0   0     return unless -f;
73 0 0 0       return unless $params{all_files} || /$archive_re/;
74 0           for my $re ( @{$params{exclude}} ) {
  0            
75 0 0         return if /$re/;
76             }
77 0           for my $re ( @{$params{match}} ) {
  0            
78 0 0         return if ! /$re/;
79             }
80 0           (my $f = Path::Class::file($_)->relative($id_dir)) =~ s{./../}{};
81 0           push @found, $f;
82             }
83             },
84 0           @search_dirs,
85             );
86              
87 0 0         if ( $params{append} ) {
88 0           push @{$self->files}, @found;
  0            
89             }
90             else {
91 0           @{$self->files} = @found;
  0            
92             }
93 0           return scalar @found;
94             }
95              
96             #--------------------------------------------------------------------------#
97             # default actions
98             #
99             # These are passed a "job" hashref. It is initialized with the following
100             # fields:
101             #
102             # distfile -- e.g. DAGOLDEN/CPAN-Visitor-0.001.tar.gz
103             # distpath -- e.g. /my/cpan/authors/id/D/DA/DAGOLDEN/CPAN-Visitor-0.001.tar.gz
104             # tempdir -- File::Temp directory object for extraction or other things
105             # stash -- the 'stash' hashref from the Visitor object
106             # quiet -- the 'quiet' flag from the Visitor object
107             # result -- an empty hashref to start; the return values from each
108             # action are added and may be referenced by subsequent actions
109             #
110             # E.g. the return value from 'extract' is the directory:
111             #
112             # $job->{result}{extract} = $unpacked_directory;
113             #
114             #--------------------------------------------------------------------------#
115              
116 0     0     sub _check { 1 } # always proceed
117              
118 0     0     sub _start { 1 } # no special start action
119              
120             # _extract returns the proper directory to chdir into
121             # if the $job->{stash}{prefer_bin} is true, it will tell Archive::Extract
122             # to use binaries
123             sub _extract {
124 0     0     my $job = shift;
125 0           local $Archive::Extract::DEBUG = 0;
126 0 0         local $Archive::Extract::PREFER_BIN = $job->{stash}{prefer_bin} ? 1 : 0;
127 0 0         local $Archive::Extract::WARN = $job->{quiet} ? 0 : 1;
128              
129             # cd to tmpdir for duration of this sub
130 0           my $pushd = File::pushd::pushd( $job->{tempdir} );
131              
132 0           my $ae = Archive::Extract->new( archive => $job->{distpath} );
133              
134 0           my $olderr;
135              
136             # stderr > /dev/null if quiet
137 0 0         if ( ! $Archive::Extract::WARN ) {
138 0           open $olderr, ">&STDERR";
139 0           open STDERR, ">", File::Spec->devnull;
140             }
141              
142 0           my $extract_ok = $ae->extract;
143              
144             # restore stderr if quiet
145 0 0         if ( ! $Archive::Extract::WARN ) {
146 0           open STDERR, ">&", $olderr;
147 0           close $olderr;
148             }
149              
150 0 0         if ( ! $extract_ok ) {
151 0 0         warn "Couldn't extract '$job->{distpath}'\n" if $Archive::Extract::WARN;
152 0           return;
153             }
154              
155             # most distributions unpack a single directory that we must enter
156             # but some behave poorly and unpack to the current directory
157 0           my @children = Path::Class::dir()->children;
158 0 0 0       if ( @children == 1 && -d $children[0] ) {
159 0           return Path::Class::dir($job->{tempdir}, $children[0])->absolute->stringify;
160             }
161             else {
162 0           return Path::Class::dir($job->{tempdir})->absolute->stringify;
163             }
164             }
165              
166             sub _enter {
167 0     0     my $job = shift;
168 0           my $curdir = Path::Class::dir()->absolute;
169 0 0         my $target_dir = $job->{result}{extract} or return;
170 0 0         if ( -d $target_dir ) {
171 0 0         chmod 0755, $target_dir unless -x $target_dir;
172 0           chdir $target_dir;
173             }
174             else {
175 0 0         warn "Can't chdir to directory '$target_dir'\n"
176             unless $job->{quiet};
177 0           return;
178             }
179 0           return $curdir;
180             }
181              
182 0     0     sub _visit { 1 } # do nothing
183              
184             # chdir out and clean up
185             sub _leave {
186 0     0     my $job = shift;
187 0           chdir $job->{result}{enter};
188 0           return 1;
189             }
190              
191 0     0     sub _finish { 1 } # no special finish action
192              
193             #--------------------------------------------------------------------------#
194             # iteration methods
195             #--------------------------------------------------------------------------#
196              
197             # iterate()
198             #
199             # Arguments:
200             #
201             # jobs -- if greater than 1, distributions are processed in parallel
202             # via Parallel::ForkManager
203             #
204             # iterate() takes several optional callbacks which are run in the following
205             # order. Callbacks get a single hashref argument as described above under
206             # default actions.
207             #
208             # check -- whether the distribution should be processed; goes to next file
209             # if false; default is always true
210             #
211             # start -- used for any setup, logging, etc; default does nothing
212             #
213             # extract -- extracts a distribution into a temp directory or otherwise
214             # prepares for visiting; skips to finish action if it returns
215             # a false value; default returns the path to the extracted
216             # directory
217             #
218             # enter -- skips to the finish action if it returns false; default takes
219             # the result of extract, chdir's into it, and returns the
220             # original directory
221             #
222             # visit -- examine the distribution or otherwise do stuff; the default
223             # does nothing;
224             #
225             # leave -- default returns to the original directory (the result of enter)
226             #
227             # finish -- any teardown processing, logging, etc.
228              
229             sub iterate {
230 0     0 1   my ($self, %params) = validated_hash( \@_,
231             jobs => { isa => 'Int', default => 0 },
232             check => { isa => 'CodeRef', default => \&_check },
233             start => { isa => 'CodeRef', default => \&_start },
234             extract => { isa => 'CodeRef', default => \&_extract },
235             enter => { isa => 'CodeRef', default => \&_enter },
236             visit => { isa => 'CodeRef', default => \&_visit },
237             leave => { isa => 'CodeRef', default => \&_leave },
238             finish => { isa => 'CodeRef', default => \&_finish },
239             );
240              
241 0 0         my $pm = Parallel::ForkManager->new( $params{jobs} > 1 ? $params{jobs} : 0 );
242 0           for my $distfile ( @{ $self->files } ) {
  0            
243 0 0         $pm->start and next;
244 0           eval { $self->_iterate($distfile, \%params) };
  0            
245 0 0 0       warn "Error visiting $distfile: $@\n" if $@ && ! $self->quiet;
246 0           $pm->finish;
247             }
248 0           $pm->wait_all_children;
249 0           return 1;
250             }
251              
252             sub _iterate {
253 0     0     my ($self, $distfile, $params) = @_;
254 0           my $job = {
255             distfile => $distfile,
256             distpath => $self->_fullpath($distfile),
257             tempdir => File::Temp->newdir(),
258             stash => $self->stash,
259             quiet => $self->quiet,
260             result => {},
261             };
262 0 0         $job->{result}{check} = $params->{check}->($job) or return;
263 0           $job->{result}{start} = $params->{start}->($job);
264 0 0         ACTION: {
265 0           $job->{result}{extract} = $params->{extract}->($job) or last ACTION;
266 0 0         $job->{result}{enter} = $params->{enter}->($job) or last ACTION;
267 0           $job->{result}{visit} = $params->{visit}->($job);
268 0           $job->{result}{leave} = $params->{leave}->($job);
269             }
270 0           $params->{finish}->($job);
271 0           return;
272             }
273              
274             sub _fullpath {
275 0     0     my ($self, $distfile) = @_;
276 0           my ($two, $one) = $distfile =~ /\A((.).)/;
277 0           return Path::Class::file(
278             $self->cpan, 'authors', 'id', $one, $two, $distfile
279             )->absolute->stringify;
280             }
281              
282             __PACKAGE__->meta->make_immutable;
283              
284             1;
285              
286             __END__
287              
288             =pod
289              
290             =encoding UTF-8
291              
292             =head1 NAME
293              
294             CPAN::Visitor - Generic traversal of distributions in a CPAN repository
295              
296             =head1 VERSION
297              
298             version 0.004
299              
300             =head1 SYNOPSIS
301              
302             use CPAN::Visitor;
303             my $visitor = CPAN::Visitor->new( cpan => "/path/to/cpan" );
304              
305             # Prepare to visit all distributions
306             $visitor->select();
307              
308             # Or a subset of distributions
309             $visitor->select(
310             subtrees => [ 'D/DA', 'A/AD' ], # relative to authors/id/
311             exclude => qr{/Acme-}, # No Acme- dists
312             match => qr{/Test-} # Only Test- dists
313             );
314              
315             # Action is specified via a callback
316             $visitor->iterate(
317             visit => sub {
318             my $job = shift;
319             print $job->{distfile} if -f 'Build.PL'
320             }
321             );
322              
323             # Or start with a list of files
324             $visitor = CPAN::Visitor->new(
325             cpan => "/path/to/cpan",
326             files => \@distfiles, # e.g. ANDK/CPAN-1.94.tar.gz
327             );
328             $visitor->iterate( visit => \&callback );
329              
330             # Iterate in parallel
331             $visitor->iterate( visit => \&callback, jobs => 5 );
332              
333             =head1 DESCRIPTION
334              
335             A very generic, callback-driven program to iterate over a CPAN repository.
336              
337             Needs better documentation and tests, but is provided for others to examine,
338             use or contribute to.
339              
340             =head1 USAGE
341              
342             =head2 new
343              
344             my $visitor = CPAN::Visitor->new( @args );
345              
346             Object attributes include:
347              
348             =over 4
349              
350             =item *
351              
352             C<cpan> — path to CPAN or mini CPAN repository. Required.
353              
354             =item *
355              
356             C<quiet> — whether warnings should be silenced (e.g. from extraction). Optional.
357              
358             =item *
359              
360             C<stash> — hash-ref of user-data to be made available during iteration. Optional.
361              
362             =item *
363              
364             C<files> — array-ref with a pre-selection of of distribution files. These must be in AUTHOR/NAME.suffix format. Optional.
365              
366             =back
367              
368             =head2 select
369              
370             $visitor->select( @args );
371              
372             Valid arguments include:
373              
374             =over 4
375              
376             =item *
377              
378             C<subtrees> — path or array-ref of paths to search. These must be relative to the 'authors/id/' directory within a CPAN repo. If given, only files within those subtrees will be considered. If not specified, the entire 'authors/id' tree is searched.
379              
380             =item *
381              
382             C<exclude> — qr() or array-ref of qr() patterns. If a path matches *any* pattern, it is excluded
383              
384             =item *
385              
386             C<match> — qr() or array-ref of qr() patterns. If an array-ref is provided, only paths that match *all* patterns are included
387              
388             =item *
389              
390             all_files — boolean that determines whether all files or only files that have a distribution archive suffix are selected. Default is false.
391              
392             =item *
393              
394             append — boolean that determines whether the selected files should be appended to previously selected files. The default is false, which replaces any previous selection
395              
396             =back
397              
398             The C<select> method returns a count of files selected.
399              
400             =head2 iterate
401              
402             $visitor->iterate( @args );
403              
404             Valid arguments include:
405              
406             =over 4
407              
408             =item *
409              
410             C<jobs> — non-negative integer specifying the maximum number of forked processes. Defaults to none.
411              
412             =item *
413              
414             C<check> — code reference callback
415              
416             =item *
417              
418             C<start> — code reference callback
419              
420             =item *
421              
422             C<extract> — code reference callback
423              
424             =item *
425              
426             C<enter> — code reference callback
427              
428             =item *
429              
430             C<visit> — code reference callback
431              
432             =item *
433              
434             C<leave> — code reference callback
435              
436             =item *
437              
438             C<finish> — code reference callback
439              
440             =back
441              
442             See L</ACTION CALLBACKS> for more. Generally, you only need to provide the
443             C<visit> callback, which is called from inside the unpacked distribution
444             directory.
445              
446             The C<iterate> method always returns true.
447              
448             =for Pod::Coverage BUILD
449              
450             =head1 ACTION CALLBACKS
451              
452             Each selected distribution is processed with a series of callback
453             functions. These are each passed a hash-ref with information about
454             the particular distribution being processed.
455              
456             sub _my_visit {
457             my $job = shift;
458             # do stuff
459             }
460              
461             The job hash-ref is initialized with the following fields:
462              
463             =over 4
464              
465             =item *
466              
467             C<distfile> — the unique, short CPAN distfile name, e.g. DAGOLDEN/CPAN-Visitor-0.001.tar.gz
468              
469             =item *
470              
471             C<distpath> — the absolute path the distribution archive, e.g. /my/cpan/authors/id/D/DA/DAGOLDEN/CPAN-Visitor-0.001.tar.gz
472              
473             =item *
474              
475             C<tempdir> — a File::Temp directory object for extraction or other things
476              
477             =item *
478              
479             C<stash> — the 'stash' hashref from the Visitor object
480              
481             =item *
482              
483             C<quiet> — the 'quiet' flag from the Visitor object
484              
485             =item *
486              
487             C<result> — an empty hashref to start; the return values from each action are added and may be referenced by subsequent actions
488              
489             =back
490              
491             The C<result> field is used to accumulate the return values from action
492             callbacks. For example, the return value from the default 'extract' action is
493             the unpacked distribution directory:
494              
495             $job->{result}{extract} # distribution directory path
496              
497             You do not need to store the results yourself — the C<iterate> method
498             takes care of it for you.
499              
500             Callbacks occur in the following order. Some callbacks skip further
501             processing if the return value is false.
502              
503             =over 4
504              
505             =item *
506              
507             C<check> — determines whether the distribution should be processed; goes to next file if false; default is always true
508              
509             =item *
510              
511             C<start> — used for any setup, logging, etc; default does nothing
512              
513             =item *
514              
515             C<extract> — operate on the tarball to prepare for visiting; skips to finish action if it returns a false value; the default extracts a distribution into a temp directory and returns the path to the extracted directory; if the C<stash> has a true value for C<prefer_bin>, binary tar, etc. will be preferred. This is faster, but less portable.
516              
517             =item *
518              
519             C<enter> — skips to the finish action if it returns false; default takes the result of extract, chdir's into it, and returns the original directory; if the extract result is missing the +x permissions, this will attempt to add it before calling chdir.
520              
521             =item *
522              
523             C<visit> — examine the distribution or otherwise do stuff; the default does nothing;
524              
525             =item *
526              
527             C<leave> — default returns to the original directory (the result of enter)
528              
529             =item *
530              
531             C<finish> — any teardown processing, logging, etc.
532              
533             =back
534              
535             These allow complete customization of the iteration process. For example,
536             one could do something like this:
537              
538             =over 4
539              
540             =item *
541              
542             replace the default C<extract> callback with one that returns an arrayref of distribution files without actually unpacking it into a physical directory
543              
544             =item *
545              
546             replace the default C<enter> callback with one that does nothing but return a true value; replace the default C<leave> callback likewise
547              
548             =item *
549              
550             have the C<visit> callback get the C<< $job->{result}{extract} >> listing and examine it for the presence of certain files
551              
552             =back
553              
554             This could potentially speed up iteration if only the file names within
555             the distribution are of interest and not the contents of the actual files.
556              
557             =head1 SEE ALSO
558              
559             =over 4
560              
561             =item *
562              
563             L<App::CPAN::Mini::Visit>
564              
565             =item *
566              
567             L<CPAN::Mini::Visit>
568              
569             =back
570              
571             =for :stopwords cpan testmatrix url annocpan anno bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan
572              
573             =head1 SUPPORT
574              
575             =head2 Bugs / Feature Requests
576              
577             Please report any bugs or feature requests through the issue tracker
578             at L<https://github.com/dagolden/cpan-visitor/issues>.
579             You will be notified automatically of any progress on your issue.
580              
581             =head2 Source Code
582              
583             This is open source software. The code repository is available for
584             public review and contribution under the terms of the license.
585              
586             L<https://github.com/dagolden/cpan-visitor>
587              
588             git clone https://github.com/dagolden/cpan-visitor.git
589              
590             =head1 AUTHOR
591              
592             David Golden <dagolden@cpan.org>
593              
594             =head1 COPYRIGHT AND LICENSE
595              
596             This software is Copyright (c) 2010 by David Golden.
597              
598             This is free software, licensed under:
599              
600             The Apache License, Version 2.0, January 2004
601              
602             =cut