File Coverage

blib/lib/CPAN/Visitor.pm
Criterion Covered Total %
statement 31 33 93.9
branch n/a
condition n/a
subroutine 11 11 100.0
pod n/a
total 42 44 95.4


line stmt bran cond sub pod time code
1 1     1   1571 use 5.006;
  1         4  
  1         45  
2 1     1   6 use strict;
  1         2  
  1         38  
3 1     1   6 use warnings;
  1         1  
  1         73  
4             package CPAN::Visitor;
5             # ABSTRACT: Generic traversal of distributions in a CPAN repository
6             our $VERSION = '0.003'; # VERSION
7              
8 1     1   952 use autodie;
  1         26732  
  1         10  
9              
10 1     1   7729 use Archive::Extract 0.34 ();
  1         249365  
  1         39  
11 1     1   11 use File::Find ();
  1         2  
  1         22  
12 1     1   975 use File::pushd 1.00 ();
  1         15797  
  1         33  
13 1     1   9 use File::Temp 0.20 ();
  1         22  
  1         18  
14 1     1   958 use Path::Class 0.17 ();
  1         20751  
  1         31  
15 1     1   1169 use Parallel::ForkManager 0.007005 ();
  1         7247  
  1         34  
16              
17 1     1   673 use Moose 0.93 ;
  0            
  0            
18             use MooseX::Params::Validate 0.13;
19             use namespace::autoclean 0.09 ;
20              
21             has 'cpan' => ( is => 'ro', required => 1 );
22             has 'quiet' => ( is => 'rw', default => 1 );
23             has 'stash' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
24             has 'files' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
25              
26             sub BUILD {
27             my $self = shift;
28             unless (
29             -d $self->cpan &&
30             -d Path::Class::dir($self->cpan, 'authors', 'id')
31             ) {
32             die "'cpan' parameter must be the root of a CPAN repository";
33             }
34             }
35              
36             #--------------------------------------------------------------------------#
37             # selection methods
38             #--------------------------------------------------------------------------#
39              
40             my $archive_re = qr{\.(?:tar\.(?:bz2|gz|Z)|t(?:gz|bz)|zip)$}i;
41              
42             sub select {
43             my ($self, %params) = validated_hash( \@_,
44             match => { isa => 'RegexpRef | ArrayRef[RegexpRef]', default => [qr/./] },
45             exclude => { isa => 'RegexpRef | ArrayRef[RegexpRef]', default => [] },
46             subtrees => { isa => 'Str | ArrayRef[Str]', default => [] },
47             all_files => { isa => 'Bool', default => 0 },
48             append => { isa => 'Bool', default => 0 },
49             );
50              
51             # normalize to arrayrefs
52             for my $k ( qw/match exclude subtrees/ ) {
53             next unless exists $params{$k};
54             next if ref $params{$k} && ref $params{$k} eq 'ARRAY';
55             $params{$k} = [ $params{$k} ];
56             }
57              
58             # determine search dirs
59             my $id_dir = Path::Class::dir($self->cpan, qw/authors id/);
60             my @search_dirs = map { $id_dir->subdir($_)->stringify } @{$params{subtrees}};
61             @search_dirs = $id_dir->stringify if ! @search_dirs;
62              
63             # perform search
64             my @found;
65             File::Find::find(
66             {
67             no_chdir => 1,
68             follow => 0,
69             preprocess => sub { my @files = sort @_; return @files },
70             wanted => sub {
71             return unless -f;
72             return unless $params{all_files} || /$archive_re/;
73             for my $re ( @{$params{exclude}} ) {
74             return if /$re/;
75             }
76             for my $re ( @{$params{match}} ) {
77             return if ! /$re/;
78             }
79             (my $f = Path::Class::file($_)->relative($id_dir)) =~ s{./../}{};
80             push @found, $f;
81             }
82             },
83             @search_dirs,
84             );
85              
86             if ( $params{append} ) {
87             push @{$self->files}, @found;
88             }
89             else {
90             @{$self->files} = @found;
91             }
92             return scalar @found;
93             }
94              
95             #--------------------------------------------------------------------------#
96             # default actions
97             #
98             # These are passed a "job" hashref. It is initialized with the following
99             # fields:
100             #
101             # distfile -- e.g. DAGOLDEN/CPAN-Visitor-0.001.tar.gz
102             # distpath -- e.g. /my/cpan/authors/id/D/DA/DAGOLDEN/CPAN-Visitor-0.001.tar.gz
103             # tempdir -- File::Temp directory object for extraction or other things
104             # stash -- the 'stash' hashref from the Visitor object
105             # quiet -- the 'quiet' flag from the Visitor object
106             # result -- an empty hashref to start; the return values from each
107             # action are added and may be referenced by subsequent actions
108             #
109             # E.g. the return value from 'extract' is the directory:
110             #
111             # $job->{result}{extract} = $unpacked_directory;
112             #
113             #--------------------------------------------------------------------------#
114              
115             sub _check { 1 } # always proceed
116              
117             sub _start { 1 } # no special start action
118              
119             # _extract returns the proper directory to chdir into
120             # if the $job->{stash}{prefer_bin} is true, it will tell Archive::Extract
121             # to use binaries
122             sub _extract {
123             my $job = shift;
124             local $Archive::Extract::DEBUG = 0;
125             local $Archive::Extract::PREFER_BIN = $job->{stash}{prefer_bin} ? 1 : 0;
126             local $Archive::Extract::WARN = $job->{quiet} ? 0 : 1;
127              
128             # cd to tmpdir for duration of this sub
129             my $pushd = File::pushd::pushd( $job->{tempdir} );
130              
131             my $ae = Archive::Extract->new( archive => $job->{distpath} );
132              
133             my $olderr;
134              
135             # stderr > /dev/null if quiet
136             if ( ! $Archive::Extract::WARN ) {
137             open $olderr, ">&STDERR";
138             open STDERR, ">", File::Spec->devnull;
139             }
140              
141             my $extract_ok = $ae->extract;
142              
143             # restore stderr if quiet
144             if ( ! $Archive::Extract::WARN ) {
145             open STDERR, ">&", $olderr;
146             close $olderr;
147             }
148              
149             if ( ! $extract_ok ) {
150             warn "Couldn't extract '$job->{distpath}'\n" if $Archive::Extract::WARN;
151             return;
152             }
153              
154             # most distributions unpack a single directory that we must enter
155             # but some behave poorly and unpack to the current directory
156             my @children = Path::Class::dir()->children;
157             if ( @children == 1 && -d $children[0] ) {
158             return Path::Class::dir($job->{tempdir}, $children[0])->absolute->stringify;
159             }
160             else {
161             return Path::Class::dir($job->{tempdir})->absolute->stringify;
162             }
163             }
164              
165             sub _enter {
166             my $job = shift;
167             my $curdir = Path::Class::dir()->absolute;
168             my $target_dir = $job->{result}{extract} or return;
169             if ( -d $target_dir ) {
170             chdir $target_dir;
171             }
172             else {
173             warn "Can't chdir to non-existing directory '$target_dir'\n";
174             return;
175             }
176             return $curdir;
177             }
178              
179             sub _visit { 1 } # do nothing
180              
181             # chdir out and clean up
182             sub _leave {
183             my $job = shift;
184             chdir $job->{result}{enter};
185             return 1;
186             }
187              
188             sub _finish { 1 } # no special finish action
189              
190             #--------------------------------------------------------------------------#
191             # iteration methods
192             #--------------------------------------------------------------------------#
193              
194             # iterate()
195             #
196             # Arguments:
197             #
198             # jobs -- if greater than 1, distributions are processed in parallel
199             # via Parallel::ForkManager
200             #
201             # iterate() takes several optional callbacks which are run in the following
202             # order. Callbacks get a single hashref argument as described above under
203             # default actions.
204             #
205             # check -- whether the distribution should be processed; goes to next file
206             # if false; default is always true
207             #
208             # start -- used for any setup, logging, etc; default does nothing
209             #
210             # extract -- extracts a distribution into a temp directory or otherwise
211             # prepares for visiting; skips to finish action if it returns
212             # a false value; default returns the path to the extracted
213             # directory
214             #
215             # enter -- skips to the finish action if it returns false; default takes
216             # the result of extract, chdir's into it, and returns the
217             # original directory
218             #
219             # visit -- examine the distribution or otherwise do stuff; the default
220             # does nothing;
221             #
222             # leave -- default returns to the original directory (the result of enter)
223             #
224             # finish -- any teardown processing, logging, etc.
225              
226             sub iterate {
227             my ($self, %params) = validated_hash( \@_,
228             jobs => { isa => 'Int', default => 0 },
229             check => { isa => 'CodeRef', default => \&_check },
230             start => { isa => 'CodeRef', default => \&_start },
231             extract => { isa => 'CodeRef', default => \&_extract },
232             enter => { isa => 'CodeRef', default => \&_enter },
233             visit => { isa => 'CodeRef', default => \&_visit },
234             leave => { isa => 'CodeRef', default => \&_leave },
235             finish => { isa => 'CodeRef', default => \&_finish },
236             );
237              
238             my $pm = Parallel::ForkManager->new( $params{jobs} > 1 ? $params{jobs} : 0 );
239             for my $distfile ( @{ $self->files } ) {
240             $pm->start and next;
241             $self->_iterate($distfile, \%params);
242             $pm->finish;
243             }
244             $pm->wait_all_children;
245             return 1;
246             }
247              
248             sub _iterate {
249             my ($self, $distfile, $params) = @_;
250             my $job = {
251             distfile => $distfile,
252             distpath => $self->_fullpath($distfile),
253             tempdir => File::Temp->newdir(),
254             stash => $self->stash,
255             quiet => $self->quiet,
256             result => {},
257             };
258             $job->{result}{check} = $params->{check}->($job) or return;
259             $job->{result}{start} = $params->{start}->($job);
260             ACTION: {
261             $job->{result}{extract} = $params->{extract}->($job) or last ACTION;
262             $job->{result}{enter} = $params->{enter}->($job) or last ACTION;
263             $job->{result}{visit} = $params->{visit}->($job);
264             $job->{result}{leave} = $params->{leave}->($job);
265             }
266             $params->{finish}->($job);
267             return;
268             }
269              
270             sub _fullpath {
271             my ($self, $distfile) = @_;
272             my ($two, $one) = $distfile =~ /\A((.).)/;
273             return Path::Class::file(
274             $self->cpan, 'authors', 'id', $one, $two, $distfile
275             )->absolute->stringify;
276             }
277              
278             __PACKAGE__->meta->make_immutable;
279              
280             1;
281              
282              
283              
284             =pod
285              
286             =head1 NAME
287              
288             CPAN::Visitor - Generic traversal of distributions in a CPAN repository
289              
290             =head1 VERSION
291              
292             version 0.003
293              
294             =head1 SYNOPSIS
295              
296             use CPAN::Visitor;
297             my $visitor = CPAN::Visitor->new( cpan => "/path/to/cpan" );
298            
299             # Prepare to visit all distributions
300             $visitor->select();
301            
302             # Or a subset of distributions
303             $visitor->select(
304             subtrees => [ qr{D/DA}, qr{A/AD} ], # relative to authors/id/
305             exclude => qr{/Acme-}, # No Acme- dists
306             match => qr{/Test-} # Only Test- dists
307             );
308            
309             # Action is specified via a callback
310             $visitor->iterate(
311             visit => sub {
312             my $job = shift;
313             print $job->{distfile} if -f 'Build.PL'
314             }
315             );
316            
317             # Or start with a list of files
318             $visitor = CPAN::Visitor->new(
319             cpan => "/path/to/cpan",
320             files => \@distfiles, # e.g. ANDK/CPAN-1.94.tar.gz
321             );
322             $visitor->iterate( visit => \&callback );
323            
324             # Iterate in parallel
325             $visitor->iterate( visit => \&callback, jobs => 5 );
326              
327             =head1 DESCRIPTION
328              
329             A very generic, callback-driven program to iterate over a CPAN repository.
330              
331             Needs better documentation and tests, but is provided for others to examine,
332             use or contribute to.
333              
334             =for Pod::Coverage BUILD
335              
336             =head1 USAGE
337              
338             =head2 new
339              
340             my $visitor = CPAN::Visitor->new( @args );
341              
342             Object attributes include:
343              
344             =over
345              
346             =item *
347              
348             C<<< cpan >>> -- path to CPAN or mini CPAN repository. Required.
349              
350             =item *
351              
352             C<<< quiet >>> -- whether warnings should be silenced (e.g. from extraction). Optional.
353              
354             =item *
355              
356             C<<< stash >>> -- hash-ref of user-data to be made available during iteration. Optional.
357              
358             =item *
359              
360             C<<< files >>> -- array-ref with a pre-selection of of distribution files.
361             These must be in AUTHORE<sol>NAME.suffix format. Optional.
362              
363             =back
364              
365             =head2 select
366              
367             $visitor->select( @args );
368              
369             Valid arguments include:
370              
371             =over
372              
373             =item *
374              
375             C<<< subtrees >>> -- path or array-ref of paths. These must be relative to the
376             'authorsE<sol>idE<sol>' directory within a CPAN repo. If given, only files within
377             those subtrees will be considered. If not specified, the entire 'authorsE<sol>id'
378             tree is searched.
379              
380             =item *
381              
382             C<<< exclude >>> -- qr() or array-ref of qr() patterns. If a path matches B<any>
383             pattern, it is excluded
384              
385             =item *
386              
387             C<<< match >>> -- qr() or array-ref of qr() patterns. If an array-ref is provided,
388             only paths that match B<all> patterns are included
389              
390             =item *
391              
392             all_files -- boolean that determines whether all files or only files that have
393             a distribution archive suffix are selected. Default is false.
394              
395             =item *
396              
397             append -- boolean that determines whether the selected files should be
398             appended to previously selected files. The default is false, which replaces
399             any previous selection
400              
401             =back
402              
403             The C<<< select >>> method returns a count of files selected.
404              
405             =head2 iterate
406              
407             $visitor->iterate( @args );
408              
409             Valid arguments include:
410              
411             =over
412              
413             =item *
414              
415             C<<< jobs >>> -- non-negative integer specifying the maximum number of
416             forked processes. Defaults to none.
417              
418             =item *
419              
420             C<<< check >>> -- code reference callback
421              
422             =item *
423              
424             C<<< start >>> -- code reference callback
425              
426             =item *
427              
428             C<<< extract >>> -- code reference callback
429              
430             =item *
431              
432             C<<< enter >>> -- code reference callback
433              
434             =item *
435              
436             C<<< visit >>> -- code reference callback
437              
438             =item *
439              
440             C<<< leave >>> -- code reference callback
441              
442             =item *
443              
444             C<<< finish >>> -- code reference callback
445              
446             =back
447              
448             See L<ACTION CALLBACKS> for more. Generally, you only need to provide the
449             C<<< visit >>> callback, which is called from inside the unpacked distribution
450             directory.
451              
452             The C<<< iterate >>> method always returns true.
453              
454             =head1 ACTION CALLBACKS
455              
456             Each selected distribution is processed with a series of callback
457             functions. These are each passed a hash-ref with information about
458             the particular distribution being processed.
459              
460             sub _my_visit {
461             my $job = shift;
462             # do stuff
463             }
464              
465             The job hash-ref is initialized with the following fields:
466              
467             =over
468              
469             =item *
470              
471             C<<< distfile >>> -- the unique, short CPAN distfile name,
472             e.g. DAGOLDENE<sol>CPAN-Visitor-0.001.tar.gz
473              
474             =item *
475              
476             C<<< distpath >>> -- the absolute path the distribution archive,
477             e.g. E<sol>myE<sol>cpanE<sol>authorsE<sol>idE<sol>DE<sol>DAE<sol>DAGOLDENE<sol>CPAN-Visitor-0.001.tar.gz
478              
479             =item *
480              
481             C<<< tempdir >>> -- a File::Temp directory object for extraction or other things
482              
483             =item *
484              
485             C<<< stash >>> -- the 'stash' hashref from the Visitor object
486              
487             =item *
488              
489             C<<< quiet >>> -- the 'quiet' flag from the Visitor object
490              
491             =item *
492              
493             C<<< result >>> -- an empty hashref to start; the return values from each
494             action are added and may be referenced by subsequent actions
495              
496             =back
497              
498             The C<<< result >>> field is used to accumulate the return values from action
499             callbacks. For example, the return value from the default 'extract' action is
500             the unpacked distribution directory:
501              
502             $job->{result}{extract} # distribution directory path
503              
504             You do not need to store the results yourself -- the C<<< iterate >>> method
505             takes care of it for you.
506              
507             Callbacks occur in the following order. Some callbacks skip further
508             processing if the return value is false.
509              
510             =over
511              
512             =item *
513              
514             C<<< check >>> -- determines whether the distribution should be processed;
515             goes to next file if false; default is always true
516              
517             =item *
518              
519             C<<< start >>> -- used for any setup, logging, etc; default does nothing
520              
521             =item *
522              
523             C<<< extract >>> -- extracts a distribution into a temp directory or otherwise
524             prepares for visiting; skips to finish action if it returns
525             a false value; default returns the path to the extracted
526             directory
527              
528             =item *
529              
530             C<<< enter >>> -- skips to the finish action if it returns false; default takes
531             the result of extract, chdir's into it, and returns the
532             original directory
533              
534             =item *
535              
536             C<<< visit >>> -- examine the distribution or otherwise do stuff; the default
537             does nothing;
538              
539             =item *
540              
541             C<<< leave >>> -- default returns to the original directory (the result of enter)
542              
543             =item *
544              
545             C<<< finish >>> -- any teardown processing, logging, etc.
546              
547             =back
548              
549             These allow complete customization of the iteration process. For example,
550             one could do something like this:
551              
552             =over
553              
554             =item *
555              
556             replace the default C<<< extract >>> callback with one that returns
557             an arrayref of distribution files without actually unpacking it into
558             a physical directory
559              
560             =item *
561              
562             replace the default C<<< enter >>> callback with one that does nothing but
563             return a true value; replace the default C<<< leave >>> callback likewise
564              
565             =item *
566              
567             have the C<<< visit >>> callback get the C<<< $job->{result}{extract} >>> listing
568             and examine it for the presence of certain files
569              
570             =back
571              
572             This could potentially speed up iteration if only the file names within
573             the distribution are of interest and not the contents of the actual files.
574              
575             =head1 BUGS
576              
577             Please report any bugs or feature requests using the CPAN Request Tracker
578             web interface at L<http://rt.cpan.org/Dist/Display.html?Queue=CPAN-Visitor>
579              
580             When submitting a bug or request, please include a test-file or a patch to an
581             existing test-file that illustrates the bug or desired feature.
582              
583             =head1 SEE ALSO
584              
585             =over
586              
587             =item *
588              
589             L<App::CPAN::Mini::Visit>
590              
591             =item *
592              
593             L<CPAN::Mini::Visit>
594              
595             =back
596              
597             =for :stopwords cpan testmatrix url annocpan anno bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan
598              
599             =head1 SUPPORT
600              
601             =head2 Bugs / Feature Requests
602              
603             Please report any bugs or feature requests through the issue tracker
604             at L<http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Visitor>.
605             You will be notified automatically of any progress on your issue.
606              
607             =head2 Source Code
608              
609             This is open source software. The code repository is available for
610             public review and contribution under the terms of the license.
611              
612             L<https://github.com/dagolden/cpan-visitor>
613              
614             git clone https://github.com/dagolden/cpan-visitor.git
615              
616             =head1 AUTHOR
617              
618             David Golden <dagolden@cpan.org>
619              
620             =head1 COPYRIGHT AND LICENSE
621              
622             This software is Copyright (c) 2010 by David Golden.
623              
624             This is free software, licensed under:
625              
626             The Apache License, Version 2.0, January 2004
627              
628             =cut
629              
630              
631             __END__
632              
633