File Coverage

blib/lib/WWW/PkgFind.pm

    Criterion    Covered    Total        %
    statement         25       27     92.5
    branch                              n/a
    condition                           n/a
    subroutine         9        9    100.0
    pod                                 n/a
    total             34       36     94.4


=head1 NAME

WWW::PkgFind - Spiders given URL(s), mirroring wanted files and
triggering post-processing (e.g. tests) against them.

=head1 SYNOPSIS

    my $Pkg = new WWW::PkgFind("my_package");

    $Pkg->depth(3);
    $Pkg->active_urls("ftp://ftp.somesite.com/pub/joe/foobar/");
    $Pkg->wanted_regex("patch-2\.6\..*gz", "linux-2\.6\.\d+\.tar\.bz2");
    $Pkg->set_create_queue("/testing/packages/QUEUE");
    $Pkg->retrieve("/testing/packages");

=head1 DESCRIPTION

This module provides a way to mirror new packages posted on the web and
to trigger post-processing operations against them. You point it at one
or more URLs; it scans them for links that match (or do not match)
given patterns and downloads the matching files to a given location.
Newly downloaded files are also recorded in a queue so that other
programs can perform post-processing operations on them, such as
queuing test runs.

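For example, the following sketch (the URL, patterns, and paths are
purely illustrative) watches a download area, skips release candidates,
queues new files, and then reports what matched:

    my $Pkg = new WWW::PkgFind("foobar");

    $Pkg->active_urls("http://downloads.example.com/foobar/");
    $Pkg->wanted_regex('foobar-.*\.tar\.gz');
    $Pkg->not_wanted_regex('-rc\d+');
    $Pkg->set_create_queue("/testing/packages/QUEUE");
    $Pkg->retrieve("/testing/packages");

    print "Matched: $_\n" foreach $Pkg->files();
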
=head1 FUNCTIONS

=cut

package WWW::PkgFind;

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use LWP::Simple;
use WWW::RobotRules;
use File::Spec::Functions;
use File::Path;
use URI;    # URI objects are constructed in get_file() and _process_line()
use Algorithm::Numerical::Shuffle qw /shuffle/;

42              
43             use fields qw(
44             _debug
45             package_name
46             depth
47             wanted_regex
48             not_wanted_regex
49             rename_regexp
50             mirrors
51             mirror_url
52             parent_url
53             active_urls
54             robot_urls
55             files
56             processed
57             create_queue
58             rules
59             user_agent
60             );
61              
62             use vars qw( %FIELDS $VERSION );
63             $VERSION = '1.00';
64              
=head2 new([$pkg_name], [$agent_desc])

Creates a new WWW::PkgFind object, initializing all data members.

pkg_name is an optional argument specifying the name of the package.
WWW::PkgFind will place files it downloads into a directory of this
name. If not defined, it defaults to "unnamed_package".

agent_desc is an optional parameter appended to the user agent
string that WWW::PkgFind uses when accessing remote websites.

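For example (the package name and the contact string appended to the
user agent below are placeholders):

    my $Pkg = WWW::PkgFind->new("linux-patches", 'admin@example.com');
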
=cut
sub new {
    my $this  = shift;
    my $class = ref($this) || $this;
    my $self  = bless [\%FIELDS], $class;

    my $host = `hostname` || "nameless"; chomp $host;

    $self->{package_name}     = shift || 'unnamed_package';
    $self->{depth}            = 5;
    $self->{wanted_regex}     = [ ];
    $self->{not_wanted_regex} = [ ];
    $self->{rename_regexp}    = '';
    $self->{mirrors}          = [ ];
    $self->{mirror_url}       = '';
    $self->{active_urls}      = [ ];
    $self->{robot_urls}       = { };
    $self->{files}            = [ ];
    $self->{processed}        = undef;
    $self->{create_queue}     = undef;
    $self->{rules}            = WWW::RobotRules->new(__PACKAGE__."/$VERSION");
    my $agent_desc = shift || '';
    $self->{user_agent}       = __PACKAGE__."/$VERSION $host spider $agent_desc";

    $self->{_debug}           = 0;

    return $self;
}

########################################################################
# Accessors                                                            #
########################################################################

=head2 package_name()

Gets or sets the package name. When a file is downloaded, it will be
placed into a sub-directory by this name.

=cut
sub package_name {
    my $self = shift;
    if (@_) {
        $self->{package_name} = shift;
    }
    return $self->{package_name};
}

# Undocumented function. I don't think this is actually needed, but the
# pkgfind script requires it.
sub parent_url {
    my $self = shift;
    if (@_) {
        $self->{parent_url} = shift;
    }
    return $self->{parent_url};
}

=head2 depth()

Gets or sets the depth to spider below URLs. Set to 0 if only the
specified URL should be scanned for new packages. Defaults to 5.

A typical use for this would be if you are watching a site where new
patches are posted, and the patches are organized by the version of
software they apply to, such as ".../linux/linux-2.6.17/*.dif".

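As a sketch, a configuration like the following (the URL and pattern
are only examples) would scan the given directory plus one level of
per-version subdirectories for patch files:

    $Pkg->depth(1);
    $Pkg->active_urls("http://downloads.example.com/linux/");
    $Pkg->wanted_regex('\.dif$');
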
=cut
sub depth {
    my $self = shift;
    if (@_) {
        $self->{depth} = shift;
    }
    return $self->{depth};
}

=head2 wanted_regex($regex1, [$regex2, ...])

Gets or adds regular expressions that control what is downloaded from a
page. For instance, a project might post source tarballs, binary
tarballs, zip files, rpms, etc., but you may only be interested in the
source tarballs. You might specify this by calling

    $self->wanted_regex("^.*\.tar\.gz$", "^.*\.tgz$");

By default, all files linked on the active URLs will be retrieved
(including html and txt files).

You can call this function multiple times to add additional regexes.

The return value is the current array of regexes.

=cut
sub wanted_regex {
    my $self = shift;

    foreach my $regex (@_) {
        next unless $regex;
        push @{$self->{wanted_regex}}, $regex;
    }
    return @{$self->{wanted_regex}};
}

=head2 not_wanted_regex($regex1, [$regex2, ...])

Gets or adds regular expressions that control what is downloaded from a
page. Unlike wanted_regex, these specify what you do *not* want. A file
matching any not-wanted pattern is rejected even if it also matches a
wanted pattern, allowing you to fine-tune the selection.

A typical use of this might be to limit the range of release versions
you're interested in, or to exclude certain packages (such as
pre-release versions).

You can call this function multiple times to add additional regexes.

The return value is the current array of regexes.

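For example, to accept ordinary tarballs but skip release candidates
and pre-releases (these particular patterns are only illustrative):

    $Pkg->wanted_regex('\.tar\.(gz|bz2)$');
    $Pkg->not_wanted_regex('-rc\d+', '-pre\d+');
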
=cut
sub not_wanted_regex {
    my $self = shift;

    foreach my $regex (@_) {
        next unless $regex;
        push @{$self->{not_wanted_regex}}, $regex;
    }
    return @{$self->{not_wanted_regex}};
}

=head2 mirrors($mirror1, [$mirror2, ...])

Gets or adds to the list of mirrors to use for the package. This causes
the URL to be modified to include the mirror name prior to retrieval.
The mirror used will be selected randomly from the list of mirrors
provided.

This is designed for use with SourceForge's file mirror system, allowing
WWW::PkgFind to watch a project's file download area on
prdownloads.sourceforge.net and retrieve files through the mirrors.

You can call this function multiple times to add additional mirrors.

=cut
sub mirrors {
    my $self = shift;

    foreach my $mirror (@_) {
        next unless $mirror;
        push @{$self->{mirrors}}, $mirror;
    }
    return @{$self->{mirrors}};
}

=head2 mirror_url()

Gets or sets the URL template to use when fetching from a mirror system
like SourceForge's. The strings "MIRROR" and "FILENAME" in the URL will
be substituted appropriately when retrieve() is called.

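A sketch of the SourceForge-style setup described above (the mirror
names and the template URL are illustrative placeholders):

    $Pkg->mirrors("osdn", "superb-east", "superb-west");
    $Pkg->mirror_url("http://MIRROR.dl.sourceforge.net/sourceforge/myproject/FILENAME");
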
=cut
sub mirror_url {
    my $self = shift;

    if (@_) {
        $self->{mirror_url} = shift;
    }
    return $self->{mirror_url};
}

# rename_regex()
#
# Gets or sets a regular expression to be applied to the filename after it
# is downloaded. This allows you to fix up the filenames of packages, such
# as to reformat the version info and so forth.
sub rename_regex {
    my $self = shift;

    if (@_) {
        # The underlying field declared via 'use fields' is rename_regexp
        $self->{rename_regexp} = shift;
    }
    return $self->{rename_regexp};
}

=head2 active_urls([$url1], [$url2], ...)

Gets or adds URLs to be scanned for new file releases.

You can call this function multiple times to add additional URLs.

=cut
sub active_urls {
    my $self = shift;

    foreach my $url (@_) {
        next unless $url;
        push @{$self->{active_urls}}, [$url, 0];
    }
    return @{$self->{active_urls}};
}

# Undocumented function
sub robot_urls {
    my $self = shift;

    foreach my $url (@_) {
        next unless $url;
        $self->{robot_urls}->{$url} = 1;
    }
    return keys %{$self->{robot_urls}};
}

=head2 files()

Returns the list of files found at the active URLs that survived the
wanted_regex and not_wanted_regex patterns. This is for informational
purposes only.

=cut
sub files {
    my $self = shift;

    return @{$self->{files}};
}

=head2 processed()

Returns true if retrieve() has been called.

=cut
sub processed {
    my $self = shift;
    return $self->{processed};
}

=head2 set_create_queue($dir)

Specifies that the retrieve() routine should also create a symlink queue
in the specified directory.

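The queue is a directory of symbolic links pointing at the newly
downloaded files, which a separate program can scan to find work. A
minimal consumer sketch (the paths are illustrative):

    $Pkg->set_create_queue("/testing/packages/QUEUE");
    $Pkg->retrieve("/testing/packages");

    foreach my $link (glob "/testing/packages/QUEUE/*") {
        print "New file queued: $link\n";
        # process the file, then remove the symlink: unlink $link;
    }
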
=cut
sub set_create_queue {
    my $self = shift;

    if (@_) {
        $self->{create_queue} = shift;
    }

    return $self->{create_queue};
}

=head2 set_debug($debug)

Sets the debug level. Set to 0 or undef to turn off debugging output.

=cut
sub set_debug {
    my $self = shift;

    if (@_) {
        $self->{_debug} = shift;
    }

    return $self->{_debug};
}

########################################################################
# Helper functions                                                     #
########################################################################

=head3 want_file($file)

Checks the given file name against the regular expressions stored in
the object. Returns 1 (true) if the file matches at least one wanted
regex and none of the not-wanted regexes. If the file matches a
not-wanted regex, it returns 0 (false). If it matches neither set of
patterns, it returns undef (also false).

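For instance, with the patterns below (chosen only for illustration),
the three possible return values would be:

    $Pkg->wanted_regex('\.tar\.gz$');
    $Pkg->not_wanted_regex('-rc\d+');

    $Pkg->want_file("foo-1.0.tar.gz");      # 1     (wanted)
    $Pkg->want_file("foo-1.0-rc1.tar.gz");  # 0     (explicitly not wanted)
    $Pkg->want_file("foo-1.0.zip");         # undef (matches neither set)
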
=cut
sub want_file {
    my $self = shift;
    my $file = shift;

    warn "Considering '$file'...\n" if $self->{_debug}>3;
    foreach my $pattern ( @{$self->{'not_wanted_regex'}} ) {
        warn "Checking against not wanted pattern '$pattern'\n" if $self->{_debug}>3;
        if ($file =~ m/$pattern/) {
            warn "no\n" if $self->{_debug}>3;
            return 0;
        }
    }
    foreach my $pattern ( @{$self->{'wanted_regex'}} ) {
        warn "Checking against wanted pattern '$pattern'\n" if $self->{_debug}>3;
        if ($file =~ m/$pattern/) {
            warn "yes\n" if $self->{_debug}>3;
            return 1;
        }
    }
    warn "maybe\n" if $self->{_debug}>3;
    return undef;
}

=head2 get_file($url, $dest)

Retrieves the given URL, returning true if the file was
successfully obtained and placed at $dest, false if something
prevented this from happening.

get_file also checks for and respects robot rules, updating the
rules object as needed and caching the hosts it has checked in the
robot_urls hash. The entry for a host will be >0 if a robots.txt was
found and parsed, <0 if no robots.txt was found, and undef if the
host has not yet been checked.

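get_file() is used internally by retrieve(), but it can also be called
directly; a minimal sketch (the URL and destination path are
placeholders):

    if ($Pkg->get_file("http://downloads.example.com/foo-1.0.tar.gz",
                       "/tmp/foo-1.0.tar.gz")) {
        print "Downloaded OK\n";
    }
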
=cut
sub get_file {
    my $self = shift;
    my $url  = shift || return undef;
    my $dest = shift || return undef;

    warn "Creating URI object using '$url'\n" if $self->{_debug}>2;
    my $uri = URI->new($url);
    if (! $uri->can("host") ) {
        warn "ERROR: URI object lacks host() object method\n";
        return undef;
    } elsif (! defined $self->{robot_urls}->{$uri->host()}) {
        # Include the scheme so LWP::Simple's get() receives a full URL
        my $robot_url = $uri->scheme() . "://" . $uri->host() . "/robots.txt";
        my $robot_txt = get $robot_url;
        if (defined $robot_txt) {
            $self->{rules}->parse($url, $robot_txt);
            $self->{robot_urls}->{$uri->host()} = 1;
        } else {
            warn "ROBOTS: Could not find '$robot_url'\n";
            $self->{robot_urls}->{$uri->host()} = -1;
        }
    }

    if (! $self->{rules}->allowed($url) ) {
        warn "ROBOTS: robots.txt denies access to '$url'\n";
        return 0;
    }

    if (! -e "/usr/bin/curl") {
        die "ERROR: Could not locate curl executable at /usr/bin/curl!";
    }

    my $incoming = "${dest}.incoming";
    # The list form of system() bypasses the shell, so the user agent
    # string must not carry extra quoting of its own
    system("/usr/bin/curl",
           "--user-agent", $self->{user_agent},
           "-Lo", $incoming, $url);
    my $retval = $?;
    if ($retval != 0) {
        warn "CURL ERROR($retval)\n";
        unlink($incoming);
        return 0;
    }

    if (! rename($incoming, $dest)) {
        warn "RENAME FAILED: '$incoming' -> '$dest'\n";
        return 0;
    }

    return 1;
}

# Internal routine
sub _process_active_urls {
    my $self = shift;

    warn "In WWW::PkgFind::_process_active_urls()\n" if $self->{_debug}>4;

    while ($self->{'active_urls'} && @{$self->{'active_urls'}}) {
        warn "Processing active_url\n" if $self->{_debug}>3;
        my $u_d = pop @{$self->{'active_urls'}};

        if (! $u_d) {
            warn "Undefined url/depth. Skipping\n" if $self->{_debug}>0;
            next;
        }
        my ($url, $depth) = @{$u_d};
        if (! defined $depth) {
            $depth = 1;
            warn "Current depth undefined... assuming $depth\n" if $self->{_debug}>0;
        }

        warn "depth=$depth; self->depth=$self->{'depth'}\n" if $self->{_debug}>4;
        next if ( $depth > $self->{'depth'});

        # Get content of this page
        warn "# Getting webpage $url\n" if $self->{_debug}>0;
        my $content = get($url);
        if (! $content) {
            warn "No content retrieved for '$url'\n" if $self->{_debug}>0;
            next;
        }

        # Grep for files
        my @lines = split /\<\s*A\s/si, $content;
        foreach my $line (@lines) {
            next unless ($line && $line =~ /HREF\s*\=\s*(\'|\")/si);
            my ($quote, $match) = $line =~ m/HREF\s*\=\s*(\'|\")(.*?)(\'|\")/si;
            my $new_url = $url;
            $new_url =~ s|/$||;

            $self->_process_line($match, $new_url, $depth);
        }
    }
}

# _process_line($match, $new_url, $depth)
# Processes one line, extracting files to be retrieved
sub _process_line {
    my $self    = shift;
    my $match   = shift or return undef;
    my $new_url = shift;
    my $depth   = shift || 1;

    warn "In WWW::PkgFind::_process_line()\n" if $self->{_debug}>4;

    my $is_wanted = $self->want_file($match);
    if ( $is_wanted ) {
        warn "FOUND FILE '$match'\n" if $self->{_debug}>1;
        push @{$self->{'files'}}, "$new_url/$match";
#        push @{$self->{'files'}}, "$match";

    } elsif (! defined $is_wanted) {
        return if ($depth == $self->{'depth'});
        if ( $match && $match ne '/' && $match !~ /^\?/) {
            # Is this a directory?
            return if ( $match =~ /\.\./);
            return if ( $match =~ /sign$/ );
            return if ( $match =~ /gz$/ );
            return if ( $match =~ /bz2$/ );
            return if ( $match =~ /dif$/ );
            return if ( $match =~ /patch$/ );

            if ($new_url =~ m/htm$|html$/) {
                # Back out of index.htm[l] type files
                $new_url .= '/..';
            }

            my $new_depth = $depth + 1;
            if ($match =~ m|^/|) {
                # Handle absolute links
                my $uri = URI->new($new_url);
                my $path = $uri->path();
                # Split the path into components so we can measure how
                # much deeper the link goes
                my @orig_path = split m|/|, $uri->path();

                # Link points somewhere outside our tree... skip it
                return if ($match !~ m|^$path|);

                # Construct new url for $match
                $new_url = $uri->scheme() . '://'
                    . $uri->authority()
                    . $match;
                $uri = URI->new($new_url);

                # Account for a link that goes deeper than 1 level
                # into the file tree, e.g. '$url/x/y/z/foo.txt'
                my @new_path = split m|/|, $uri->path();
                my $path_size = @new_path - @orig_path;
                if ($path_size < 1) {
                    $path_size = 1;
                }
                $new_depth = $depth + $path_size;

            } else {
                # For relative links, simply append to current
                $new_url .= "/$match";
            }

            warn "FOUND SUBDIR(?) '$new_url'\n" if $self->{_debug}>1;
            push @{$self->{'active_urls'}}, [ $new_url, $new_depth ];
        }

    } elsif ($is_wanted == 0) {
        warn "NOT WANTED: '$match'\n" if $self->{_debug}>1;
    }
}

=head2 retrieve($destination)

This function performs the actual scanning and retrieval of packages.
Call this once you've configured everything. The required parameter
$destination specifies where on the local filesystem files should be
stored. retrieve() will create a subdirectory for the package name
under this location, if it doesn't already exist.

The function will obey robot rules by checking for a robots.txt file,
and can be made to navigate a mirror system like SourceForge's (see
mirrors() above).

If configured, it will also create a symbolic link to each newly
downloaded file in the directory specified by the set_create_queue()
function.

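A typical call, continuing the configuration from the SYNOPSIS (the
destination directory is only an example):

    $Pkg->retrieve("/testing/packages")
        or die "retrieve() failed for " . $Pkg->package_name() . "\n";

    # Files land under /testing/packages/<package_name>/ and, if a
    # queue directory was configured, a symlink per new file appears there.
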
=cut
sub retrieve {
    my $self = shift;
    my $destination = shift;

    warn "In WWW::PkgFind::retrieve()\n" if $self->{_debug}>4;

    if (! $destination ) {
        warn "No destination specified to WWW::PkgFind::retrieve()\n";
        return undef;
    }

    # If no wanted regexp's have been specified, we want everything
    if (! defined $self->{'wanted_regex'}->[0] ) {
        warn "No regexp's specified; retrieving everything.\n" if $self->{_debug}>2;
        push @{$self->{'wanted_regex'}}, '.*';
    }

    # Retrieve the listing of available files
    warn "Processing active urls\n" if $self->{_debug}>2;
    $self->_process_active_urls();

    if (! $self->{'package_name'}) {
        warn "Error: No package name defined\n";
        return undef;
    }

    my $dest_dir = catdir($destination, $self->{'package_name'});
    if (! -d $dest_dir) {
        eval { mkpath([$dest_dir], 0, 0777); };
        if ($@) {
            warn "Error: Couldn't create '$dest_dir': $@\n";
            return undef;
        }
    }

    # Download wanted files
    foreach my $wanted_url (@{$self->{'files'}}) {
        my @parts = split(/\//, $wanted_url);
        my $filename = pop @parts;
        my $dest = "$dest_dir/$filename";

        warn "Considering file '$filename'\n" if $self->{_debug}>2;

        if (! $filename) {
            warn "NOT FILENAME: '$wanted_url'\n";
        } elsif (-f $dest) {
            warn "EXISTS: '$dest'\n" if $self->{_debug}>0;
        } else {
            warn "NEW '$wanted_url'\n" if $self->{_debug}>0;
            my $found = undef;

            if ($self->mirrors() > 0) {
                foreach my $mirror (shuffle $self->mirrors()) {
                    my $mirror_url = $self->mirror_url() || $wanted_url;
                    $mirror_url =~ s/MIRROR/$mirror/g;
                    $mirror_url =~ s/FILENAME/$filename/g;
                    warn "MIRROR: Trying '$mirror_url'\n" if $self->{_debug}>0;
                    if ($self->get_file($mirror_url, $dest)) {
                        $found = 1;
                        last;
                    }
                }
            } elsif (! $self->get_file($wanted_url, $dest)) {
                warn "FAILED RETRIEVING $wanted_url. Skipping.\n";
            } else {
                $found = 1;
            }

            if ($found) {
                warn "RETRIEVED $dest\n";

                if (defined $self->{create_queue}) {
                    # Create a symlink queue
                    symlink("$dest", "$self->{create_queue}/$filename")
                        or warn("Could not create symbolic link $self->{create_queue}/$filename: $!\n");
                }
            }
        }
    }

    return $self->{processed} = 1;
}

=head1 AUTHOR

Bryce Harrington

=head1 COPYRIGHT

Copyright (C) 2006 Bryce Harrington.
All Rights Reserved.

This script is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L

=cut

1;