File Coverage

blib/lib/WWW/PkgFind.pm

    Criterion    Covered    Total        %
    statement         25       27     92.5
    branch                              n/a
    condition                           n/a
    subroutine         9        9    100.0
    pod                                 n/a
    total             34       36     94.4


=head1 NAME

WWW::PkgFind - Spiders given URL(s), mirroring wanted files and
triggering post-processing (e.g. tests) against them.

=head1 SYNOPSIS

    my $Pkg = new WWW::PkgFind("my_package");

    $Pkg->depth(3);
    $Pkg->active_urls("ftp://ftp.somesite.com/pub/joe/foobar/");
    $Pkg->wanted_regex("patch-2\.6\..*gz", "linux-2\.6\.\d+\.tar\.bz2");
    $Pkg->set_create_queue("/testing/packages/QUEUE");
    $Pkg->retrieve("/testing/packages");

=head1 DESCRIPTION

This module provides a way to mirror new packages posted on the web and
to trigger post-processing operations against them. You point it at one
or more URLs; it scans them for links that match (or do not match)
given patterns and downloads the matching files to a given location.
Newly downloaded files are also recorded in a queue so that other
programs can perform post-processing operations on them, such as
queuing test runs.

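For example, the following sketch (the URL, patterns, and paths are
purely illustrative) watches a download area, skips release candidates,
queues new files, and then reports what matched:

    my $Pkg = new WWW::PkgFind("foobar");

    $Pkg->active_urls("http://downloads.example.com/foobar/");
    $Pkg->wanted_regex('foobar-.*\.tar\.gz');
    $Pkg->not_wanted_regex('-rc\d+');
    $Pkg->set_create_queue("/testing/packages/QUEUE");
    $Pkg->retrieve("/testing/packages");

    print "Matched: $_\n" foreach $Pkg->files();
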
=head1 FUNCTIONS

=cut

package WWW::PkgFind;

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use LWP::Simple;
use WWW::RobotRules;
use File::Spec::Functions;
use File::Path;
use URI;    # URI objects are constructed in get_file() and _process_line()
use Algorithm::Numerical::Shuffle qw /shuffle/;

42              
43             use fields qw(
44             _debug
45             package_name
46             depth
47             wanted_regex
48             not_wanted_regex
49             rename_regexp
50             mirrors
51             mirror_url
52             parent_url
53             active_urls
54             robot_urls
55             files
56             processed
57             create_queue
58             rules
59             user_agent
60             );
61              
62             use vars qw( %FIELDS $VERSION );
63             $VERSION = '1.00';
64              
=head2 new([$pkg_name], [$agent_desc])

Creates a new WWW::PkgFind object, initializing all data members.

pkg_name is an optional argument specifying the name of the package.
WWW::PkgFind will place files it downloads into a directory of this
name. If not defined, it defaults to "unnamed_package".

agent_desc is an optional parameter appended to the user agent
string that WWW::PkgFind uses when accessing remote websites.

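For example (the package name and the contact string appended to the
user agent below are placeholders):

    my $Pkg = WWW::PkgFind->new("linux-patches", 'admin@example.com');
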
=cut
sub new {
    my $this  = shift;
    my $class = ref($this) || $this;
    my $self  = bless [\%FIELDS], $class;

    my $host = `hostname` || "nameless"; chomp $host;

    $self->{package_name}     = shift || 'unnamed_package';
    $self->{depth}            = 5;
    $self->{wanted_regex}     = [ ];
    $self->{not_wanted_regex} = [ ];
    $self->{rename_regexp}    = '';
    $self->{mirrors}          = [ ];
    $self->{mirror_url}       = '';
    $self->{active_urls}      = [ ];
    $self->{robot_urls}       = { };
    $self->{files}            = [ ];
    $self->{processed}        = undef;
    $self->{create_queue}     = undef;
    $self->{rules}            = WWW::RobotRules->new(__PACKAGE__."/$VERSION");
    my $agent_desc = shift || '';
    $self->{user_agent}       = __PACKAGE__."/$VERSION $host spider $agent_desc";

    $self->{_debug}           = 0;

    return $self;
}

########################################################################
# Accessors                                                            #
########################################################################

=head2 package_name()

Gets or sets the package name. When a file is downloaded, it will be
placed into a sub-directory by this name.

=cut
sub package_name {
    my $self = shift;
    if (@_) {
        $self->{package_name} = shift;
    }
    return $self->{package_name};
}

# Undocumented function. I don't think this is actually needed, but the
# pkgfind script requires it.
sub parent_url {
    my $self = shift;
    if (@_) {
        $self->{parent_url} = shift;
    }
    return $self->{parent_url};
}

=head2 depth()

Gets or sets the depth to spider below URLs. Set to 0 if only the
specified URL should be scanned for new packages. Defaults to 5.

A typical use for this would be if you are watching a site where new
patches are posted, and the patches are organized by the version of
software they apply to, such as ".../linux/linux-2.6.17/*.dif".

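As a sketch, a configuration like the following (the URL and pattern
are only examples) would scan the given directory plus one level of
per-version subdirectories for patch files:

    $Pkg->depth(1);
    $Pkg->active_urls("http://downloads.example.com/linux/");
    $Pkg->wanted_regex('\.dif$');
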
=cut
sub depth {
    my $self = shift;
    if (@_) {
        $self->{depth} = shift;
    }
    return $self->{depth};
}

=head2 wanted_regex($regex1, [$regex2, ...])

Gets or adds regular expressions that control what is downloaded from a
page. For instance, a project might post source tarballs, binary
tarballs, zip files, rpms, etc., but you may only be interested in the
source tarballs. You might specify this by calling

    $self->wanted_regex("^.*\.tar\.gz$", "^.*\.tgz$");

By default, all files linked on the active URLs will be retrieved
(including html and txt files).

You can call this function multiple times to add additional regexes.

The return value is the current array of regexes.

=cut
sub wanted_regex {
    my $self = shift;

    foreach my $regex (@_) {
        next unless $regex;
        push @{$self->{wanted_regex}}, $regex;
    }
    return @{$self->{wanted_regex}};
}

=head2 not_wanted_regex($regex1, [$regex2, ...])

Gets or adds regular expressions that control what is downloaded from a
page. Unlike wanted_regex, these specify what you do *not* want. A file
matching any not-wanted pattern is rejected even if it also matches a
wanted pattern, allowing you to fine-tune the selection.

A typical use of this might be to limit the range of release versions
you're interested in, or to exclude certain packages (such as
pre-release versions).

You can call this function multiple times to add additional regexes.

The return value is the current array of regexes.

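For example, to accept ordinary tarballs but skip release candidates
and pre-releases (these particular patterns are only illustrative):

    $Pkg->wanted_regex('\.tar\.(gz|bz2)$');
    $Pkg->not_wanted_regex('-rc\d+', '-pre\d+');
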
=cut
sub not_wanted_regex {
    my $self = shift;

    foreach my $regex (@_) {
        next unless $regex;
        push @{$self->{not_wanted_regex}}, $regex;
    }
    return @{$self->{not_wanted_regex}};
}

=head2 mirrors($mirror1, [$mirror2, ...])

Gets or adds to the list of mirrors to use for the package. This causes
the URL to be modified to include the mirror name prior to retrieval.
The mirror used will be selected randomly from the list of mirrors
provided.

This is designed for use with SourceForge's file mirror system, allowing
WWW::PkgFind to watch a project's file download area on
prdownloads.sourceforge.net and retrieve files through the mirrors.

You can call this function multiple times to add additional mirrors.

=cut
sub mirrors {
    my $self = shift;

    foreach my $mirror (@_) {
        next unless $mirror;
        push @{$self->{mirrors}}, $mirror;
    }
    return @{$self->{mirrors}};
}

=head2 mirror_url()

Gets or sets the URL template to use when fetching from a mirror system
like SourceForge's. The strings "MIRROR" and "FILENAME" in the URL will
be substituted appropriately when retrieve() is called.

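A sketch of the SourceForge-style setup described above (the mirror
names and the template URL are illustrative placeholders):

    $Pkg->mirrors("osdn", "superb-east", "superb-west");
    $Pkg->mirror_url("http://MIRROR.dl.sourceforge.net/sourceforge/myproject/FILENAME");
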
=cut
sub mirror_url {
    my $self = shift;

    if (@_) {
        $self->{mirror_url} = shift;
    }
    return $self->{mirror_url};
}

# rename_regex()
#
# Gets or sets a regular expression to be applied to the filename after it
# is downloaded. This allows you to fix up the filenames of packages, such
# as to reformat the version info and so forth.
sub rename_regex {
    my $self = shift;

    if (@_) {
        # The underlying field declared via 'use fields' is rename_regexp
        $self->{rename_regexp} = shift;
    }
    return $self->{rename_regexp};
}

=head2 active_urls([$url1], [$url2], ...)

Gets or adds URLs to be scanned for new file releases.

You can call this function multiple times to add additional URLs.

=cut
sub active_urls {
    my $self = shift;

    foreach my $url (@_) {
        next unless $url;
        push @{$self->{active_urls}}, [$url, 0];
    }
    return @{$self->{active_urls}};
}

# Undocumented function
sub robot_urls {
    my $self = shift;

    foreach my $url (@_) {
        next unless $url;
        $self->{robot_urls}->{$url} = 1;
    }
    return keys %{$self->{robot_urls}};
}

=head2 files()

Returns the list of files found at the active URLs that survived the
wanted_regex and not_wanted_regex patterns. This is for informational
purposes only.

=cut
sub files {
    my $self = shift;

    return @{$self->{files}};
}

=head2 processed()

Returns true if retrieve() has been called.

=cut
sub processed {
    my $self = shift;
    return $self->{processed};
}

=head2 set_create_queue($dir)

Specifies that the retrieve() routine should also create a symlink queue
in the specified directory.

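The queue is a directory of symbolic links pointing at the newly
downloaded files, which a separate program can scan to find work. A
minimal consumer sketch (the paths are illustrative):

    $Pkg->set_create_queue("/testing/packages/QUEUE");
    $Pkg->retrieve("/testing/packages");

    foreach my $link (glob "/testing/packages/QUEUE/*") {
        print "New file queued: $link\n";
        # process the file, then remove the symlink: unlink $link;
    }
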
=cut
sub set_create_queue {
    my $self = shift;

    if (@_) {
        $self->{create_queue} = shift;
    }

    return $self->{create_queue};
}

=head2 set_debug($debug)

Sets the debug level. Set to 0 or undef to turn off debugging output.

=cut
sub set_debug {
    my $self = shift;

    if (@_) {
        $self->{_debug} = shift;
    }

    return $self->{_debug};
}

########################################################################
# Helper functions                                                     #
########################################################################

=head3 want_file($file)

Checks the given file name against the regular expressions stored in
the object. Returns 1 (true) if the file matches at least one wanted
regex and none of the not-wanted regexes. If the file matches a
not-wanted regex, it returns 0 (false). If it matches neither set of
patterns, it returns undef (also false).

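For instance, with the patterns below (chosen only for illustration),
the three possible return values would be:

    $Pkg->wanted_regex('\.tar\.gz$');
    $Pkg->not_wanted_regex('-rc\d+');

    $Pkg->want_file("foo-1.0.tar.gz");      # 1     (wanted)
    $Pkg->want_file("foo-1.0-rc1.tar.gz");  # 0     (explicitly not wanted)
    $Pkg->want_file("foo-1.0.zip");         # undef (matches neither set)
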
=cut
sub want_file {
    my $self = shift;
    my $file = shift;

    warn "Considering '$file'...\n" if $self->{_debug}>3;
    foreach my $pattern ( @{$self->{'not_wanted_regex'}} ) {
        warn "Checking against not wanted pattern '$pattern'\n" if $self->{_debug}>3;
        if ($file =~ m/$pattern/) {
            warn "no\n" if $self->{_debug}>3;
            return 0;
        }
    }
    foreach my $pattern ( @{$self->{'wanted_regex'}} ) {
        warn "Checking against wanted pattern '$pattern'\n" if $self->{_debug}>3;
        if ($file =~ m/$pattern/) {
            warn "yes\n" if $self->{_debug}>3;
            return 1;
        }
    }
    warn "maybe\n" if $self->{_debug}>3;
    return undef;
}

=head2 get_file($url, $dest)

Retrieves the given URL, returning true if the file was
successfully obtained and placed at $dest, false if something
prevented this from happening.

get_file also checks for and respects robot rules, updating the
rules object as needed and caching the hosts it has checked in the
robot_urls hash. The entry for a host will be >0 if a robots.txt was
found and parsed, <0 if no robots.txt was found, and undef if the
host has not yet been checked.

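get_file() is used internally by retrieve(), but it can also be called
directly; a minimal sketch (the URL and destination path are
placeholders):

    if ($Pkg->get_file("http://downloads.example.com/foo-1.0.tar.gz",
                       "/tmp/foo-1.0.tar.gz")) {
        print "Downloaded OK\n";
    }
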
=cut
sub get_file {
    my $self = shift;
    my $url  = shift || return undef;
    my $dest = shift || return undef;

    warn "Creating URI object using '$url'\n" if $self->{_debug}>2;
    my $uri = URI->new($url);
    if (! $uri->can("host") ) {
        warn "ERROR: URI object lacks host() object method\n";
        return undef;
    } elsif (! defined $self->{robot_urls}->{$uri->host()}) {
        # Include the scheme so LWP::Simple's get() receives a full URL
        my $robot_url = $uri->scheme() . "://" . $uri->host() . "/robots.txt";
        my $robot_txt = get $robot_url;
        if (defined $robot_txt) {
            $self->{rules}->parse($url, $robot_txt);
            $self->{robot_urls}->{$uri->host()} = 1;
        } else {
            warn "ROBOTS: Could not find '$robot_url'\n";
            $self->{robot_urls}->{$uri->host()} = -1;
        }
    }

    if (! $self->{rules}->allowed($url) ) {
        warn "ROBOTS: robots.txt denies access to '$url'\n";
        return 0;
    }

    if (! -e "/usr/bin/curl") {
        die "ERROR: Could not locate curl executable at /usr/bin/curl!";
    }

    my $incoming = "${dest}.incoming";
    # The list form of system() bypasses the shell, so the user agent
    # string must not carry extra quoting of its own
    system("/usr/bin/curl",
           "--user-agent", $self->{user_agent},
           "-Lo", $incoming, $url);
    my $retval = $?;
    if ($retval != 0) {
        warn "CURL ERROR($retval)\n";
        unlink($incoming);
        return 0;
    }

    if (! rename($incoming, $dest)) {
        warn "RENAME FAILED: '$incoming' -> '$dest'\n";
        return 0;
    }

    return 1;
}

# Internal routine
sub _process_active_urls {
    my $self = shift;

    warn "In WWW::PkgFind::_process_active_urls()\n" if $self->{_debug}>4;

    while ($self->{'active_urls'} && @{$self->{'active_urls'}}) {
        warn "Processing active_url\n" if $self->{_debug}>3;
        my $u_d = pop @{$self->{'active_urls'}};

        if (! $u_d) {
            warn "Undefined url/depth. Skipping\n" if $self->{_debug}>0;
            next;
        }
        my ($url, $depth) = @{$u_d};
        if (! defined $depth) {
            $depth = 1;
            warn "Current depth undefined... assuming $depth\n" if $self->{_debug}>0;
        }

        warn "depth=$depth; self->depth=$self->{'depth'}\n" if $self->{_debug}>4;
        next if ( $depth > $self->{'depth'});

        # Get content of this page
        warn "# Getting webpage $url\n" if $self->{_debug}>0;
        my $content = get($url);
        if (! $content) {
            warn "No content retrieved for '$url'\n" if $self->{_debug}>0;
            next;
        }

        # Grep for files
        my @lines = split /\<\s*A\s/si, $content;
        foreach my $line (@lines) {
            next unless ($line && $line =~ /HREF\s*\=\s*(\'|\")/si);
            my ($quote, $match) = $line =~ m/HREF\s*\=\s*(\'|\")(.*?)(\'|\")/si;
            my $new_url = $url;
            $new_url =~ s|/$||;

            $self->_process_line($match, $new_url, $depth);
        }
    }
}

# _process_line($match, $new_url, $depth)
# Processes one line, extracting files to be retrieved
sub _process_line {
    my $self    = shift;
    my $match   = shift or return undef;
    my $new_url = shift;
    my $depth   = shift || 1;

    warn "In WWW::PkgFind::_process_line()\n" if $self->{_debug}>4;

    my $is_wanted = $self->want_file($match);
    if ( $is_wanted ) {
        warn "FOUND FILE '$match'\n" if $self->{_debug}>1;
        push @{$self->{'files'}}, "$new_url/$match";
#        push @{$self->{'files'}}, "$match";

    } elsif (! defined $is_wanted) {
        return if ($depth == $self->{'depth'});
        if ( $match && $match ne '/' && $match !~ /^\?/) {
            # Is this a directory?
            return if ( $match =~ /\.\./);
            return if ( $match =~ /sign$/ );
            return if ( $match =~ /gz$/ );
            return if ( $match =~ /bz2$/ );
            return if ( $match =~ /dif$/ );
            return if ( $match =~ /patch$/ );

            if ($new_url =~ m/htm$|html$/) {
                # Back out of index.htm[l] type files
                $new_url .= '/..';
            }

            my $new_depth = $depth + 1;
            if ($match =~ m|^/|) {
                # Handle absolute links
                my $uri = URI->new($new_url);
                my $path = $uri->path();
                # Split the path into components so we can measure how
                # much deeper the link goes
                my @orig_path = split m|/|, $uri->path();

                # Link points somewhere outside our tree... skip it
                return if ($match !~ m|^$path|);

                # Construct new url for $match
                $new_url = $uri->scheme() . '://'
                    . $uri->authority()
                    . $match;
                $uri = URI->new($new_url);

                # Account for a link that goes deeper than 1 level
                # into the file tree, e.g. '$url/x/y/z/foo.txt'
                my @new_path = split m|/|, $uri->path();
                my $path_size = @new_path - @orig_path;
                if ($path_size < 1) {
                    $path_size = 1;
                }
                $new_depth = $depth + $path_size;

            } else {
                # For relative links, simply append to current
                $new_url .= "/$match";
            }

            warn "FOUND SUBDIR(?) '$new_url'\n" if $self->{_debug}>1;
            push @{$self->{'active_urls'}}, [ $new_url, $new_depth ];
        }

    } elsif ($is_wanted == 0) {
        warn "NOT WANTED: '$match'\n" if $self->{_debug}>1;
    }
}

=head2 retrieve($destination)

This function performs the actual scanning and retrieval of packages.
Call this once you've configured everything. The required parameter
$destination specifies where on the local filesystem files should be
stored. retrieve() will create a subdirectory for the package name
under this location, if it doesn't already exist.

The function will obey robot rules by checking for a robots.txt file,
and can be made to navigate a mirror system like SourceForge's (see
mirrors() above).

If configured, it will also create a symbolic link to each newly
downloaded file in the directory specified by the set_create_queue()
function.

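A typical call, continuing the configuration from the SYNOPSIS (the
destination directory is only an example):

    $Pkg->retrieve("/testing/packages")
        or die "retrieve() failed for " . $Pkg->package_name() . "\n";

    # Files land under /testing/packages/<package_name>/ and, if a
    # queue directory was configured, a symlink per new file appears there.
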
=cut
sub retrieve {
    my $self = shift;
    my $destination = shift;

    warn "In WWW::PkgFind::retrieve()\n" if $self->{_debug}>4;

    if (! $destination ) {
        warn "No destination specified to WWW::PkgFind::retrieve()\n";
        return undef;
    }

    # If no wanted regexp's have been specified, we want everything
    if (! defined $self->{'wanted_regex'}->[0] ) {
        warn "No regexp's specified; retrieving everything.\n" if $self->{_debug}>2;
        push @{$self->{'wanted_regex'}}, '.*';
    }

    # Retrieve the listing of available files
    warn "Processing active urls\n" if $self->{_debug}>2;
    $self->_process_active_urls();

    if (! $self->{'package_name'}) {
        warn "Error: No package name defined\n";
        return undef;
    }

    my $dest_dir = catdir($destination, $self->{'package_name'});
    if (! -d $dest_dir) {
        eval { mkpath([$dest_dir], 0, 0777); };
        if ($@) {
            warn "Error: Couldn't create '$dest_dir': $@\n";
            return undef;
        }
    }

    # Download wanted files
    foreach my $wanted_url (@{$self->{'files'}}) {
        my @parts = split(/\//, $wanted_url);
        my $filename = pop @parts;
        my $dest = "$dest_dir/$filename";

        warn "Considering file '$filename'\n" if $self->{_debug}>2;

        if (! $filename) {
            warn "NOT FILENAME: '$wanted_url'\n";
        } elsif (-f $dest) {
            warn "EXISTS: '$dest'\n" if $self->{_debug}>0;
        } else {
            warn "NEW '$wanted_url'\n" if $self->{_debug}>0;
            my $found = undef;

            if ($self->mirrors() > 0) {
                foreach my $mirror (shuffle $self->mirrors()) {
                    my $mirror_url = $self->mirror_url() || $wanted_url;
                    $mirror_url =~ s/MIRROR/$mirror/g;
                    $mirror_url =~ s/FILENAME/$filename/g;
                    warn "MIRROR: Trying '$mirror_url'\n" if $self->{_debug}>0;
                    if ($self->get_file($mirror_url, $dest)) {
                        $found = 1;
                        last;
                    }
                }
            } elsif (! $self->get_file($wanted_url, $dest)) {
                warn "FAILED RETRIEVING $wanted_url. Skipping.\n";
            } else {
                $found = 1;
            }

            if ($found) {
                warn "RETRIEVED $dest\n";

                if (defined $self->{create_queue}) {
                    # Create a symlink queue
                    symlink("$dest", "$self->{create_queue}/$filename")
                        or warn("Could not create symbolic link $self->{create_queue}/$filename: $!\n");
                }
            }
        }
    }

    return $self->{processed} = 1;
}

=head1 AUTHOR

Bryce Harrington

=head1 COPYRIGHT

Copyright (C) 2006 Bryce Harrington.
All Rights Reserved.

This script is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L

=cut

1;