File Coverage

blib/lib/WWW/Spider.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				package WWW::Spider;
2
3				=head1 NAME
4
5				WWW::Spider - flexible Internet spider for fetching and analyzing websites
6
7				=head1 VERSION
8
9				This document describes C version 0.01_10
10
11				=head1 SYNOPSIS
12
13				#configuration
14				my $spider=new WWW::Spider;
15				$spider=new WWW::Spider({UASTRING=>"mybot"});
16
17				print $spider->uastring;
18				$spider->uastring('New UserAgent String');
19				$spider->user_agent(new LWP::UserAgent);
20
21				#basic stuff
22				print $spider->get_page_response('http://search.cpan.org/') -> content;
23				print $spider->get_page_content('http://search.cpan.org/');
24				$spider->get_links_from('http://google.com/'); #get array of URLs
25
26				#registering hooks
27
28				#crawling
29
30				=head1 DESCRIPTION
31
32				WWW::Spider is a customizable Internet spider intended to be used for
33				fetching and analyzing websites. Features include:
34
35				=over
36
37				=item * basic methods for high-level html handling
38
39				=item * the manner in which pages are retrieved is customizable
40
41				=item * callbacks for when pages are fetched, errors caused, etc...
42
43				=item * caching
44
45				=item * thread-safe operation, and optional multithreading operation
46				(faster)
47
48				=item * a high-level implementation of a 'graph' of either pages or
49				sites (as defined by the callback) which can be analyzed
50
51				=back
52
53				=cut
54
55	1	1	28327	use strict;
	1		2
	1		57
56	1	1	6	use warnings;
	1		2
	1		31
57
58	1	1	2964	use threads;
	0
	0
59				use Carp;
60				use LWP::UserAgent;
61				use HTTP::Request;
62				use Thread::Queue;
63
64				use WWW::Spider::Graph;
65				use WWW::Spider::Hooklist;
66
67				use vars qw( $VERSION );
68				$VERSION = '0.01_10';
69
70				=pod
71
72				=head1 FUNCTIONS
73
74				=head2 PARAMETERS
75
76				Parameter getting and setting functions.
77
78				=over
79
80				=item new WWW::Spider([%params])
81
82				Constructor for C
83
84				=cut
85
86				sub new {
87				my $class=shift;
88				my $self={};
89				my $params=shift \|\| {};
90
91				=pod
92
93				Arguments include:
94
95				=over
96
97				=item * UASTRING
98
99				The useragent string to be used. The default is "WWW::Spider"
100
101				=cut
102
103				my $uastring=$params->{UASTRING} \|\| 'WWW::Spider';
104
105				=pod
106
107				=item * USER_AGENT
108
109				The LWP::UserAgent to use. If this is specified, the UASTRING
110				argument is ignored.
111
112				=cut
113
114				my $ua=new LWP::UserAgent;
115				$ua->agent($uastring);
116				$ua=$params->{USER_AGENT} \|\| $ua;
117				$self->{USER_AGENT}=$ua;
118
119				$self->{HOOKS}=new WWW::Spider::Hooklist(['']);
120
121				bless $self,$class;
122				return $self;
123				}
124
125				=pod
126
127				=back
128
129				=item ->user_agent [LWP::UserAgent]
130
131				Returns/sets the user agent being used by this object.
132
133				=cut
134
135				sub user_agent {
136				my $self=shift;
137				my $original=$self->{USER_AGENT};
138				$self->{USER_AGENT}=$_[0] if exists $_[0];
139				return $original
140				}
141
142				=pod
143
144				=item ->uastring [STRING]
145
146				Returns/sets the user agent string being used by this object.
147
148				=cut
149
150				sub uastring {
151				my $self=shift;
152				return $self->{USER_AGENT}->agent($_[0]);
153				}
154
155				=pod
156
157				=back
158
159				=head2 GENERAL
160
161				These functions could be implemented anywhere - nothing about what
162				they do is special do WWW::Spider. Mainly, they are just conveiniance
163				functions for the rest of the code.
164
165				=over
166
167				=item ->get_page_content URL
168
169				Returns the contents of the page at URL.
170
171				=cut
172
173				sub get_page_content {
174				my ($self,$url)=@_;
175				return $self->get_page_response($url)->content;
176				}
177
178				=pod
179
180				=item ->get_page_response URL
181
182				Returns the HTTP::Response object corresponding to URL
183
184				=cut
185
186				sub get_page_response {
187				my ($self,$url)=@_;
188				return $self->{USER_AGENT}->get($url);
189				}
190
191				=pod
192
193				=back
194
195				=head2 SPIDER
196
197				These functions implement the spider functionality.
198
199				=over
200
201				=item ->crawl URL MAX_DEPTH
202
203				Crawls URL to the specified maxiumum depth. This is implemented as a
204				breadth-first search.
205
206				The default value for MAX_DEPTH is 0.
207
208				=cut
209
210				sub crawl {
211				(my $self,my $url,my $max_depth)=@_;
212				$max_depth=$max_depth \|\| 0;
213				croak "fatal: crawl called with empty url string" unless $url;
214				my $response=$self->get_page_response($url);
215				$self->handle_response($response);
216				$self->crawl_content($response->content,$max_depth,$url);
217				}
218
219				=pod
220
221				=item ->handle_url URL
222
223				The same as C.
224
225				=cut
226
227				sub handle_url {
228				my ($self,$url)=@_;
229				croak "fatal: handle_url called with empty url string" unless $url;
230				$self->handle_response($self->get_page_response($url));
231				}
232
233				=pod
234
235				=item ->crawl_content STRING [$MAX_DEPTH] [$SOURCE]
236
237				Treats STRING as if it was encountered during a crawl, with a
238				remaining maximum depth of MAX_DEPTH. The crawl is implemented as a
239				breadth-first search using C.
240
241				The default value for MAX_DEPTH is 0.
242
243				The assumption is made that handlers have already been called on this
244				page (otherwise, implementation would be impossible).
245
246				=cut
247
248				sub crawl_content {
249				(my $self,my $content,my $max_depth,my $source)=@_;
250				croak "fatal: crawl_content called with empty content string" unless $content;
251				$max_depth=$max_depth \|\| 0;
252				my %urls_done;
253				$urls_done{$source}=1;
254				my @links=$self->get_links_from_content($content,$source);
255				my $q=new Thread::Queue(@links);
256				my $depth=0;
257				$q->enqueue('--');
258				while($q->pending()>0 and $max_depth>$depth) {
259				my $link=$q->dequeue;
260				if($link eq '--') {
261				$depth++;
262				$q->enqueue('--');
263				next;
264				}
265				next if $urls_done{$link};
266				my $response=$self->get_page_response($link);
267				next unless $response->header('Content-type')=~/^text/;
268				my $tmp_content=$response->content;
269				$self->handle_response($response);
270				$urls_done{$link}=1;
271				@links=$self->get_links_from_content($tmp_content,$link);
272				for my $a (@links) {
273				next if $urls_done{$a};
274				$q->enqueue($a);
275				}
276				}
277				}
278
279				=pod
280
281				=item ->handle_response HTTP::RESPONSE
282
283				Handles the HTTP reponse, calling the appropriate hooks, without
284				crawling any other pages.
285
286				=cut
287
288				sub handle_response {
289				my ($self, $content)=@_;
290				carp "warning: handle_response called with empty content string" unless $content;
291				}
292
293				=pod
294
295				=item ->get_links_from URL
296
297				Returns a list of URLs linked to from URL.
298
299				=cut
300
301				sub get_links_from {
302				my ($self,$url)=@_;
303				croak "fatal: get_links_from called with empty url string" unless $url;
304				return $self->get_links_from_content($self->get_page_content($url),$url);
305				}
306
307				=pod
308
309				=item ->get_links_from_content $CONTENT [$SOURCE]
310
311				Returns a list of URLs linked to in STRING. When a URL is discovered
312				that is not complete, it is fixed by assuming that is was found on
313				SOURCE. If there is no source page specified, bad URLs are treated as
314				if they were linked to from http://localhost/.
315
316				SOURCE must be a valid and complete url.
317
318				=cut
319
320				sub get_links_from_content {
321				(my $self,my $content,my $source)=@_;
322				croak "fatal: get_links_from_content called with empty content string" unless $content;
323				my @list;
324				my $domain="http://localhost/";
325				my $root="http://localhost/";
326				if($source) {
327				$source=~/^(https?:\/\/[^\/]+\/)(.*)$/g;
328				$domain=$1;
329				$root=$1.$2;
330				if($root=~/^(.+\/)[^\/]+$/g) {
331				$root=$1;
332				}
333				}
334				while($content=~/]* )?href = \"([^\"]*)\"/msg) {
335				my $partial=$2;
336				my $url;
337				if($partial=~/^http:\/\/.*\//) {
338				$url=$partial;
339				} elsif($partial=~/^http:\/\//) {
340				$url=$partial."/";
341				} elsif($partial=~/^\/(.*)$/g) {
342				$url=$domain.$1;
343				} else {
344				$url=$root.$partial;
345				}
346				push @list,$url;
347				}
348				return @list;
349				}
350
351				=pod
352
353				=back
354
355				=head2 CALLBACKS AND HOOKS
356
357				All hook registration and deletion functions are considered atomic.
358				If five hooks have been registered, and then all of them are deleted
359				in one operation, there will be no page for which fewer than five but
360				more than zero of those hooks are called (unless some hooks are added
361				afterwords).
362
363				The legal hook strings are:
364
365				=over
366
367				=item * handle-page
368
369				Called whenever a crawlable page is reached.
370
371				Arguments: CONTENT, URL
372
373				Return:
374
375				=item * handle-response
376
377				Called on an HTTP response, successfull, crawlable, or otherwise.
378
379				Arguments:
380
381				Return:
382
383				=item * handle-failure
384
385				Called on any failed HTTP response.
386
387				Arguments:
388
389				Return:
390
391				=back
392
393				Functions for handling callbacks are:
394
395				=over
396
397				=item ->call_hooks HOOK-STRING, @ARGS
398
399				Calls all of the registered HOOK-STRING callbacks with @ARGS. This
400				function returns a list of all of the return values (in some
401				unspecified order) which are to be handled appropriately by the
402				caller.
403
404				=cut
405
406				sub call_hooks {
407				my ($self,$name,@args)=@_;
408				my @list=$self->get_hooks($name);
409				my @ret;
410				for my $hook (@list) {
411				push @ret,&$hook(@args);
412				}
413				return @ret;
414				}
415
416				=pod
417
418				=item ->register_hook HOOK-STRING, SUB, [{OPTIONS}]
419
420				Registers a subroutine to be run on HOOK-STRING. Has no return value.
421				Valid options are:
422
423				=over
424
425				=item * FORK
426
427				Set to a non-zero value if you want this hook to be run in a separate
428				thread. This means that, among other things, the return value will
429				not have the same affect (or even a well defined affect).
430
431				=back
432
433				=cut
434
435				sub register_hook {
436				my ($self,$name,$hook,$options)=@_;
437				}
438
439				=pod
440
441				=item ->get_hooks [HOOK-STRING]
442
443				Returns all hooks corresponding to HOOK-STRING. If HOOK-STRING is not
444				given, returns all hooks.
445
446				=cut
447
448				sub get_hooks {
449				my ($self,$name)=@_;
450				}
451
452				=pod
453
454				=item ->clear_hooks [HOOK-STRING]
455
456				Removes all hooks corresponding to HOOK-STRING. If HOOK-STRING is not
457				given, it deletes all hooks.
458
459				=cut
460
461				sub clear_hooks {
462				my ($self,$name)=@_;
463				}
464
465				1;
466
467				__END__