File Coverage

blib/lib/WWW/Crawler/Lite.pm

Criterion	Covered	Total	%
statement	127	144	88.1
branch	28	52	53.8
condition	10	22	45.4
subroutine	28	35	80.0
pod	10	17	58.8
total	203	270	75.1

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::Crawler::Lite;
2
3	2			2		1781	use strict;
	2					4
	2					89
4	2			2		11	use warnings 'all';
	2					4
	2					92
5	2			2		6251	use LWP::UserAgent;
	2					181059
	2					77
6	2			2		2382	use HTTP::Request::Common;
	2					5501
	2					313
7	2			2		2820	use WWW::RobotRules;
	2					6705
	2					70
8	2			2		1869	use URI::URL;
	2					22988
	2					5228
9	2			2		3164	use HTML::LinkExtor;
	2					59576
	2					98
10	2			2		6211	use Time::HiRes 'usleep';
	2					7151
	2					13
11	2			2		689	use Carp 'confess';
	2					7
	2					38061
12
13							our $VERSION = '0.005';
14
15
16							sub new
17							{
18	1			1	1	1300	my ($class, %args) = @_;
19
20							my $s = bless {
21							url_pattern => 'https?://.+',
22							agent => "WWW-Crawler-Lite/$VERSION $^O",
23							http_accept => [qw( text/html text/plain application/xhtml+xml )],
24	4			4		867	on_new_urls => sub { my @urls = @_; },
25	0			0		0	on_bad_url => sub { my ($bad_url) = @_; },
26	0			0		0	on_response => sub { my ($url, $http_response) = @_; },
27	0			0		0	on_link => sub { my ($from, $to, $text) = @_ },
28	654			654		17017	follow_ok => sub { my ($url) = @_; return 1; },
	654					1910
29	1					37	link_parser => 'default',
30							delay_seconds => 1,
31							disallowed => [ ],
32							%args,
33							urls => { },
34							_responded_urls => { },
35							RUNNING => 1,
36							IS_INITIALIZING => 1,
37							}, $class;
38	1					20	$s->{rules} = WWW::RobotRules->new( $s->agent );
39
40	1					61	return $s;
41							}# end new()
42
43							# Public read-only properties:
44	2			2	1	22	sub agent { shift->{agent} }
45	8			8	1	30	sub url_pattern { shift->{url_pattern} }
46	4			4	1	4008765	sub delay_seconds { shift->{delay_seconds} }
47	8			8	1	14	sub http_accept { @{ shift->{http_accept} } }
	8					42
48	0			0	0	0	sub is_initializing { shift->{IS_INITIALIZING} }
49	4			4	0	265	sub is_running { shift->{RUNNING} }
50	2286			2286	0	8729	sub rules { shift->{rules} }
51
52							# Public method:
53	1			1	1	13356	sub stop { shift->{RUNNING} = 0 }
54
55
56							# Public getters/setters:
57							sub on_new_urls
58							{
59	4			4	0	8	my $s = shift;
60
61	4	50				44	return @_ ? $s->{on_new_urls} = shift : $s->{on_new_urls};
62							}# end on_new_urls()
63
64							sub on_bad_url
65							{
66	0			0	1	0	my $s = shift;
67
68	0	0				0	return @_ ? $s->{on_bad_url} = shift : $s->{on_bad_url};
69							}# end on_bad_url()
70
71							sub on_response
72							{
73	4			4	1	12	my $s = shift;
74
75	4	50				36	return @_ ? $s->{on_response} = shift : $s->{on_response};
76							}# end on_response()
77
78							sub on_link
79							{
80	617			617	1	867	my $s = shift;
81
82	617	50				3129	return @_ ? $s->{on_link} = shift : $s->{on_link};
83							}# end on_link()
84
85
86							sub follow_ok
87							{
88	654			654	0	948	my $s = shift;
89
90	654	50				2674	return @_ ? $s->{follow_ok} = shift : $s->{follow_ok};
91							}# end follow_ok()
92
93
94							sub link_parser
95							{
96	8			8	1	14	my $s = shift;
97
98	8	50				64	return @_ ? $s->{link_parser} = shift : $s->{link_parser};
99							}# end link_parser()
100
101
102							sub url_count
103							{
104	0			0	0	0	my ($s) = @_;
105
106	0					0	return scalar( keys %{ $s->{urls} } );
	0					0
107							}# end url_count()
108
109
110							sub crawl
111							{
112	1			1	0	6	my ($s, %args) = @_;
113
114	1	50				5	confess "Require param 'url' not provided" unless $args{url};
115
116	1					4	my $ua = LWP::UserAgent->new( agent => $s->agent );
117							$ua->add_handler( response_header => sub {
118	5			5		2434942	my ($response, $ua, $h) = @_;
119	5	50	50			35	my ($type) = split /\;/, ( $response->header('content-type') \|\| '' )
120							or die "no mime type provided by server";
121	5	50				291	grep { $type =~ m{\Q$_\E}i } $s->http_accept
	15					407
122							or die "unwanted mime type '$type'";
123	1					14080	});
124
125							# Try to find robots.txt:
126	1					136	my ($proto, $domain) = $args{url} =~ m{^(https?)://(.*?)(?:/\|$)};
127	1					2	eval {
128	1					8	local $SIG{__DIE__} = \&confess;
129	1					6	my $robots_url = "$proto://$domain/robots.txt";
130	1					7	my $res = $ua->request( GET $robots_url );
131
132							# If robots.txt has extra newlines in it, the rules parser always allows (which is bad):
133	1					1080	(my $robots_txt = $res->content) =~ s/[\r?\n]{2,}/\n/sg;
134	1	50	33			28	$s->rules->parse( $robots_url, $robots_txt )
			33
135							if $res && $res->is_success && $res->content;
136							};
137	1	50				1880	warn "Error fetching/parsing robots.txt: $@" if $@;
138
139	1					8	$s->{urls}->{$args{url}} = 'taken';
140	1					7	my $res = $ua->request( GET $args{url} );
141	1					1384624	$s->_parse_result( $args{url}, $res );
142
143	1					34	while( my $url = $s->_take_url() )
144							{
145	4					21	usleep( $s->delay_seconds * 1_000_000 );
146	4	100				60	last unless $s->is_running;
147
148	3					33	my $res = $ua->request( GET $url );
149	3					944005	my ($type) = split /\;/, $res->header('content-type');
150
151							# Only parse responses that are of the correct MIME type:
152	9					190	$s->_parse_result( $url, $res )
153	3	50				200	if grep { $type =~ m{\Q$_\E}i } $s->http_accept;
154							}# end while()
155							}# end crawl()
156
157
158							sub _take_url
159							{
160	4			4		13	my ($s) = @_;
161
162	4					7	my $url;
163	2285					396857	SCOPE: {
164	4	50				8	($url) = grep { $s->rules->allowed( $_ ) } grep { $s->{urls}->{$_} eq 'new' } keys %{ $s->{urls} }
	4					8
	2295					6471
	4					617
165							or return;
166	4					1316	$s->{urls}->{$url} = 'taken';
167							};
168	4					26	return $url;
169							}# end _take_url()
170
171
172							sub _parse_result
173							{
174	4			4		15	my ($s, $url, $res) = @_;
175
176	4					20	my $base = $res->base;
177	4					2279	my @new_urls = ( );
178
179	4	50				20	if( $s->link_parser eq 'HTML::LinkExtor' )
		50
180							{
181							# This option added after the original regexp way was pointed out on perlmonks:
182							# http://www.perlmonks.org/?node_id=946548
183							my $cb = sub {
184	0			0		0	my ($tag, %attrs) = @_;
185	0	0				0	return unless uc($tag) eq 'A';
186	0	0				0	if( $s->follow_ok->( $attrs{href} ) )
187							{
188	0		0			0	push @new_urls, { href => $attrs{href}, text => $attrs{title} \|\| $attrs{alt} };
189							}# end if()
190	0					0	};
191	0					0	my $parser = HTML::LinkExtor->new($cb, $base);
192	0					0	$parser->parse( $res->content );
193							}
194							elsif( $s->link_parser eq 'default' )
195							{
196							# This method might be a bit naive, but HTML::LinkExtor (AFAIK) doesn't allow
197							# me to get at the text within a hyperlink.
198							# I'm open to alternatives and recognise the problems inherent in parsing
199							# HTML with regexps.
200	4					23	(my $tmp = $res->content) =~ s{(.*?)}{
201	654					2381	my ($href,$anchortext) = ( $1, $2 );
202	654	100				1929	if( $anchortext =~ m/
203							{
204	13					91	my ($alt) = join ". ", $anchortext =~ m/alt\s\=\s"(.*?)"/sig;
205	13					73	$anchortext =~ s///sig;
206	13	100				56	$anchortext .= ". $alt" if $alt;
207							}# end if()
208	654					868	$anchortext =~ s{]}{}sg;
209	654	50				2352	if( my ($quote) = $href =~ m/^(['"])/ )
210							{
211	654					3836	($href) = $href =~ m/^$quote(.*?)$quote/;
212							}
213							else
214							{
215	0					0	($href) = $href =~ m/^([^\s+])/;
216							}# end if()
217	654	50				1550	$href = "" unless defined($href);
218	654					995	$href =~ s/\#.*$//;
219	654	50				1200	if( $href )
220							{
221	654					2022	(my $new = url($href, $base)->abs->as_string) =~ s/\#.*$//;
222	654	50				304181	if( $s->follow_ok->( $new ) )
223							{
224	654					1717	$anchortext =~ s/^\s+//s;
225	654					1114	$anchortext =~ s/\s+$//s;
226	654					4952	push @new_urls, { href => $new, text => $anchortext };
227							}# end if()
228							}# end if()
229	654					6375	"";
230							}isgxe;
231							}# end if()
232
233	4					103	$s->on_response->( $url, $res );
234
235	4					4242	my %accepted_urls = ( );
236	4					19	SCOPE: {
237	4					9	my $pattern = $s->url_pattern;
238	594					1559	map {
239	654					766	$accepted_urls{$_}++;
240	594		100			2981	$s->{urls}->{$_} \|\|= 'new';
241							}
242							grep {
243	654					1223	my $u = $_;
244	594					4362	m/$pattern/ &&
245							! exists($s->{urls}->{$u}) &&
246							! grep {
247							$u =~ m{^https?://[^/]+?\Q$_\E.*}
248	654	100	33			4237	} @{$s->{disallowed}} &&
			100
249							$s->rules->allowed( $u )
250							}
251	4					17	map { $_->{href} } @new_urls;
252							};
253
254							# Send the event about this page linking to those other pages:
255	4					56	my $pattern = $s->url_pattern;
256	617					487325	map {
257	654					718	$s->on_link->( $url, $_->{href}, $_->{text} );
258							}
259							grep {
260	4					13	my $u = $_;
261	617					10801	$u->{href} =~ m/$pattern/ &&
262							! grep {
263							$u->{href} =~ m{^https?://[^/]+?\Q$_\E.*}
264	654	100	33			4208	} @{$s->{disallowed}} &&
265							$s->rules->allowed( $u->{href} )
266							} @new_urls;
267
268	4					3407	$s->on_new_urls->( keys(%accepted_urls) );
269							}# end _parse_result()
270
271							1;# return true:
272
273							=pod
274
275							=head1 NAME
276
277							WWW::Crawler::Lite - A single-threaded crawler/spider for the web.
278
279							=head1 SYNOPSIS
280
281							my %pages = ( );
282							my $pattern = 'https?://example\.com\/';
283							my %links = ( );
284							my $downloaded = 0;
285
286							my $crawler;
287							$crawler = WWW::Crawler::Lite->new(
288							agent => 'MySuperBot/1.0',
289							url_pattern => $pattern,
290							http_accept => [qw( text/plain text/html application/xhtml+xml )],
291							link_parser => 'default',
292							on_response => sub {
293							my ($url, $res) = @_;
294
295							warn "$url contains " . $res->content;
296							$downloaded++;
297							$crawler->stop() if $downloaded++ > 5;
298							},
299							follow_ok => sub {
300							my ($url) = @_;
301							# If you like this url and want to use it, then return a true value:
302							return 1;
303							},
304							on_link => sub {
305							my ($from, $to, $text) = @_;
306
307							return if exists($pages{$to}) && $pages{$to} eq 'BAD';
308							$pages{$to}++;
309							$links{$to} \|\|= [ ];
310							push @{$links{$to}}, { from => $from, text => $text };
311							},
312							on_bad_url => sub {
313							my ($url) = @_;
314
315							# Mark this url as 'bad':
316							$pages{$url} = 'BAD';
317							}
318							);
319							$crawler->crawl( url => "http://example.com/" );
320
321							warn "DONE!!!!!";
322
323							use Data::Dumper;
324							map {
325							warn "$_ ($pages{$_} incoming links) -> " . Dumper($links{$_})
326							} sort keys %links;
327
328							=head1 DESCRIPTION
329
330							C is a single-threaded spider/crawler for the web. It can
331							be used within a mod_perl, CGI or Catalyst-style environment because it does not
332							fork or use threads.
333
334							The callback-based interface is fast and simple, allowing you to focus on simply
335							processing the data that C extracts from the target website.
336
337							=head1 PUBLIC METHODS
338
339							=head2 new( %args )
340
341							Creates and returns a new C object.
342
343							The C<%args> hash is not required, but may contain the following elements:
344
345							=over 4
346
347							=item agent - String
348
349							Used as the user-agent string for HTTP requests.
350
351							B - C
352
353							=item url_pattern - RegExp or String
354
355							New links that do not match this pattern will not be added to the processing queue.
356
357							B C
358
359							=item http_accept - ArrayRef
360
361							This can be used to filter out unwanted responses.
362
363							=item link_parser - String
364
365							Valid values: 'C' and 'C'
366
367							The default value is 'C' which uses a naive regexp to do the link parsing.
368
369							The upshot of using 'C' is that the regexp will also find the hyperlinked
370							text or alt-text (of a hyperlinked img tag) and give that to your 'C' handler.
371
372							B C<[qw( text/html text/plain application/xhtml+xml )]>
373
374							=item on_response($url, $response) - CodeRef
375
376							Called whenever a successful response is returned.
377
378							=item on_link($from, $to, $text) - CodeRef
379
380							Called whenever a new link is found. Arguments are:
381
382							=over 8
383
384							=item $from
385
386							The URL that is linked from
387
388							=item $to
389
390							The URL that is linked to
391
392							=item $text
393
394							The anchor text (eg: The HTML within the link - B)
395
396							=back
397
398							=item on_bad_url($url) - CodeRef
399
400							Called whenever an unsuccessful response is received.
401
402							=item delay_seconds - Number
403
404							Indicates the length of time (in seconds) that the crawler should pause before making
405							each request. This can be useful when you want to spider a website, not launch
406							a denial of service attack on it.
407
408							=back
409
410							=head2 stop( )
411
412							Causes the crawler to stop processing its queue of URLs.
413
414							=head1 AUTHOR
415
416							John Drago
417
418							=head1 COPYRIGHT
419
420							This software is Free software and may be used and redistributed under the same
421							terms as perl itself.
422
423							=cut
424
425
426