package WWW::Crawler::Lite;

use strict;
use warnings 'all';
use LWP::UserAgent;
use HTTP::Request::Common;
use WWW::RobotRules;
use URI::URL;
use HTML::LinkExtor;
use Time::HiRes 'usleep';
use Carp 'confess';

our $VERSION = '0.005';

sub new
{
  my ($class, %args) = @_;

  my $s = bless {
    url_pattern     => 'https?://.+',
    agent           => "WWW-Crawler-Lite/$VERSION $^O",
    http_accept     => [qw( text/html text/plain application/xhtml+xml )],
    on_new_urls     => sub { my @urls = @_; },
    on_bad_url      => sub { my ($bad_url) = @_; },
    on_response     => sub { my ($url, $http_response) = @_; },
    on_link         => sub { my ($from, $to, $text) = @_ },
    follow_ok       => sub { my ($url) = @_; return 1; },
    link_parser     => 'default',
    delay_seconds   => 1,
    disallowed      => [ ],
    %args,
    urls            => { },
    _responded_urls => { },
    RUNNING         => 1,
    IS_INITIALIZING => 1,
  }, $class;
  $s->{rules} = WWW::RobotRules->new( $s->agent );

  return $s;
}# end new()

# Public read-only properties:
sub agent           { shift->{agent} }
sub url_pattern     { shift->{url_pattern} }
sub delay_seconds   { shift->{delay_seconds} }
sub http_accept     { @{ shift->{http_accept} } }
sub is_initializing { shift->{IS_INITIALIZING} }
sub is_running      { shift->{RUNNING} }
sub rules           { shift->{rules} }

# Public method:
sub stop { shift->{RUNNING} = 0 }

# Public getters/setters:
sub on_new_urls
{
  my $s = shift;

  return @_ ? $s->{on_new_urls} = shift : $s->{on_new_urls};
}# end on_new_urls()

sub on_bad_url
{
  my $s = shift;

  return @_ ? $s->{on_bad_url} = shift : $s->{on_bad_url};
}# end on_bad_url()

sub on_response
{
  my $s = shift;

  return @_ ? $s->{on_response} = shift : $s->{on_response};
}# end on_response()

sub on_link
{
  my $s = shift;

  return @_ ? $s->{on_link} = shift : $s->{on_link};
}# end on_link()

sub follow_ok
{
  my $s = shift;

  return @_ ? $s->{follow_ok} = shift : $s->{follow_ok};
}# end follow_ok()

sub link_parser
{
  my $s = shift;

  return @_ ? $s->{link_parser} = shift : $s->{link_parser};
}# end link_parser()

sub url_count
{
  my ($s) = @_;

  return scalar( keys %{ $s->{urls} } );
}# end url_count()

sub crawl
{
  my ($s, %args) = @_;

  confess "Required param 'url' not provided" unless $args{url};

  my $ua = LWP::UserAgent->new( agent => $s->agent );

  # Abort a download early (by dying in this handler) if the response is not one
  # of the accepted MIME types:
  $ua->add_handler( response_header => sub {
    my ($response, $ua, $h) = @_;
    my ($type) = split /\;/, ( $response->header('content-type') || '' )
      or die "no mime type provided by server";
    grep { $type =~ m{\Q$_\E}i } $s->http_accept
      or die "unwanted mime type '$type'";
  });

  # Try to find robots.txt:
  my ($proto, $domain) = $args{url} =~ m{^(https?)://(.*?)(?:/|$)};
  eval {
    local $SIG{__DIE__} = \&confess;
    my $robots_url = "$proto://$domain/robots.txt";
    my $res = $ua->request( GET $robots_url );

    # If robots.txt has extra newlines in it, the rules parser always allows (which is bad):
    (my $robots_txt = $res->content) =~ s/(?:\r?\n){2,}/\n/sg;
    $s->rules->parse( $robots_url, $robots_txt )
      if $res && $res->is_success && $res->content;
  };
  warn "Error fetching/parsing robots.txt: $@" if $@;

  $s->{urls}->{$args{url}} = 'taken';
  my $res = $ua->request( GET $args{url} );
  $s->_parse_result( $args{url}, $res );

  while( my $url = $s->_take_url() )
  {
    usleep( $s->delay_seconds * 1_000_000 );
    last unless $s->is_running;

    my $res = $ua->request( GET $url );
    my ($type) = split /\;/, $res->header('content-type');

    # Only parse responses that are of the correct MIME type:
    $s->_parse_result( $url, $res )
      if grep { $type =~ m{\Q$_\E}i } $s->http_accept;
  }# end while()
}# end crawl()

sub _take_url
{
  my ($s) = @_;

  my $url;
  SCOPE: {
    ($url) = grep { $s->rules->allowed( $_ ) } grep { $s->{urls}->{$_} eq 'new' } keys %{ $s->{urls} }
      or return;
    $s->{urls}->{$url} = 'taken';
  };
  return $url;
}# end _take_url()

sub _parse_result
{
  my ($s, $url, $res) = @_;

  my $base = $res->base;
  my @new_urls = ( );

  if( $s->link_parser eq 'HTML::LinkExtor' )
  {
    # This option added after the original regexp way was pointed out on perlmonks:
    # http://www.perlmonks.org/?node_id=946548
    my $cb = sub {
      my ($tag, %attrs) = @_;
      return unless uc($tag) eq 'A';
      if( $s->follow_ok->( $attrs{href} ) )
      {
        push @new_urls, { href => $attrs{href}, text => $attrs{title} || $attrs{alt} };
      }# end if()
    };
    my $parser = HTML::LinkExtor->new($cb, $base);
    $parser->parse( $res->content );
  }
  elsif( $s->link_parser eq 'default' )
  {
    # This method might be a bit naive, but HTML::LinkExtor (AFAIK) doesn't allow
    # me to get at the text within a hyperlink.
    # I'm open to alternatives and recognise the problems inherent in parsing
    # HTML with regexps.
    (my $tmp = $res->content) =~ s{<a\s+.*?href\s*\=\s*(.+?)>(.*?)</a>}{
      my ($href,$anchortext) = ( $1, $2 );
      if( $anchortext =~ m/<img/i )
      {
        my ($alt) = join ". ", $anchortext =~ m/alt\s*\=\s*"(.*?)"/sig;
        $anchortext =~ s/<img.*?>//sig;
        $anchortext .= ". $alt" if $alt;
      }# end if()
      $anchortext =~ s{</?.*?[/>]}{}sg;
      if( my ($quote) = $href =~ m/^(['"])/ )
      {
        ($href) = $href =~ m/^$quote(.*?)$quote/;
      }
      else
      {
        # Unquoted href value - take everything up to the first whitespace:
        ($href) = $href =~ m/^(\S+)/;
      }# end if()
      $href = "" unless defined($href);
      $href =~ s/\#.*$//;
      if( $href )
      {
        (my $new = url($href, $base)->abs->as_string) =~ s/\#.*$//;
        if( $s->follow_ok->( $new ) )
        {
          $anchortext =~ s/^\s+//s;
          $anchortext =~ s/\s+$//s;
          push @new_urls, { href => $new, text => $anchortext };
        }# end if()
      }# end if()
      "";
    }isgxe;
  }# end if()

  $s->on_response->( $url, $res );

  # Queue any new URLs that match url_pattern, are not disallowed and pass robots.txt:
  my %accepted_urls = ( );
  SCOPE: {
    my $pattern = $s->url_pattern;
    map {
      $accepted_urls{$_}++;
      $s->{urls}->{$_} ||= 'new';
    }
    grep {
      my $u = $_;
      m/$pattern/ &&
      ! exists($s->{urls}->{$u}) &&
      ! grep {
        $u =~ m{^https?://[^/]+?\Q$_\E.*}
      } @{$s->{disallowed}} &&
      $s->rules->allowed( $u )
    }
    map { $_->{href} } @new_urls;
  };

  # Send the event about this page linking to those other pages:
  my $pattern = $s->url_pattern;
  map {
    $s->on_link->( $url, $_->{href}, $_->{text} );
  }
  grep {
    my $u = $_;
    $u->{href} =~ m/$pattern/ &&
    ! grep {
      $u->{href} =~ m{^https?://[^/]+?\Q$_\E.*}
    } @{$s->{disallowed}} &&
    $s->rules->allowed( $u->{href} )
  } @new_urls;

  $s->on_new_urls->( keys(%accepted_urls) );
}# end _parse_result()

1;# return true:

=pod

=head1 NAME

WWW::Crawler::Lite - A single-threaded crawler/spider for the web.

=head1 SYNOPSIS

  my %pages = ( );
  my $pattern = 'https?://example\.com\/';
  my %links = ( );
  my $downloaded = 0;

  my $crawler;
  $crawler = WWW::Crawler::Lite->new(
    agent       => 'MySuperBot/1.0',
    url_pattern => $pattern,
    http_accept => [qw( text/plain text/html application/xhtml+xml )],
    link_parser => 'default',
    on_response => sub {
      my ($url, $res) = @_;

      warn "$url contains " . $res->content;
      $downloaded++;
      $crawler->stop() if $downloaded > 5;
    },
    follow_ok => sub {
      my ($url) = @_;
      # If you like this url and want to use it, then return a true value:
      return 1;
    },
    on_link => sub {
      my ($from, $to, $text) = @_;

      return if exists($pages{$to}) && $pages{$to} eq 'BAD';
      $pages{$to}++;
      $links{$to} ||= [ ];
      push @{$links{$to}}, { from => $from, text => $text };
    },
    on_bad_url => sub {
      my ($url) = @_;

      # Mark this url as 'bad':
      $pages{$url} = 'BAD';
    }
  );
  $crawler->crawl( url => "http://example.com/" );

  warn "DONE!!!!!";

  use Data::Dumper;
  map {
    warn "$_ ($pages{$_} incoming links) -> " . Dumper($links{$_})
  } sort keys %links;

=head1 DESCRIPTION

C<WWW::Crawler::Lite> is a single-threaded spider/crawler for the web. It can
be used within a mod_perl, CGI or Catalyst-style environment because it does not
fork or use threads.

The callback-based interface is fast and simple, allowing you to focus on
processing the data that C<WWW::Crawler::Lite> extracts from the target website.

=head1 PUBLIC METHODS

=head2 new( %args )

Creates and returns a new C<WWW::Crawler::Lite> object.

The C<%args> hash is not required, but may contain the following elements:

=over 4

=item agent - String

Used as the user-agent string for HTTP requests.

B<Default> - C<"WWW-Crawler-Lite/$VERSION $^O">

=item url_pattern - RegExp or String

New links that do not match this pattern will not be added to the processing queue.

B<Default> - C<'https?://.+'>

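For example, a pattern that keeps the crawl on a single (hypothetical) C<example.com> site:

  my $crawler = WWW::Crawler::Lite->new(
    url_pattern => 'https?://(www\.)?example\.com/.*',
  );
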
=item http_accept - ArrayRef

This can be used to filter out unwanted responses.  Responses whose C<Content-Type>
does not match one of these MIME types are not parsed for links, and their download
is aborted as soon as the response headers arrive.

B<Default> - C<[qw( text/html text/plain application/xhtml+xml )]>

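For example, to accept only HTML responses (a minimal illustration):

  my $crawler = WWW::Crawler::Lite->new(
    http_accept => [qw( text/html application/xhtml+xml )],
  );
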
=item link_parser - String

Valid values: 'C<default>' and 'C<HTML::LinkExtor>'

The default value is 'C<default>', which uses a naive regexp to do the link parsing.

The upshot of using 'C<default>' is that the regexp will also find the hyperlinked
text or alt-text (of a hyperlinked img tag) and give that to your 'C<on_link>' handler.

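To use L<HTML::LinkExtor> for link extraction instead, pass its name (note that the
C<$text> argument to your C<on_link> handler will then only contain the link's
C<title> or C<alt> attribute, if any):

  my $crawler = WWW::Crawler::Lite->new(
    link_parser => 'HTML::LinkExtor',
  );
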
=item on_response($url, $response) - CodeRef

Called whenever a successful response is returned.  C<$response> is the
L<HTTP::Response> object that was received for C<$url>.

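A minimal handler sketch, logging each response:

  on_response => sub {
    my ($url, $res) = @_;
    printf "%s -> %s (%d bytes)\n", $url, $res->code, length($res->content);
  },
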
=item on_link($from, $to, $text) - CodeRef

Called whenever a new link is found. Arguments are:

=over 8

=item $from

The URL that is linked *from*

=item $to

The URL that is linked *to*

=item $text

The anchor text (i.e. the text or HTML found between the link's opening and closing tags).

=back

=item on_bad_url($url) - CodeRef

Called whenever an unsuccessful response is received.

=item delay_seconds - Number

Indicates the length of time (in seconds) that the crawler should pause before making
each request. This can be useful when you want to spider a website, not launch
a denial of service attack on it.

B<Default> - C<1>

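Fractional values also work, because the delay is implemented with C<Time::HiRes::usleep>.
For example, to wait half a second between requests:

  my $crawler = WWW::Crawler::Lite->new(
    delay_seconds => 0.5,
  );
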
=back

=head2 stop( )

Causes the crawler to stop processing its queue of URLs.

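C<stop()> is typically called from within one of your handlers, as in the SYNOPSIS.
A minimal sketch (the C<example.com> URL is just a placeholder):

  my $count = 0;
  my $crawler;
  $crawler = WWW::Crawler::Lite->new(
    url_pattern => 'https?://(www\.)?example\.com/.*',
    on_response => sub { $crawler->stop() if ++$count >= 10 },
  );
  $crawler->crawl( url => 'http://www.example.com/' );
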
=head1 AUTHOR

John Drago

=head1 COPYRIGHT

This software is Free software and may be used and redistributed under the same
terms as perl itself.

=cut