File Coverage

blib/lib/WWW/RoboCop.pm
Criterion    Covered   Total       %
statement         61      61   100.0
branch             7       8    87.5
condition                        n/a
subroutine        16      16   100.0
pod                 2       2   100.0
total              86      87    98.8


line stmt bran cond sub pod time code
1 1     1   283652 use strict;
  1         6  
  1         31  
2 1     1   6 use warnings;
  1         2  
  1         31  
3 1     1   5 use feature qw( state );
  1         2  
  1         167  
4              
5             package WWW::RoboCop;
6             our $VERSION = '0.000100';
7 1     1   9 use Carp qw( croak );
  1         2  
  1         50  
8 1     1   643 use Moo;
  1         10972  
  1         8  
9 1     1   2150 use MooX::HandlesVia;
  1         11249  
  1         7  
10 1     1   658 use MooX::StrictConstructor;
  1         4927  
  1         5  
11 1     1   24477 use Mozilla::CA;
  1         304  
  1         34  
12 1     1   516 use Type::Params qw( compile );
  1         100254  
  1         10  
13 1     1   331 use Types::Standard qw( CodeRef HashRef InstanceOf );
  1         3  
  1         5  
14 1     1   1240 use Types::URI -all;
  1         110450  
  1         13  
15 1     1   2924 use URI;
  1         3  
  1         39  
16 1     1   7 use WWW::Mechanize;
  1         2  
  1         699  
17              
18             has is_url_allowed => (
19             is => 'ro',
20             isa => CodeRef,
21             handles_via => 'Code',
22             handles => { _should_follow_link => 'execute' },
23             required => 1,
24             );
25              
26             has report_for_url => (
27             is => 'ro',
28             isa => CodeRef,
29             handles_via => 'Code',
30             handles => { _log_response => 'execute' },
31             default => sub {
32             sub {
33             my $response = shift; # HTTP::Response object
34             my $referring_url = shift; # URI object
35             return {
36             $response->redirects
37             ? (
38             redirects => [
39             map {
40             {
41             status => $_->code,
42             uri => $_->request->uri,
43             }
44             } $response->redirects
45             ],
46             target_url => $response->request->uri,
47             )
48             : (),
49             referrer => $referring_url,
50             status => $response->code,
51             };
52             };
53             },
54             );
55              
56             has ua => (
57             is => 'ro',
58             isa => InstanceOf ['WWW::Mechanize'],
59             default => sub {
60             WWW::Mechanize->new( autocheck => 0 );
61             },
62             );
63              
64             has _history => (
65             is => 'ro',
66             isa => HashRef,
67             handles_via => 'Hash',
68             handles => {
69             _add_url_to_history => 'set',
70             _has_processed_url => 'exists',
71             },
72             init_arg => undef,
73             default => sub { +{} },
74             );
75              
76             sub _get {
77 2     2   6 my $self = shift;
78 2         4 my $url = shift;
79 2         5 my $referring_url = shift;
80              
81 2         46 my $response = $self->ua->get($url);
82 2         18401 my $report = $self->_log_response( $response, $referring_url );
83 2         1082 $self->_add_url_to_history( $url, $report );
84              
85 2         423 my @links = $self->ua->find_all_links;
86              
87 2         3453 foreach my $link (@links) {
88              
89             # this just points back at the same url
90 6 100       594 next if substr( $link->url, 0, 1 ) eq '#';
91              
92 4         30 my $uri = URI->new( $link->url_abs );
93 4         4618 $uri->fragment(undef); # fragments result in duplicate urls
94              
95 4 100       195 next if $self->_has_processed_url($uri);
96 3 50       194 next unless $uri->can('host'); # no mailto: links
97 3 100       55 next unless $self->_should_follow_link( $link, $url );
98              
99 1         341 $self->_get( $uri, $url );
100             }
101             }
102              
103             sub crawl {
104 1     1 1 1489 my $self = shift;
105              
106 1         37 state $check = compile(Uri);
107 1         8570 my ($url) = $check->(@_);
108              
109 1         39 $self->_get($url);
110             }
111              
112             sub get_report {
113 1     1 1 40 my $self = shift;
114 1         3 return %{ $self->_history };
  1         12  
115             }
116              
117             1;
118              
119             =pod
120              
121             =encoding UTF-8
122              
123             =head1 NAME
124              
125             WWW::RoboCop - Police your URLs!
126              
127             =head1 VERSION
128              
129             version 0.000100
130              
131             =head1 SYNOPSIS
132              
133             use feature qw( state );
134              
135             use WWW::RoboCop;
136              
137             my $robocop = WWW::RoboCop->new(
138             is_url_allowed => sub {
139             state $count = 0;
140             return $count++ < 5; # just crawl 5 URLs
141             },
142             );
143              
144             $robocop->crawl( 'http://host.myhost.com/start' );
145              
146             my %history = $robocop->get_report;
147              
148             # %history = (
149             # 'http://myhost.com/one' => { status => 200, ... },
150             # 'http://myhost.com/two' => { status => 404, ... },
151             # ...
152             # )
153              
154             =head1 DESCRIPTION
155              
156             BETA BETA BETA!
157              
158             C<WWW::RoboCop> is a dead simple, somewhat opinionated robot. Given a starting
159             page, this module will crawl only URLs which have been allowed by the
160             C<is_url_allowed> callback. It then creates a report of all visited pages,
161             keyed on URL. You are encouraged to provide your own report creation callback
162             so that you can collect all of the information which you require for each URL.
163              
164             =head1 CONSTRUCTOR AND STARTUP
165              
166             =head2 new()
167              
168             Creates and returns a new C<WWW::RoboCop> object.
169              
170             Below are the arguments which you may pass to C<new> when creating an object.
171              
172             =head3 is_url_allowed
173              
174             This argument is required. You must provide an anonymous subroutine which will
175             return true or false based on some arbitrary criteria which you provide. The
176             two arguments to this anonymous subroutine will be a L<WWW::Mechanize::Link>
177             object as well as the referring URL, in the form of a L<URI> object.
178              
179             Your sub might look something like this:
180              
181             use feature qw( state );
182              
183             use URI;
184             use WWW::RoboCop;
185              
186             my $robocop = WWW::RoboCop->new(
187             is_url_allowed => sub {
188             my $link = shift;
189             my $referring_url = shift;
190              
191             my $upper_limit = 100;
192             my $host = 'some.host.com';
193              
194             state $limit = 0;
195              
196             return 0 if $limit > $upper_limit;
197             my $uri = URI->new( $link->url_abs );
198              
199             # if the referring_url matches the host then this is a 1st degree
200             # outbound web link
201              
202             if ( $uri->host eq $host || $referring_url->host eq $host ) {
203             ++$limit;
204             return 1;
205             }
206             return 0;
207             }
208             );
209              
210             =head3 report_for_url
211              
212             This argument is not required, but is highly recommended. The arguments to this
213             anonymous subroutine will be an L<HTTP::Response> object as well as the
214             referring URL in the form of a L<URI> object. Your sub might look something
215             like this:
216              
217             my $reporter = sub {
218             my $response = shift; # HTTP::Response object
219             my $referring_url = shift; # URI object
220             return {
221             redirects => [
222             map { +{ status => $_->code, uri => $_->base->as_string } }
223             $response->redirects
224             ],
225             referrer => $referring_url,
226             status => $response->code,
227             };
228             };
229              
230             my $robocop = WWW::RoboCop->new(
231             is_url_allowed => sub { ... },
232             report_for_url => $reporter,
233             );
234              
235             That would give you a HashRef with the status code for each link visited (200,
236             404, 500, etc.) as well as the referring URL (the page on which the link was
237             found) and a list of any redirects which were followed in order to get to this
238             URL.
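
                For example, with the reporter above, a single entry in the resulting report
                might look something like this (the values are purely illustrative, and the
                C<referrer> value is really a L<URI> object rather than a plain string):

                    'http://myhost.com/one' => {
                        redirects => [
                            { status => 301, uri => 'http://myhost.com/old-one' },
                        ],
                        referrer => 'http://myhost.com/start',
                        status   => 200,
                    },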
239              
240             The default C<report_for_url> sub will already provide something like the
241             above, but you should only treat this as a stub method while you get up and
242             running. Since it's only meant to be an example, the format of the default
243             report could change at some future date without notice. You should not rely
244             on it remaining consistent in the future. If you are going to rely on this
245             module, you should provide your own reporting logic.
246              
247             =head3 ua( WWW::Mechanize )
248              
249             You can provide your own UserAgent object to this class. It should be of the
250             L<WWW::Mechanize> family. If you're looking for a significant speed boost
251             while under development, consider providing a L<WWW::Mechanize::Cached> object.
252             This can give you enough of a speedup to save you from getting distracted
253             and going off to read Hacker News while you wait.
254              
255             use CHI;
256             use WWW::Mechanize::Cached;
257             use WWW::RoboCop;
258              
259             my $cache = CHI->new(
260             driver => 'File',
261             root_dir => '/tmp/cache-example',
262             );
263              
264             my $robocop = WWW::RoboCop->new(
265             is_url_allowed => sub { ... },
266             ua => WWW::Mechanize::Cached->new( cache => $cache ),
267             );
268              
269             If you're not using a Cached agent, be sure to disable C<autocheck>.
270              
271             my $robocop = WWW::RoboCop->new(
272             is_url_allowed => sub { ... },
273             ua => WWW::Mechanize->new( autocheck => 0 ),
274             );
275              
276             =head2 crawl( $url )
277              
278             This method sets the C<WWW::RoboCop> in motion. The robot will only come to a
279             halt once it has exhausted all of the allowed URLs it can find.
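
                The URL is validated with L<Types::URI>, which should accept either a plain
                string or a L<URI> object. As a minimal sketch (assuming a C<$robocop> object
                constructed as in the SYNOPSIS):

                    use URI;

                    # these two calls are equivalent
                    $robocop->crawl('http://host.myhost.com/start');
                    $robocop->crawl( URI->new('http://host.myhost.com/start') );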
280              
281             =head2 get_report
282              
283             This method returns a Hash of crawling results, keyed on the URLs visited.
284             By default, it returns a fairly simple Hash, containing the status code,
285             referring URL and any redirects for each visited URL. You are encouraged to
286             provide your own callback so that you can get a more detailed report. You can
287             do this by providing a C<report_for_url> callback when instantiating the object.
288              
289             The default report looks something like this:
290              
291             # %history = (
292             # 'http://myhost.com/one' => { status => 200, ... },
293             # 'http://myhost.com/two' => { status => 404, ... },
294             # )
295              
296             See examples/crawl-host.pl, which is included with this distribution, to get a
297             dump of the default report.
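
                As a minimal sketch, assuming the default report (which includes a C<status>
                key for every visited URL), you could summarize a crawl like this:

                    my %history = $robocop->get_report;

                    foreach my $url ( sort keys %history ) {
                        printf "%s %s\n", $history{$url}{status}, $url;
                    }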
298              
299             =head1 AUTHOR
300              
301             Olaf Alders <olaf@wundercounter.com>
302              
303             =head1 COPYRIGHT AND LICENSE
304              
305             This software is Copyright (c) 2015 by MaxMind, Inc.
306              
307             This is free software, licensed under:
308              
309             The Artistic License 2.0 (GPL Compatible)
310              
311             =cut
312              
313             __END__
314              
315             # ABSTRACT: Police your URLs!
316