File Coverage

blib/lib/WWW/RoboCop.pm
Criterion   Covered  Total      %
statement        55     55  100.0
branch            7      8   87.5
condition       n/a    n/a    n/a
subroutine       14     14  100.0
pod               2      2  100.0
total            78     79   98.7


line stmt bran cond sub pod time code
1 1     1   361616 use strict;
  1         11  
  1         29  
2 1     1   5 use warnings;
  1         2  
  1         26  
3 1     1   6 use feature qw( state );
  1         2  
  1         176  
4              
5             package WWW::RoboCop;
6             our $VERSION = '0.000102';
7              
8 1     1   1023 use Moo;
  1         11017  
  1         4  
9              
10 1     1   2018 use MooX::HandlesVia;
  1         10076  
  1         8  
11 1     1   571 use MooX::StrictConstructor;
  1         5200  
  1         12  
12 1     1   24760 use Type::Params qw( compile );
  1         103933  
  1         9  
13 1     1   247 use Types::Standard qw( CodeRef HashRef InstanceOf );
  1         3  
  1         5  
14 1     1   1136 use Types::URI qw( Uri );
  1         101936  
  1         12  
15 1     1   476 use URI ();
  1         2  
  1         22  
16 1     1   5 use WWW::Mechanize ();
  1         2  
  1         654  
17              
18             has is_url_allowed => (
19             is => 'ro',
20             isa => CodeRef,
21             handles_via => 'Code',
22             handles => { _should_follow_link => 'execute' },
23             required => 1,
24             );
25              
26             has report_for_url => (
27             is => 'ro',
28             isa => CodeRef,
29             handles_via => 'Code',
30             handles => { _log_response => 'execute' },
31             default => sub {
32             sub {
33             my $response = shift; # HTTP::Response object
34             my $referring_url = shift; # URI object
35             return {
36             $response->redirects
37             ? (
38             redirects => [
39             map {
40             {
41             status => $_->code,
42             uri => $_->request->uri,
43             }
44             } $response->redirects
45             ],
46             target_url => $response->request->uri,
47             )
48             : (),
49             referrer => $referring_url,
50             status => $response->code,
51             };
52             };
53             },
54             );
55              
56             has ua => (
57             is => 'ro',
58             isa => InstanceOf ['WWW::Mechanize'],
59             default => sub {
60             WWW::Mechanize->new( autocheck => 0 );
61             },
62             );
63              
64             has _history => (
65             is => 'ro',
66             isa => HashRef,
67             handles_via => 'Hash',
68             handles => {
69             _add_url_to_history => 'set',
70             _has_processed_url => 'exists',
71             },
72             init_arg => undef,
73             default => sub { +{} },
74             );
75              
76             sub _get {
77 2     2   5 my $self = shift;
78 2         16 my $url = shift;
79 2         5 my $referring_url = shift;
80              
81 2         22 my $response = $self->ua->get($url);
82 2         22510 my $report = $self->_log_response( $response, $referring_url );
83 2         1233 $self->_add_url_to_history( $url, $report );
84              
85 2         409 my @links = $self->ua->find_all_links;
86              
87 2         3818 foreach my $link (@links) {
88              
89             # this just points back at the same url
90 6 100       711 next if substr( $link->url, 0, 1 ) eq '#';
91              
92 4         102 my $uri = URI->new( $link->url_abs );
93 4         5139 $uri->fragment(undef); # fragments result in duplicate urls
94              
95 4 100       219 next if $self->_has_processed_url($uri);
96 3 50       197 next unless $uri->can('host'); # no mailto: links
97 3 100       73 next unless $self->_should_follow_link( $link, $url );
98              
99 1         310 $self->_get( $uri, $url );
100             }
101             }
102              
103             sub crawl {
104 1     1 1 1599 my $self = shift;
105              
106 1         31 state $check = compile(Uri);
107 1         8098 my ($url) = $check->(@_);
108              
109 1         38 $self->_get($url);
110             }
111              
112             sub get_report {
113 1     1 1 28 my $self = shift;
114 1         2 return %{ $self->_history };
  1         11  
115             }
116              
117             1;
118              
119             =pod
120              
121             =encoding UTF-8
122              
123             =head1 NAME
124              
125             WWW::RoboCop - Police your URLs!
126              
127             =head1 VERSION
128              
129             version 0.000102
130              
131             =head1 SYNOPSIS
132              
133             use feature qw( state );
134              
135             use WWW::RoboCop ();
136              
137             my $robocop = WWW::RoboCop->new(
138             is_url_allowed => sub {
139             state $count = 0;
140             return $count++ < 5; # just crawl 5 URLs
141             },
142             );
143              
144             $robocop->crawl( 'https://example.com' );
145              
146             my %history = $robocop->get_report;
147              
148             # %history = (
149             # 'https://example.com/one' => { status => 200, ... },
150             # 'https://example.com/two' => { status => 404, ... },
151             # ...
152             # )
153              
154             See C<examples/crawl-host.pl>, which is included with this distribution, to get a
155             quick start.
156              
157             =head1 DESCRIPTION
158              
159             C<WWW::RoboCop> is a simple, somewhat opinionated robot. Given a starting
160             page, this module will crawl only URLs which have been allowed by the
161             C<is_url_allowed> callback. It then creates a report of all visited pages,
162             keyed on URL. You are encouraged to provide your own report creation callback
163             so that you can collect all of the information which you require for each URL.
164              
165             =head1 CONSTRUCTOR AND STARTUP
166              
167             =head2 new()
168              
169             Creates and returns a new C<WWW::RoboCop> object.
170              
171             Below are the arguments which you may pass to C<new> when creating an object.
172              
173             =head3 is_url_allowed
174              
175             This argument is required. You must provide an anonymous subroutine which will
176             return true or false based on some arbitrary criteria which you provide. The
177             two arguments to this anonymous subroutine will be a L<WWW::Mechanize::Link>
178             object as well as the referring URL, in the form of a L<URI> object.
179              
180             Your sub might look something like this:
181              
182             use feature qw( state );
183              
184             use URI ();
185             use WWW::RoboCop ();
186              
187             my $robocop = WWW::RoboCop->new(
188             is_url_allowed => sub {
189             my $link = shift;
190             my $referring_url = shift;
191              
192             my $upper_limit = 100;
193             my $host = 'some.host.com';
194              
195             state $limit = 0;
196              
197             return 0 if $limit > $upper_limit;
198             my $uri = URI->new( $link->url_abs );
199              
200             # if the referring_url matches the host then this is a 1st degree
201             # outbound web link
202              
203             if ( $uri->host eq $host || $referring_url->host eq $host ) {
204             ++$limit;
205             return 1;
206             }
207             return 0;
208             }
209             );
210              
211             =head3 report_for_url
212              
213             This argument is not required, but is highly recommended. The arguments to this
214             anonymous subroutine will be an L<HTTP::Response> object as well as the
215             referring URL in the form of a L<URI> object. Your sub might look something
216             like this:
217              
218             my $reporter = sub {
219             my $response = shift; # HTTP::Response object
220             my $referring_url = shift; # URI object
221             return {
222             redirects => [
223             map { +{ status => $_->code, uri => $_->base->as_string } }
224             $response->redirects
225             ],
226             referrer => $referring_url,
227             status => $response->code,
228             };
229             };
230              
231             my $robocop = WWW::RoboCop->new(
232             is_url_allowed => sub { ... },
233             report_for_url => $reporter,
234             );
235              
236             That would give you a HashRef with the status code for each link visited (200,
237             404, 500, etc) as well as the referring URL (the page on which the link was
238             found) and a list of any redirects which were followed in order to get to this
239             URL.
240              
241             The default C<report_for_url> sub will already provide something like the
242             above, but you should treat it only as a stub while you get up and running.
243             Since it's only meant to be an example, the format of the default report
244             could change at some future date without notice, so you should not expect it
245             to remain consistent. If you are going to rely on this module, you should
246             provide your own reporting logic.
247              
248             =head3 ua( WWW::Mechanize )
249              
250             You can provide your own UserAgent object to this class. It should be of the
251             L<WWW::Mechanize> family. If you're looking for a significant speed boost
252             while under development, consider providing a L<WWW::Mechanize::Cached> object.
253             This can give you enough of a speedup to save you from getting distracted
254             and going off to read Hacker News while you wait.
255              
256             use CHI ();
257             use WWW::Mechanize::Cached ();
258             use WWW::RoboCop ();
259              
260             my $cache = CHI->new(
261             driver => 'File',
262             root_dir => '/tmp/cache-example',
263             );
264              
265             my $robocop = WWW::RoboCop->new(
266             is_url_allowed => sub { ... },
267             ua => WWW::Mechanize::Cached->new( cache => $cache ),
268             );
269              
270             If you're not using a Cached agent, be sure to disable autocheck.
271              
272             my $robocop = WWW::RoboCop->new(
273             is_url_allowed => sub { ... },
274             ua => WWW::Mechanize->new( autocheck => 0 ),
275             );
276              
277             =head2 crawl( $url )
278              
279             This method sets the C<WWW::RoboCop> in motion. The robot will only come to a
280             halt once it has exhausted all of the allowed URLs it can find.
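
The URL may be given either as a plain string or as a L<URI> object; the
argument is validated (and a plain string should be coerced to a L<URI>
object) via the C<Uri> type from L<Types::URI>. A minimal sketch, reusing the
counting callback from the SYNOPSIS:

    use feature qw( state );

    use WWW::RoboCop ();

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub {
            state $count = 0;
            return $count++ < 5;    # just crawl 5 URLs
        },
    );

    # a plain string or a URI object is accepted here
    $robocop->crawl('https://example.com');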
281              
282             =head2 get_report
283              
284             This method returns a Hash of crawling results, keyed on the URLs visited.
285             By default, it returns a fairly simple Hash, containing little more than the
286             status code and referring URL of each visited URL. You are encouraged to
287             provide your own callback so that you can get a more detailed report. You can
288             do this by providing a C<report_for_url> callback when instantiating the object.
289              
290             The default report looks something like this:
291              
292             # %history = (
293             # 'http://myhost.com/one' => { status => 200, ... },
294             # 'http://myhost.com/two' => { status => 404, ... },
295             # )
296              
297             See C<examples/crawl-host.pl>, which is included with this distribution, to get
298             a dump of the default report.
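
A minimal sketch of walking the report, assuming the default
C<report_for_url> callback (whose entries include C<status> and C<referrer>):

    # $robocop constructed and crawl() already run, as in the SYNOPSIS
    my %history = $robocop->get_report;

    for my $url ( sort keys %history ) {
        my $report = $history{$url};
        printf "%s => %s (referrer: %s)\n",
            $url,
            $report->{status},
            $report->{referrer} // 'none';    # the start URL has no referrer
    }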
299              
300             =head1 AUTHOR
301              
302             Olaf Alders <olaf@wundercounter.com>
303              
304             =head1 COPYRIGHT AND LICENSE
305              
306             This software is Copyright (c) 2015 by MaxMind, Inc.
307              
308             This is free software, licensed under:
309              
310             The Artistic License 2.0 (GPL Compatible)
311              
312             =cut
313              
314             __END__
315              
316             # ABSTRACT: Police your URLs!
317