File Coverage

blib/lib/WWW/RoboCop.pm
Criterion      Covered    Total        %
statement           55       55    100.0
branch               7        8     87.5
condition          n/a      n/a      n/a
subroutine          14       14    100.0
pod                  2        2    100.0
total               78       79     98.7


use strict;
use warnings;
use feature qw( state );

our $VERSION = '0.000101';

use Moo;

use MooX::HandlesVia;
use MooX::StrictConstructor;
use Type::Params qw( compile );
use Types::Standard qw( CodeRef HashRef InstanceOf );
use Types::URI qw( Uri );
use URI ();
use WWW::Mechanize ();

has is_url_allowed => (
    is          => 'ro',
    isa         => CodeRef,
    handles_via => 'Code',
    handles     => { _should_follow_link => 'execute' },
    required    => 1,
);

has report_for_url => (
    is          => 'ro',
    isa         => CodeRef,
    handles_via => 'Code',
    handles     => { _log_response => 'execute' },
    default     => sub {
        sub {
            my $response      = shift;    # HTTP::Response object
            my $referring_url = shift;    # URI object
            return {
                $response->redirects
                ? (
                    redirects => [
                        map {
                            {
                                status => $_->code,
                                uri    => $_->request->uri,
                            }
                        } $response->redirects
                    ],
                    target_url => $response->request->uri,
                    )
                : (),
                referrer => $referring_url,
                status   => $response->code,
            };
        };
    },
);

has ua => (
    is      => 'ro',
    isa     => InstanceOf ['WWW::Mechanize'],
    default => sub {
        WWW::Mechanize->new( autocheck => 0 );
    },
);

has _history => (
    is          => 'ro',
    isa         => HashRef,
    handles_via => 'Hash',
    handles     => {
        _add_url_to_history => 'set',
        _has_processed_url  => 'exists',
    },
    init_arg => undef,
    default  => sub { +{} },
);

sub _get {
    my $self          = shift;
    my $url           = shift;
    my $referring_url = shift;

    my $response = $self->ua->get($url);
    my $report   = $self->_log_response( $response, $referring_url );
    $self->_add_url_to_history( $url, $report );

    my @links = $self->ua->find_all_links;

    foreach my $link (@links) {

        # this just points back at the same url
        next if substr( $link->url, 0, 1 ) eq '#';

        my $uri = URI->new( $link->url_abs );
        $uri->fragment(undef);    # fragments result in duplicate urls

        next if $self->_has_processed_url($uri);
        next unless $uri->can('host');    # no mailto: links
        next unless $self->_should_follow_link( $link, $url );

        $self->_get( $uri, $url );
    }
}

sub crawl {
    my $self = shift;

    state $check = compile(Uri);
    my ($url) = $check->(@_);

    $self->_get($url);
}

sub get_report {
    my $self = shift;
    return %{ $self->_history };
}

1;
=pod

=encoding UTF-8

=head1 NAME

WWW::RoboCop - Police your URLs!

=head1 VERSION

version 0.000101

=head1 SYNOPSIS

    use feature qw( state );

    use WWW::RoboCop ();

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub {
            state $count = 0;
            return $count++ < 5;    # just crawl 5 URLs
        },
    );

    $robocop->crawl( 'https://example.com' );

    my %history = $robocop->get_report;

    # %history = (
    #     'https://example.com/one' => { status => 200, ... },
    #     'https://example.com/two' => { status => 404, ... },
    #     ...
    # )

See C<examples/crawl-host.pl>, which is included with this distribution, to get
a quick start.

=head1 DESCRIPTION

C<WWW::RoboCop> is a simple, somewhat opinionated robot. Given a starting
page, this module will crawl only URLs which have been allowed by the
C<is_url_allowed> callback. It then creates a report of all visited pages,
keyed on URL. You are encouraged to provide your own report creation callback
so that you can collect all of the information which you require for each URL.

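For example, the following sketch (the host name and the report contents are
only illustrations) restricts the robot to a single host and records just the
HTTP status of each visited page:

    use URI ();
    use WWW::RoboCop ();

    my $robocop = WWW::RoboCop->new(

        # only follow links which stay on example.com
        is_url_allowed => sub {
            my $link = shift;    # WWW::Mechanize::Link object
            return URI->new( $link->url_abs )->host eq 'example.com';
        },

        # record only the status code of each visited page
        report_for_url => sub {
            my $response = shift;    # HTTP::Response object
            return { status => $response->code };
        },
    );

    $robocop->crawl('https://example.com');

    my %report = $robocop->get_report;
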
=head1 CONSTRUCTOR AND STARTUP

=head2 new()

Creates and returns a new C<WWW::RoboCop> object.

Below are the arguments which you may pass to C<new> when creating an object.

=head3 is_url_allowed

This argument is required. You must provide an anonymous subroutine which will
return true or false based on some arbitrary criteria which you provide. The
two arguments to this anonymous subroutine will be a L<WWW::Mechanize::Link>
object as well as the referring URL, in the form of a L<URI> object.

Your sub might look something like this:

    use feature qw( state );

    use URI ();
    use WWW::RoboCop ();

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub {
            my $link          = shift;
            my $referring_url = shift;

            my $upper_limit = 100;
            my $host        = 'some.host.com';

            state $limit = 0;

            return 0 if $limit > $upper_limit;
            my $uri = URI->new( $link->url_abs );

            # if the referring_url matches the host then this is a 1st degree
            # outbound web link

            if ( $uri->host eq $host || $referring_url->host eq $host ) {
                ++$limit;
                return 1;
            }
            return 0;
        }
    );

=head3 report_for_url

This argument is not required, but is highly recommended. The arguments to this
anonymous subroutine will be an L<HTTP::Response> object as well as the
referring URL in the form of a L<URI> object. Your sub might look something
like this:

    my $reporter = sub {
        my $response      = shift;    # HTTP::Response object
        my $referring_url = shift;    # URI object
        return {
            redirects => [
                map { +{ status => $_->code, uri => $_->base->as_string } }
                    $response->redirects
            ],
            referrer => $referring_url,
            status   => $response->code,
        };
    };

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        report_for_url => $reporter,
    );

That would give you a HashRef with the status code for each link visited (200,
404, 500, etc.) as well as the referring URL (the page on which the link was
found) and a list of any redirects which were followed in order to get to this
URL.

The default C<report_for_url> sub already provides something like the above,
but you should treat it only as a stub while you get up and running. Since it
is only meant to be an example, the format of the default report could change
at some future date without notice, and you should not rely on it remaining
consistent. If you are going to rely on this module, provide your own
reporting logic.

=head3 ua( WWW::Mechanize )

You can provide your own UserAgent object to this class. It should be of the
L<WWW::Mechanize> family. If you're looking for a significant speed boost
while under development, consider providing a L<WWW::Mechanize::Cached> object.
This can give you enough of a speedup to save you from getting distracted
and going off to read Hacker News while you wait.

    use CHI ();
    use WWW::Mechanize::Cached ();
    use WWW::RoboCop ();

    my $cache = CHI->new(
        driver   => 'File',
        root_dir => '/tmp/cache-example',
    );

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        ua             => WWW::Mechanize::Cached->new( cache => $cache ),
    );

If you're not using a Cached agent, be sure to disable autocheck.

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        ua             => WWW::Mechanize->new( autocheck => 0 ),
    );

=head2 crawl( $url )

This method sets the C<WWW::RoboCop> in motion. The robot will only come to a
halt once it has exhausted all of the allowed URLs it can find.

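The starting URL is validated via L<Type::Params>; assuming the string
coercion which L<Types::URI> provides, either a plain string or a L<URI>
object should be accepted here:

    use URI ();

    $robocop->crawl('https://example.com');

    # or, equivalently
    $robocop->crawl( URI->new('https://example.com') );
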
=head2 get_report

This method returns a Hash of crawling results, keyed on the URLs visited.
By default it returns a very simple Hash, containing little more than the
status code of each visited URL. You are encouraged to provide your own
callback so that you can get a detailed report returned to you. You can do
this by providing a C<report_for_url> callback when instantiating the object.

The default report looks something like this:

    # %history = (
    #     'https://example.com/one' => { status => 200, ... },
    #     'https://example.com/two' => { status => 404, ... },
    # )

See C<examples/crawl-host.pl>, which is included with this distribution, to get
a dump of the default report.

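As a minimal sketch of consuming the report, relying only on the C<status>
and C<referrer> keys which the default callback currently produces, you might
list every page which did not return a 200:

    my %history = $robocop->get_report;

    for my $url ( sort keys %history ) {
        my $report = $history{$url};
        next if $report->{status} == 200;

        printf "%s %s (found on %s)\n", $report->{status}, $url,
            $report->{referrer} // 'the start page';
    }
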
=head1 AUTHOR

Olaf Alders <olaf@wundercounter.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2015 by MaxMind, Inc.

This is free software, licensed under:

  The Artistic License 2.0 (GPL Compatible)

=cut

# ABSTRACT: Police your URLs!