File Coverage

blib/lib/URL/List.pm
Criterion Covered Total %
statement 86 95 90.5
branch 14 22 63.6
condition 4 11 36.3
subroutine 17 17 100.0
pod 8 8 100.0
total 129 153 84.3


line stmt bran cond sub pod time code
1             package URL::List;
2 4     4   250148 use Moose;
  4         1744388  
  4         33  
3 4     4   30752 use namespace::autoclean;
  4         26065  
  4         21  
4              
5             =head1 NAME
6              
7             URL::List - Helper class for creating distributed lists of URLs based on their host name, domain name or TLDs.
8              
9             =head1 VERSION
10              
11             Version 0.13
12              
13             =cut
14              
15             our $VERSION = '0.14';
16              
17 4     4   297 use Carp;
  4         12  
  4         205  
18 4     4   2097 use Domain::PublicSuffix;
  4         424615  
  4         27  
19 4     4   1993 use List::MoreUtils qw( uniq );
  4         25869  
  4         38  
20 4     4   4416 use URI;
  4         14324  
  4         3219  
21              
22             =head1 SYNOPSIS
23              
24             use URL::List;
25              
26             my $list = URL::List->new;
27             $list->add( 'http://www.google.com/' );
28             $list->add( 'http://www.bbc.co.uk/' );
29              
30             # or
31              
32             my $list = URL::List->new(
33             allow_duplicates => 1, # default false
34             urls => [ ... ], # arrayref of URLs
35             );
36              
37             my $distributed_by_hosts = $list->distributed_by_host;
38              
39             # $VAR1 = {
40             # 'www.google.com' => [
41             # 'http://www.google.com/',
42             # ],
43             # 'www.bbc.co.uk' => [
44             # 'http://www.bbc.co.uk/',
45             # ],
46             # };
47              
48             my $distributed_by_domains = $list->distributed_by_domain;
49              
50             # $VAR1 = {
51             # 'google.com' => [
52             # 'http://www.google.com/',
53             # ],
54             # 'bbc.co.uk' => [
55             # 'http://www.bbc.co.uk/',
56             # ],
57             # };
58              
59             my $distributed_by_tlds = $list->distributed_by_tld;
60              
61             # $VAR1 = {
62             # 'com' => [
63             # 'http://www.google.com/',
64             # ],
65             # 'co.uk' => [
66             # 'http://www.bbc.co.uk/',
67             # ],
68             # };
69              
70             my $urls = $list->all; # All the URLs are still there, so use this...
71             $list->clear; # ...to clear the list.
72              
73             =head1 DESCRIPTION
74              
75             URL:List is a module which helps you with distributing a list of URLs "evenly"
76             based on the URLs' host name, domain name or TLD (top-level domain).
77              
78             This can be useful for crawlers, ie. giving out a list of URLs within specific
79             hostnames, domain names and/or TLD names to different workers.
80              
81             =head1 METHODS
82              
83             =head2 new
84              
85             Returns an instance of URL::List.
86              
87             Takes one optional parameter, 'allow_duplicates', which is default 0. By setting
88             it to true (1), URL::List will not filter out duplicate articles.
89              
90             =cut
91              
92             has 'allow_duplicates' => ( isa => 'Bool', is => 'rw', default => 0 );
93             has 'urls' => ( isa => 'ArrayRef[Str]', is => 'rw', default => sub { [] } );
94              
95             sub _add {
96 41     41   52 my $self = shift;
97 41   50     88 my $url = shift || '';
98              
99 41 50       103 if ( length $url ) {
100 41 50       115 if ( my $uri = URI->new($url) ) {
101 41         15390 push( @{$self->urls}, $uri );
  41         834  
102             }
103             else {
104 0         0 carp "Can't add '" . $url . "' to the list of URLs; seems to be an invalid URL!";
105             }
106             }
107             else {
108 0         0 carp "Can't add an empty string as a URL!";
109             }
110             }
111              
112             =head2 add( $url )
113              
114             Add a URL to the list. C<$url> can be an array reference of URLs.
115              
116             =cut
117              
118             sub add {
119 41     41 1 780 my $self = shift;
120 41         66 my $url = shift;
121              
122 41 50       99 if ( defined $url ) {
123 41 50       110 my $urls = ( ref $url eq 'ARRAYREF' ) ? $url : [ $url ];
124              
125 41         60 foreach ( @{$urls} ) {
  41         74  
126 41         90 $self->_add( $_ );
127             }
128             }
129             }
130              
131             =head2 all
132              
133             Returns an array reference of all the URLs in the list.
134              
135             =cut
136              
137             sub all {
138 12     12 1 23 my $self = shift;
139              
140 12 100       282 if ( $self->allow_duplicates ) {
141 2         39 return $self->urls;
142             }
143             else {
144 10         19 return [ List::MoreUtils::uniq(@{$self->urls}) ];
  10         202  
145             }
146             }
147              
148             =head2 count
149              
150             Returns the number of URLs in the list, including potential duplicates,
151             depending on the 'allow_duplicates' setting.
152              
153             =cut
154              
155             sub count {
156 7     7 1 56 my $self = shift;
157              
158 7         14 return scalar( @{$self->all} );
  7         20  
159             }
160              
161             =head2 clear
162              
163             Clears the URL list.
164              
165             =cut
166              
167             sub clear {
168 2     2 1 982 my $self = shift;
169              
170 2         52 $self->urls( [] );
171             }
172              
173             =head2 flush
174              
175             An alias for C<clear>.
176              
177             =cut
178              
179             sub flush {
180 1     1 1 548 return shift->clear;
181             }
182              
183             #
184             # DISTRIBUTIONS
185             #
186              
187             =head2 distributions
188              
189             Returns a hash reference of all the possible distributions.
190              
191             This method should not be used directly. Instead, the distributed_by_* methods
192             should be used.
193              
194             =cut
195              
196             has 'distributions' => ( isa => 'HashRef', is => 'ro', lazy_build => 1 );
197              
198             sub _build_distributions {
199 5     5   12 my $self = shift;
200              
201             #
202             # Create a list of valid URLs
203             #
204 5         12 my @urls = ();
205              
206 5         10 foreach my $url ( @{$self->all} ) {
  5         20  
207 40         507 push( @urls, $url );
208             }
209              
210             #
211             # Build the different distributions
212             #
213 5         15 my %distributions = ();
214 5         39 my $suffix = Domain::PublicSuffix->new;
215              
216 5         3636624 foreach my $url ( @urls ) {
217             # my $host = undef;
218              
219 40         77 my $host = eval {
220 40         134 URI->new( $url )->host;
221             };
222              
223 40 50       9501 if ( $@ ) {
224 0         0 carp "Failed to determine host from '" . $url . "'. Skipping it!";
225 0         0 next;
226             }
227              
228 40 50 33     178 if ( defined $host && length $host ) {
229 40         122 my $domain = $suffix->get_root_domain( $host );
230 40         4133 my $tld = $suffix->tld;
231              
232 40         158 push( @{$distributions{host}->{$host}}, $url );
  40         129  
233              
234 40 50 33     238 if ( defined $domain && length $domain ) {
235 40         57 push( @{$distributions{domain}->{$domain}}, $url );
  40         101  
236             }
237             else {
238 0         0 carp "Failed to determine the domain name from '" . $url . "'. Skipping it!";
239 0         0 next;
240             }
241              
242 40 50 33     152 if ( defined $tld && length $tld ) {
243 40         50 push( @{$distributions{tld}->{$tld}}, $url );
  40         114  
244             }
245             else {
246 0         0 carp "Failed to determine the TLD from '" . $url . "'. Skipping it!";
247 0         0 next;
248             }
249             }
250             else {
251 0         0 carp "Failed to determine host from '" . $url . "'. Skipping it!";
252             }
253             }
254              
255             #
256             # Return
257             #
258 5         20340 return \%distributions;
259             }
260              
261             =head2 distributed_by_host
262              
263             Returns a hash reference where the key is the host name, like "www.google.com",
264             and the value is an array reference to the host name's URLs.
265              
266             =cut
267              
268             sub distributed_by_host {
269 5     5 1 1210 my $self = shift;
270              
271 5         127 return $self->distributions->{host};
272             }
273              
274             =head2 distributed_by_domain
275              
276             Returns a hash reference where the key is the domain name, like "google.com",
277             and the value is an array reference to the domain name's URLs.
278              
279             =cut
280              
281             sub distributed_by_domain {
282 4     4 1 17 my $self = shift;
283              
284 4         107 return $self->distributions->{domain};
285             }
286              
287             =head2 distributed_by_tld
288              
289             Returns a hash reference where the key is the top-level domain name, like "com",
290             and the value is an array reference to the top-level domain name's URLs.
291              
292             =cut
293              
294             sub distributed_by_tld {
295 4     4 1 13 my $self = shift;
296              
297 4         106 return $self->distributions->{tld};
298             }
299              
300             sub _blocks_by {
301 1     1   3 my $self = shift;
302 1         2 my $dist = shift;
303              
304 1         3 my @blocks = ();
305              
306 1         3 while ( keys %{$dist} ) {
  6         17  
307 5         9 my @urls = ();
308              
309 5         8 foreach my $key ( sort keys %{$dist} ) {
  5         17  
310 25 100       34 if ( my $url = shift @{$dist->{$key}} ) {
  25         48  
311 19         32 push( @urls, $url );
312             }
313             else {
314 6         12 delete $dist->{$key};
315             }
316             }
317              
318 5 100       14 if ( @urls ) {
319 4         7 push( @blocks, \@urls );
320             }
321             }
322              
323 1         426 return \@blocks;
324             }
325              
326             =head2 blocks_by_host, blocks_by_domain, blocks_by_tld
327              
328             Returns "blocks" of URLs distributed by their host/domain/TLD, i.e. an array
329             reference of array references containing URLs distributed as evenly as possible;
330              
331             my $list = URL::List->new(
332             urls => [qw(
333             http://www.businessinsider.com/1.html
334             http://www.businessinsider.com/2.html
335             http://www.businessinsider.com/3.html
336             http://www.engadget.com/1.html
337             http://www.engadget.com/2.html
338             http://www.engadget.com/3.html
339             http://www.engadget.com/4.html
340             http://www.independent.co.uk/1.html
341             http://www.independent.co.uk/2.html
342             http://www.pcmag.com/1.html
343             http://www.pcmag.com/2.html
344             http://www.pcmag.com/3.html
345             http://www.technologyreview.com/1.html
346             http://www.technologyreview.com/2.html
347             http://www.technologyreview.com/3.html
348             http://www.technologyreview.com/4.html
349             http://www.zdnet.com/1.html
350             http://www.zdnet.com/2.html
351             http://www.zdnet.com/3.html
352             )],
353             );
354              
355             # $list->blocks_by_host = [
356             # [qw(
357             # http://www.businessinsider.com/1.html
358             # http://www.engadget.com/1.html
359             # http://www.independent.co.uk/1.html
360             # http://www.pcmag.com/1.html
361             # http://www.technologyreview.com/1.html
362             # http://www.zdnet.com/1.html
363             # )],
364             #
365             # [qw(
366             # http://www.businessinsider.com/2.html
367             # http://www.engadget.com/2.html
368             # http://www.independent.co.uk/2.html
369             # http://www.pcmag.com/2.html
370             # http://www.technologyreview.com/2.html
371             # http://www.zdnet.com/2.html
372             # )],
373             #
374             # [qw(
375             # http://www.businessinsider.com/3.html
376             # http://www.engadget.com/3.html
377             # http://www.pcmag.com/3.html
378             # http://www.technologyreview.com/3.html
379             # http://www.zdnet.com/3.html
380             # )],
381             #
382             # [qw(
383             # http://www.engadget.com/4.html
384             # http://www.technologyreview.com/4.html
385             # )],
386             # ],
387              
388             This is useful if you want to crawl many URLs, but also want to pause between
389             each visit to host/domain/TLD;
390              
391             my $list = URL::List->new( urls => [...] );
392              
393             foreach my $urls ( @{$list->blocks_by_domain} ) {
394             # get $urls in parallel, you will only visit each domain once, or you
395             # can delegate $urls to other workers (crawlers) to spread load etc.
396              
397             sleep( 5 ); # let's be nice and pause
398             }
399              
400             =cut
401              
402             has 'blocks_by_host' => (
403             isa => 'ArrayRef[ArrayRef]',
404             is => 'ro',
405             lazy => 1,
406             default => sub {
407             my $self = shift;
408              
409             return $self->_blocks_by( $self->distributed_by_host );
410             },
411             );
412              
413             has 'blocks_by_domain' => (
414             isa => 'ArrayRef[ArrayRef]',
415             is => 'ro',
416             lazy => 1,
417             default => sub {
418             my $self = shift;
419              
420             return $self->_blocks_by( $self->distributed_by_domain );
421             },
422             );
423              
424             has 'blocks_by_tld' => (
425             isa => 'ArrayRef[ArrayRef]',
426             is => 'ro',
427             lazy => 1,
428             default => sub {
429             my $self = shift;
430              
431             return $self->_blocks_by( $self->distributed_by_tld );
432             },
433             );
434              
435             #
436             # The End
437             #
438             __PACKAGE__->meta->make_immutable;
439              
440             1;
441              
442             =head1 LICENSE AND COPYRIGHT
443              
444             Copyright 2012-2017 Tore Aursand.
445              
446             This program is free software; you can redistribute it and/or modify it
447             under the terms of the the Artistic License (2.0). You may obtain a
448             copy of the full license at:
449              
450             L<http://www.perlfoundation.org/artistic_license_2_0>
451              
452             Any use, modification, and distribution of the Standard or Modified
453             Versions is governed by this Artistic License. By using, modifying or
454             distributing the Package, you accept this license. Do not use, modify,
455             or distribute the Package, if you do not accept this license.
456              
457             If your Modified Version has been derived from a Modified Version made
458             by someone other than you, you are nevertheless required to ensure that
459             your Modified Version complies with the requirements of this license.
460              
461             This license does not grant you the right to use any trademark, service
462             mark, tradename, or logo of the Copyright Holder.
463              
464             This license includes the non-exclusive, worldwide, free-of-charge
465             patent license to make, have made, use, offer to sell, sell, import and
466             otherwise transfer the Package with respect to any patent claims
467             licensable by the Copyright Holder that are necessarily infringed by the
468             Package. If you institute patent litigation (including a cross-claim or
469             counterclaim) against any party alleging that the Package constitutes
470             direct or contributory patent infringement, then this Artistic License
471             to you shall terminate on the date that such litigation is filed.
472              
473             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
474             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
475             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
476             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
477             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
478             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
479             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
480             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.