File Coverage

blib/lib/URL/List.pm
Criterion Covered Total %
statement 65 73 89.0
branch 8 14 57.1
condition 4 12 33.3
subroutine 15 15 100.0
pod 8 8 100.0
total 100 122 81.9


line stmt bran cond sub pod time code
1             package URL::List;
2 3     3   66462 use Mouse;
  3         104420  
  3         16  
3 3     3   4852 use namespace::autoclean;
  3         69231  
  3         25  
4              
5             =head1 NAME
6              
7             URL::List - Object-oriented methods of handling list of URLs.
8              
9             =head1 VERSION
10              
11             Version 0.11
12              
13             =cut
14              
15             our $VERSION = '0.11';
16              
17 3     3   278 use Carp;
  3         12  
  3         3780  
18 3     3   5299 use Domain::PublicSuffix;
  3         85165  
  3         33  
19 3     3   3030 use List::MoreUtils qw( uniq );
  3         4368  
  3         301  
20 3     3   9062 use URI;
  3         27541  
  3         2368  
21              
22             =head1 SYNOPSIS
23              
24             use URL::List;
25              
26             my $list = URL::List->new; # or URL::List->new;
27             $list->add( 'http://www.google.com/' );
28             $list->add( 'http://www.bbc.co.uk/' );
29              
30             my $distributed_by_hosts = $list->distributed_by_host;
31              
32             # $VAR1 = {
33             # 'www.google.com' => [
34             # 'http://www.google.com/',
35             # ],
36             # 'www.bbc.co.uk' => [
37             # 'http://www.bbc.co.uk/',
38             # ],
39             # };
40              
41             my $distributed_by_domains = $list->distributed_by_domain;
42              
43             # $VAR1 = {
44             # 'google.com' => [
45             # 'http://www.google.com/',
46             # ],
47             # 'bbc.co.uk' => [
48             # 'http://www.bbc.co.uk/',
49             # ],
50             # };
51              
52             my $distributed_by_tlds = $list->distributed_by_tld;
53              
54             # $VAR1 = {
55             # 'com' => [
56             # 'http://www.google.com/',
57             # ],
58             # 'co.uk' => [
59             # 'http://www.bbc.co.uk/',
60             # ],
61             # };
62              
63             my $urls = $list->all; # All the URLs are still there, so use this...
64             $list->clear; # ...to clear the list.
65              
66             =head1 DESCRIPTION
67              
68             URL:List is a module which helps you with distributing a list of URLs "evenly"
69             based on the URLs' host name, domain name or TLD (top-level domain).
70              
71             This can be useful for crawlers, ie. giving out a list of URLs within specific
72             hostnames, domain names and/or TLD names to different workers.
73              
74             =head1 METHODS
75              
76             =head2 new
77              
78             Returns an instance of URL::List.
79              
80             Takes one optional parameter, 'allow_duplicates', which is default 0. By setting
81             it to true (1), URL::List will not filter out duplicate articles.
82              
83             =cut
84              
85             has 'allow_duplicates' => ( isa => 'Bool', is => 'rw', default => 0 );
86             has 'urls' => ( isa => 'ArrayRef[Str]', is => 'rw', default => sub { [] } );
87              
88             =head2 add( $url )
89              
90             Add a URL to the list.
91              
92             =cut
93              
94             sub add {
95 41     41 1 159 my $self = shift;
96 41         52 my $url = shift;
97              
98 41 50 33     184 if ( defined $url && length $url ) {
99 41         43 push( @{$self->urls}, $url );
  41         164  
100             }
101             }
102              
103             =head2 all
104              
105             Returns an array reference of all the URLs in the list. This list can include
106             duplicates.
107              
108             =cut
109              
110             sub all {
111 9     9 1 13 my $self = shift;
112              
113 9 100       39 if ( $self->allow_duplicates ) {
114 2         12 return $self->urls;
115             }
116             else {
117 7         11 return [ List::MoreUtils::uniq(@{$self->urls}) ];
  7         109  
118             }
119             }
120              
121             =head2 count
122              
123             Returns the number of URLs in the list, including potential duplicates,
124             depending on the 'allow_duplicates' setting.
125              
126             =cut
127              
128             sub count {
129 6     6 1 30 my $self = shift;
130              
131 6         10 return scalar( @{$self->all} );
  6         31  
132             }
133              
134             =head2 clear
135              
136             Clears the URL list.
137              
138             =cut
139              
140             sub clear {
141 2     2 1 5 my $self = shift;
142              
143 2         11 $self->urls( [] );
144             }
145              
146             =head2 flush
147              
148             An alias for C.
149              
150             =cut
151              
152             sub flush {
153 1     1 1 3 return shift->clear;
154             }
155              
156             #
157             # DISTRIBUTIONS
158             #
159              
160             =head2 distributions
161              
162             Returns a hash reference of all the possible distributions.
163              
164             This method should not be used directly. Instead, the distributed_by_* methods
165             should be used.
166              
167             =cut
168              
169             has 'distributions' => ( isa => 'HashRef', is => 'ro', lazy_build => 1 );
170              
171             sub _build_distributions {
172 3     3   6 my $self = shift;
173              
174             #
175             # Create a list of valid URLs
176             #
177 3         5 my @urls = ();
178              
179 3         6 foreach my $url ( @{$self->all} ) {
  3         10  
180 16 50       60 if ( my $uri = URI->new($url) ) {
181 16         11499 push( @urls, $url );
182             }
183             else {
184 0         0 carp "Couldn't create a URI object from '" . $url . "'. Skipping it!";
185             }
186             }
187              
188             #
189             # Build the different distributions
190             #
191 3         13 my %distributions = ();
192 3         24 my $suffix = Domain::PublicSuffix->new;
193              
194 3         594505 foreach my $url ( @urls ) {
195 16         22 my $host = undef;
196              
197 16         24 eval {
198 16         67 $host = URI->new( $url )->host;
199             };
200              
201 16 50       1324 if ( $@ ) {
202 0         0 carp "Failed to determine host from '" . $url . "'. Skipping it!";
203 0         0 next;
204             }
205              
206 16 50 33     68 if ( defined $host && length $host ) {
207 16         43 my $domain = $suffix->get_root_domain( $host );
208 16         2482 my $tld = $suffix->tld;
209              
210 16         58 push( @{$distributions{host}->{$host}}, $url );
  16         41  
211              
212 16 50 33     66 if ( defined $domain && length $domain ) {
213 16         17 push( @{$distributions{domain}->{$domain}}, $url );
  16         37  
214             }
215             else {
216 0         0 carp "Failed to determine the domain name from '" . $url . "'. Skipping it!";
217 0         0 next;
218             }
219              
220 16 50 33     54 if ( defined $tld && length $tld ) {
221 16         34 push( @{$distributions{tld}->{$tld}}, $url );
  16         64  
222             }
223             else {
224 0         0 carp "Failed to determine the TLD from '" . $url . "'. Skipping it!";
225 0         0 next;
226             }
227             }
228             else {
229 0         0 carp "Failed to determine host from '" . $url . "'. Skipping it!";
230             }
231             }
232              
233             #
234             # Return
235             #
236 3         8555 return \%distributions;
237             }
238              
239             =head2 distributed_by_host
240              
241             Returns a hash reference where the key is the host name, like "www.google.com",
242             and the value is an array reference to the host name's URLs.
243              
244             =cut
245              
246             sub distributed_by_host {
247 3     3 1 10 my $self = shift;
248              
249 3         27 return $self->distributions->{host};
250             }
251              
252             =head2 distributed_by_domain
253              
254             Returns a hash reference where the key is the domain name, like "google.com",
255             and the value is an array reference to the domain name's URLs.
256              
257             =cut
258              
259             sub distributed_by_domain {
260 3     3 1 14 my $self = shift;
261              
262 3         27 return $self->distributions->{domain};
263             }
264              
265             =head2 distributed_by_tld
266              
267             Returns a hash reference where the key is the top-level domain name, like "com",
268             and the value is an array reference to the top-level domain name's URLs.
269              
270             =cut
271              
272             sub distributed_by_tld {
273 3     3 1 10 my $self = shift;
274              
275 3         25 return $self->distributions->{tld};
276             }
277              
278             #
279             # The End
280             #
281             __PACKAGE__->meta->make_immutable;
282              
283             1;
284              
285             =head1 LICENSE AND COPYRIGHT
286              
287             Copyright 2012-2013 Tore Aursand.
288              
289             This program is free software; you can redistribute it and/or modify it
290             under the terms of the the Artistic License (2.0). You may obtain a
291             copy of the full license at:
292              
293             L
294              
295             Any use, modification, and distribution of the Standard or Modified
296             Versions is governed by this Artistic License. By using, modifying or
297             distributing the Package, you accept this license. Do not use, modify,
298             or distribute the Package, if you do not accept this license.
299              
300             If your Modified Version has been derived from a Modified Version made
301             by someone other than you, you are nevertheless required to ensure that
302             your Modified Version complies with the requirements of this license.
303              
304             This license does not grant you the right to use any trademark, service
305             mark, tradename, or logo of the Copyright Holder.
306              
307             This license includes the non-exclusive, worldwide, free-of-charge
308             patent license to make, have made, use, offer to sell, sell, import and
309             otherwise transfer the Package with respect to any patent claims
310             licensable by the Copyright Holder that are necessarily infringed by the
311             Package. If you institute patent litigation (including a cross-claim or
312             counterclaim) against any party alleging that the Package constitutes
313             direct or contributory patent infringement, then this Artistic License
314             to you shall terminate on the date that such litigation is filed.
315              
316             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
317             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
318             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
319             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
320             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
321             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
322             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
323             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.