File Coverage

blib/lib/WWW/Spyder.pm

Criterion    Covered    Total        %
statement         75      507     14.7
branch             0      260      0.0
condition          0      128      0.0
subroutine        25       72     34.7
pod               13       13    100.0
total            113      980     11.5


line stmt bran cond sub pod time code
1             package WWW::Spyder;
2              
3 1     1   24344 use strict;
  1         3  
  1         65  
4 1     1   7 use warnings;
  1         2  
  1         211  
5              
6 1     1   1400 use HTML::Parser 3;
  1         12055  
  1         36  
7              
8 1     1   16172 use LWP::UserAgent;
  1         65533  
  1         27  
9 1     1   859 use HTTP::Cookies;
  1         7079  
  1         24  
10 1     1   738 use URI::URL;
  1         3938  
  1         50  
11 1     1   6 use HTML::Entities;
  1         2  
  1         51  
12              
13 1     1   4 use Digest::MD5 "md5_base64"; # For making seen content key/index.
  1         1  
  1         42  
14              
15 1     1   4 use Carp;
  1         1  
  1         233  
16             our $VERSION = '0.24';
17             our $VERBOSITY ||= 0;
18              
19             # Methods
20             #--------------------------
21             { # make it all a bit more private
22             my %_methods = (# these are methods & roots of our attribute names
23             UA => undef,
24             bell => undef,
25             html_parser => undef,
26             sleep_base => undef,
27             cookie_file => undef,
28             _exit_epoch => undef,
29             _term_count => undef,
30             );
31             # Those may all get hardcoded eventually, but they're handy for now.
32             #--------------------------
33             sub new {
34 0     0 1   my ( $caller ) = shift;
35 0   0       my $class = ref($caller) || $caller;
36 0           my $self = bless {}, $class;
37              
38 0           my ( $seed, %arg );
39 0 0         if ( @_ == 1 ) {
40 0           ( $seed ) = @_;
41             }
42 0           %arg = ( broken_links => [],
43             exit_on => undef,
44             image_checking => 0,
45             report_broken_links => 0,
46             seed => undef,
47             sleep => undef,
48             sleep_base => 5,
49             UA => undef
50             );
51 0 0         %arg = ( %arg, @_ ) unless @_ % 2;
52              
53             # Set our UA object if it was passed on to our constructor.
54 0 0         $self->{UA} = $arg{UA} if $arg{UA};
55              
56             # Turn on image checking if requested. img src tags will be checked if
57             # image_checking is set to 1 in the constructor.
58 0 0         $self->{image_checking} = $arg{image_checking} if $arg{image_checking};
59              
60             # Turn on broken link checking if requested. Broken link URIs can be
61             # obtained via get_broken_links().
62 0 0         $self->{report_broken_links} = $arg{report_broken_links}
63             if $arg{report_broken_links};
64              
65             # Install all our methods, either set once then get only or push/shift
66             # array refs.
67 0           for my $method ( %_methods ) {
68 1     1   4 no strict "refs";
  1         1  
  1         27  
69 1     1   3 no warnings;
  1         2  
  1         1621  
70 0           my $attribute = '_' . $method;
71              
72 0 0         if ( ref $_methods{$method} eq 'ARRAY' ) {
73 0           *{"$class::$method"} = sub {
74 0     0     my($self,@args) = @_;
75 0 0         return shift(@{$self->{$attribute}}) unless @args;
  0            
76 0           push(@{$self->{$attribute}}, @args);
  0            
77 0           };
78             } else {
79 0           *{"$class::$method"} = sub {
80 0     0     my($self,$arg) = @_;
81 0 0 0       carp "You cannot reset $method!"
82             if $arg and exists $self->{$attribute};
83 0 0         return $self->{$attribute} #get if already set
84             if exists $self->{$attribute};
85 0           $self->{$attribute} = $arg; #only set one time!
86 0           };
87             }
88             }
89              
90 0   0       $seed ||= $arg{seed};
91 0 0         $self->seed($seed) if $seed;
92 0           $self->sleep_base($arg{sleep_base});
93 0 0         $self->_install_exit_check(\%arg) unless $self->can('_exit_check');
94 0           $self->_install_html_parser;
95 0           $self->_install_web_agent;
96              
97 0           return $self;
98             }
99             #--------------------------
100             sub terms {
101 0     0 1   my ($self,@terms) = @_;
102 0 0 0       if ( @terms and not exists $self->{_terms} ) {
103 0           $self->_term_count(scalar @terms); # makes this set once op
104 0           my %terms;
105 0           $terms{$_} = qr/$_/ for @terms;
106 0           $self->{_terms} = \%terms;
107             } else {
108 0           return $self->{_terms}
109             }
110             }
111             #--------------------------
112             sub show_attributes {
113 0     0 1   my ($self) = @_;
114 0           return map {/^_(.+)$/} keys %{$self};
  0            
  0            
115             }
116             #--------------------------
117             sub slept {
118 0     0 1   my ($self, $time) = @_;
119 0 0         $self->{_Slept} += $time if $time;
120 0 0         return $self->{_Slept} unless $time;
121             }
122             #--------------------------
123             sub seed {
124 0     0 1   my ($self, $url) = @_;
125 0 0         $url or croak "Must provide URL to seed().";
126 0 0         croak "You have passed something besides a plain URL to seed()!"
127             if ref $url;
128 0           $self->_stack_urls($url);
129 0           return 1; # to the top of the stacks
130             }
131              
132             #--------------------------
133             sub get_broken_links {
134 0     0 1   my $self = shift;
135              
136 0           return $self->{broken_links};
137             }
138              
139             #--------------------------
140             sub crawl {
141 0     0 1   my $self = shift;
142 0   0       my $opts = shift || undef;
143 0           my $excludes = [];
144              
145             # Exclude list option.
146 0 0         if ( ref($opts->{exclude}) eq 'ARRAY' ) {
147 0           $excludes = $opts->{exclude};
148             }
149              
150 0           while ('I have pages to get...') {
151              
152 0 0         $self->_exit_check and return;
153              
154 0           my $skip_url = 0;
155 0           my $enQ = undef;
156              
157             # Report a page with a 404 error in the title if report_broken_links is
158             # enabled. Also keep processing if we're looking for img src tags.
159 0 0 0       if ($self->{report_broken_links} || $self->{image_checking}) {
160 0   0       $enQ = $self->_choose_courteously ||
161             $self->_just_choose;
162             } else {
163 0   0       $enQ = $self->_choose_courteously ||
164             $self->_just_choose ||
165             return;
166             }
167              
168 0           my $url = $enQ->url;
169              
170             # Skip this URL if it's in our excluded list.
171 0 0         for (@$excludes) { $skip_url = 1 if $url =~ m/$_/; }
  0            
172 0 0         next if $skip_url;
173              
174 0           $self->url($url);
175 0           $self->_current_enQ($enQ);
176              
177 0 0         print "GET'ing: $url\n" if $VERBOSITY;
178              
179 0           my $response = $self->UA->request # no redirects &c is simple_
180             ( HTTP::Request->new( GET => "$url" ) );
181              
182 0 0         print STDERR "\a" if $self->bell;
183              
184 0 0 0       $response or
185             carp "$url failed GET!" and next;
186              
187 0           push @{$self->{_courtesy_Queue}}, $enQ->domain;
  0            
188 0           shift @{$self->{_courtesy_Queue}}
  0            
189             if $self->{_courtesy_Queue}
190 0 0 0       and @{$self->{_courtesy_Queue}} > 100;
191              
192 0           my $head = $response->headers_as_string;
193 0 0 0       $head or
194             carp "$url has no HEAD!" and
195             next; # no headless webpages
196              
197 0 0         length($head) > 1_024 and $head = substr($head,0,1_024);
198              
199 0 0         print $head, "\n" if $VERBOSITY > 2;
200              
201 0           my $base;
202 0           eval { $base = $response->base };
  0            
203 0 0 0       $base or
204             carp "$url has no discernible BASE!" and
205             next; # no baseless webpages
206              
207             # WE SHOULD also look for <meta> content-type tags because some servers
208             # that we might want to look at don't properly report the content-type
209              
210             # start over unless this is something we can read
211 0           my $title = '';
212 0           my $description = '';
213 0           my $is_image = 0;
214              
215             # Make an exception for images.
216 0 0         if ($self->{image_checking}) {
217 0 0         if ($head =~ /Content\-Type:\s*image/i) {
218 0           my ($img_size) = $head =~ /Content\-Length:\s*(\d+)/i;
219              
220 0 0         if ($img_size <= 0) {
221 0           $title = $description = '404 Not Found';
222 0           next;
223             } else {
224 0           $is_image = 1;
225             }
226             }
227             } else {
228 0 0 0       lc($head) =~ /content-type:\s?(?:text|html)/ or
229             carp "$url doesn't look like TEXT or HTML!" and
230             next; # no weird media, movies, flash, etc
231             }
232              
233 0 0         ( $title ) = $head =~ m,[Tt]itle:\s*(.+)\n,
234             unless $title;
235              
236 0 0         ( $description ) = $head =~
237             /[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i
238             unless $description;
239              
240             # Add this link to our dead links list if the title matches
241             # a standard "404 Not Found" error.
242 0 0 0       if ($title && $self->{report_broken_links}) {
243 0 0         push(@{ $self->{broken_links} }, $url)
  0            
244             if $title =~ /^\s*404\s+Not\s+Found\s*$/;
245             }
246              
247 0 0         $description = $self->_snip($description) if $description;
248              
249 0 0 0       my $page = $response->content or
250             carp "Failed to fetch $url." and
251             next; # no empty pages, start over with next url
252              
253 0           $self->{_current_Bytes} = length($page);
254 0           $self->spyder_data($self->{_current_Bytes});
255              
256             # we are going to use a digest to prevent parsing the identical
257             # content received via a different url
258 0           my $digest = md5_base64($page); # unique microtag of the page
259             # so if we've seen it before, start over with the next URL
260 0 0 0       $self->{_page_Memory}{$digest}++ and
261             carp "Seen this page's content before: $url"
262             and next;
263              
264 0           $self->{_page_content} = $page;
265 0 0         print "PARSING: $url\n" if $VERBOSITY > 1;
266 0           $self->{_spydered}{$url}++;
267 0           $self->html_parser->parse($page);
268 0           $self->html_parser->eof;
269              
270 0 0         $self->{_adjustment} = $self->_parse_for_terms if $self->terms;
271              
272             # make links absolute and fix bad spacing in link names, then turn
273             # them into an Enqueue object
274 0           for my $pair ( @{$self->{_enqueue_Objects}} ) {
  0            
275 0           my $url;
276 0           eval {
277 0           $url = URI::URL::url($pair->[0], $base)->abs;
278             };
279 0           my $name = _snip($pair->[1]);
280 0           my $item = WWW::Spyder::Enqueue->new("$url",$name);
281 0           $pair = $item;
282             }
283             # put links into the queue(s)
284 0 0         $self->_stack_urls() if $self->_links;
285              
286             # clean up text a bit. should this be here...?
287 0 0 0       if ( $self->{_text} and ${$self->{_text}} ) {
  0            
288 0           ${$self->{_text}} =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
  0            
289             }
290              
291             # in the future Page object should be installed like parsers as a
292             # reusable container
293             # return
294 0   0       my $Page =
295             WWW::Spyder::Page->new(
296             title => $title,
297             text => $self->{_text},
298             raw => \$page,
299             url => $enQ->url,
300             domain => $enQ->domain,
301             link_name => undef,
302             link => undef,
303             description => $description || '',
304             pages_enQs => $self->_enqueue,
305             );
306 0           $self->_reset; #<<--clear out things that might remain
307 0           return $Page;
308             }
309             }
310             #--------------------------
311             sub _stack_urls { # should eventually be broken into stack and sift?
312              
313             # dual purpose, w/ terms it filters as long as there are no urls
314             # passed, otherwise it's setting them to the top of the queues
315 0     0     my ($self, @urls) = @_;
316              
317 0 0 0       print "Stacking " . join(', ', @urls) . "\n"
318             if @urls and $VERBOSITY > 5;
319              
320 0 0 0       if ( $self->terms and not @urls ) {
    0          
321 1     1   6 no warnings;
  1         2  
  1         1351  
322 0           my @Qs = $self->_queues;
323 0           for my $enQ ( @{$self->_enqueue} ) {
  0            
324 0           my ( $url, $name ) = ( $enQ->url, $enQ->name );
325              
326 0 0         next if $self->_seen($url);
327              
328 0           my $match = 0;
329 0           while ( my ($term,$rx) = each %{$self->terms} ) {
  0            
330 0           $match++ for $name =~ /$rx/g;
331             }
332 0           my $baseQ = 10;
333 0           my $adjustment = $self->{_adjustment};
334 0           $baseQ -= $adjustment; # 4 to 0
335              
336 0 0 0       push @{$self->{$baseQ}}, $enQ
  0            
337             and next unless $match;
338              
339 0 0         if ( $VERBOSITY > 1 ) {
340 0           print "NAME: $name\n";
341 0           printf " RATIO -->> %d\n", $match;
342             }
343 0           my $queue_index = sprintf "%d",
344             $self->_term_count / $match;
345              
346 0           $queue_index -= $adjustment;
347 0 0         $queue_index = 4 if $queue_index > 4;
348 0 0         $queue_index = 0 if $queue_index < 0;
349 0           my $queue = $Qs[$queue_index];
350 0 0         if ($VERBOSITY > 2) {
351 0           print "Q:$queue [$queue_index] match: $match terms:",
352             $self->_term_count, " Adjust: $adjustment\n\n";
353             }
354 0           push @{$self->{$queue}}, $enQ;
  0            
355             }
356             } elsif ( @urls > 0 ) {
357 0           for my $url ( @urls ) {
358 0 0         next if $self->_seen($url);
359 0           my $queue = $self->_queues;
360 0 0         carp "Placing $url in '$queue'\n" if $VERBOSITY > 2;
361              
362             # unshift because seeding is priority
363 0           unshift @{$self->{$queue}},
  0            
364             WWW::Spyder::Enqueue->new($url,undef);
365             }
366             } else {
367 0           for my $enQ ( @{$self->_enqueue} ) {
  0            
368 0           my ( $url, $name ) = ( $enQ->url, $enQ->name );
369 0 0         next if $self->_seen($url);
370 0           my $queue = $self->_queues;
371 0           push @{$self->{$queue}}, $enQ;
  0            
372             }
373             }
374             }
375             #--------------------------
376             sub queue_count {
377 0     0 1   my ($self) = @_;
378 0           my $count = 0;
379 0           for my $Q ( $self->_queues ) {
380 0 0         next unless ref($self->{$Q}) eq 'ARRAY';
381 0           $count += scalar @{$self->{$Q}};
  0            
382             }
383 0           return $count;
384             }
385             #--------------------------
386             sub spyder_time {
387 0     0 1   my ($self,$raw) = @_;
388              
389 0           my $time = time() - $^T;
390 0 0         return $time if $raw;
391              
392 0           my $day = int( $time / 86400 );
393 0           my $hour = int( $time / 3600 ) % 24;
394 0           my $min = int( $time / 60 ) % 60;
395 0           my $sec = $time % 60;
396              
397             # also collect slept time!
398 0 0         return sprintf "%d day%s %02d:%02d:%02d",
399             $day, $day == 1?'':'s', $hour, $min, $sec;
400             }
401             #--------------------------
402             sub spyder_data {
403 0     0 1   my ($self, $bytes) = @_;
404 0 0 0       $self->{_bytes_GOT} += $bytes and return $bytes if $bytes;
405              
406 0 0         return 0 unless $self->{_bytes_GOT};
407              
408 0           my $for_commas = int($self->{_bytes_GOT} / 1_024);
409              
410 0           for ( $for_commas ) {
411 0           1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/;
412             }
413 0           return $for_commas;
414             }
415             #--------------------------
416             sub spydered {
417 0     0 1   my ($self) = @_;
418             return wantarray ?
419 0           keys %{ $self->{_spydered} } :
  0            
420 0 0         scalar keys %{ $self->{_spydered} };
421             }
422             #--------------------------
423             #sub exclude { # what about FILES TYPES!?
424             # return undef; # not working yet!
425             # my ($self,$thing) = @_;
426             # if ( $thing =~ m<^[^:]{3,5}://> )
427             # {
428             # return $self->{_Xklood}{_domain}{$thing}++;
429             # }
430             # elsif ( $thing )
431             # {
432             # return $self->{_Xklood}{_name}{$thing}++;
433             # }
434             #}
435             #--------------------------
436             #sub excluded_domains {
437             # return undef; # not working yet!
438             # my ($self) = @_;
439             # return wantarray ?
440             # keys %{$self->{_Xklood}{_domain}} :
441             # [ keys %{$self->{_Xklood}{_domain}} ];
442             #}
443             #--------------------------
444             #sub excluded_names {
445             # return undef; # not working yet!
446             # my ($self) = @_;
447             # return wantarray ?
448             # keys %{$self->{_Xklood}{_name}} :
449             # [ keys %{$self->{_Xklood}{_name}} ];
450             #}
451             #--------------------------
452             sub go_to_seed {
453 0     0 1   my ( $self, $engine, $query ) = @_;
454 0           carp "go_to_seed() is not functional yet!\n";
455 0           return; # NOT FUNCTIONAL
456 0           my $seed = WWW::Spyder::Seed::get_seed($engine, $query);
457 0           $self->seed($seed);
458             }
459             #--------------------------
460             sub verbosity {
461 0     0 1   my ( $self, $verbosity ) = @_;
462 0 0 0       carp "Not setting verbosity! Must be integer b/t 1 & 6!\n"
463             and return
464             unless $verbosity;
465 0           $VERBOSITY = $verbosity;
466             }
467             #--------------------------
468              
469             #--------------------------
470             # PRIVATE Spyder Methods
471             #--------------------------
472             sub _reset {
473             # RESET MORE THAN THIS!?! make sure all the memory space is clean that
474             # needs be for clean iteration???
475 0     0     my ($self) = @_;
476 0           $self->{$_} = undef for qw( _linkText _linkSwitch _href _src
477             _current_enQ _page_content
478             _current_Bytes _alt _enqueue_Objects
479             _text );
480             }
481             #--------------------------
482             sub _current_enQ {
483 0     0     my ($self, $enQ) = @_;
484 0           my $last_enQ = $self->{_current_enQ};
485 0 0         $self->{_current_enQ} = $enQ if $enQ;
486 0           return $last_enQ; #<<-so we can get last while setting a new one
487             }
488             #--------------------------
489             sub _enqueue {
490 0     0     my ($self,$enQ) = @_;
491 0 0         push @{$self->{_enqueue_Objects}}, $enQ if $enQ;
  0            
492 0           return $self->{_enqueue_Objects};
493             }
494             #--------------------------
495             sub _links {
496 0     0     my ($self) = @_;
497 0           return [ map { $_->url } @{$self->_enqueue} ];
  0            
  0            
498             }
499             #--------------------------
500             sub _seen {
501 0     0     my ($self,$url) = @_;
502 0           return $self->{_seenURLs}{$url}++;
503             }
504             #--------------------------
505             sub _parse_for_terms {
506 0     0     my ($self) = @_;
507 0           $self->{_page_terms_matches} = 0;
508              
509 0 0         return 0 unless $self->{_text};
510              
511 0           while ( my ($term,$rx) = each %{$self->terms} ) {
  0            
512 0           $self->{_page_terms_matches}++ for
513             $self->{_page_content} =~ /$rx/g;
514             }
515              
516 0           my $index = int( ( $self->{_page_terms_matches} /
517             length($self->{_text}) ) * 1_000 );
518             # the algorithm might look it but isn't entirely arbitrary
519              
520 0 0         print " PARSE TERMS : $self->{_page_terms_matches} " .
521             "/ $self->{_current_Bytes}\n" if $VERBOSITY > 1;
522              
523 0 0         return 7 if $index > 25;
524 0 0         return 6 if $index > 18;
525 0 0         return 5 if $index > 14;
526 0 0         return 4 if $index > 11;
527 0 0         return 3 if $index > 7;
528 0 0         return 2 if $index > 3;
529 0 0         return 1 if $index > 0;
530 0           return 0;
531             }
532             #--------------------------
533             sub _install_html_parser {
534 0     0     my ($self) = @_;
535              
536             my $Parser = HTML::Parser->new
537             (
538             start_h =>
539             [sub {
540 1     1   7 no warnings;
  1         2  
  1         508  
541 0     0     my ( $tag, $attr ) = @_;
542              
543             # Check for broken image links if requested.
544 0 0 0       return if $tag !~ /^(?:a|img)$/ && ! $self->{image_checking};
545              
546             # need to deal with AREA tags from maps /^(?:a(?:rea)?|img)$/;
547 0           $attr->{href} =~ s,#[^/]*$,,;
548 0 0         $attr->{src} =~ s,#[^/]*$,, if $self->{image_checking};
549 0 0         return if lc($attr->{href}) =~ m,^\s*mailto:,;
550 0 0         return if lc($attr->{href}) =~ m,^\s*file:,;
551 0 0         return if lc($attr->{href}) =~ m,javascript:,;
552              
553 0 0 0       $self->{_src} ||= $attr->{src} if $self->{image_checking};
554 0   0       $self->{_href} ||= $attr->{href};
555 0   0       $self->{_alt} ||= $attr->{alt};
556 0           $self->{_linkSwitch} = 1;
557              
558             # Don't wait for the end handler if we have an image, as an image
559             # src tag doesn't have an end.
560 0 0 0       if ($attr->{src} && $self->{image_checking} && ! $attr->{href}) {
      0        
561 0   0       $self->{_linkText} ||= $self->{_alt} || '+';
      0        
562 0           decode_entities($self->{_linkText});
563              
564 0           push @{$self->{_enqueue_Objects}},
  0            
565             [ $self->{_href}, $self->{_linkText} ];
566              
567 0 0 0       push @{$self->{_enqueue_Objects}},
  0            
568             [ $self->{_src}, $self->{_linkText} ]
569             if $self->{_src} and $self->{image_checking};
570              
571             # reset all our caching variables
572 0           $self->{_linkSwitch} = $self->{_href} = $self->{_alt} =
573             $self->{_src} = $self->{_linkText} = undef;
574              
575 0           return;
576             }
577             }, 'tagname, attr'],
578             text_h =>
579             [sub {
580              
581 0 0   0     return unless(my $it = shift);
582 0 0         return if $it =~
583             m/(?:\Q<!--\E)/; # skip literal comment markup
584 0           ${$self->{_text}} .= $it;
  0            
585 0 0         $self->{_linkText} .= $it
586             if $self->{_linkSwitch};
587             }, 'dtext'],
588             end_h =>
589             [sub {
590 0     0     my ( $tag ) = @_;
591 1     1   13 no warnings; # only problem: Links
  1         2  
  1         435  
592              
593 0 0         if ($self->{image_checking}) {
594 0 0 0       return unless $tag eq 'a' or $self->{_linkSwitch} or
      0        
595             $tag eq 'img';
596             } else {
597 0 0 0       return unless $tag eq 'a' or $self->{_linkSwitch};
598             }
599              
600 0   0       $self->{_linkText} ||= $self->{_alt} || '+';
      0        
601 0           decode_entities($self->{_linkText});
602              
603 0           push @{$self->{_enqueue_Objects}},
  0            
604             [ $self->{_href}, $self->{_linkText} ];
605              
606 0 0 0       push @{$self->{_enqueue_Objects}},
  0            
607             [ $self->{_src}, $self->{_linkText} ]
608             if $self->{_src} and $self->{image_checking};
609              
610             # reset all our caching variables
611 0           $self->{_linkSwitch} = $self->{_href} = $self->{_alt} = $self->{_src} =
612             $self->{_linkText} = undef;
613 0           }, 'tagname'],
614             default_h => [""],
615             );
616 0           $Parser->ignore_elements(qw(script style));
617 0           $Parser->unbroken_text(1);
618 0           $self->html_parser($Parser);
619             }
620             #--------------------------
621             sub _install_web_agent {
622 0     0     my $self = shift;
623 0           my $jar_jar = undef;
624              
625             # If a LWP::UserAgent object was passed in to our constructor, use
626             # it.
627 0 0         if ($self->{UA}) {
628 0           $self->UA( $self->{UA} );
629              
630             # Otherwise, create a new one.
631             } else {
632 0           $self->UA( LWP::UserAgent->new );
633             }
634              
635 0           $self->UA->agent('Mozilla/5.0');
636 0           $self->UA->timeout(30);
637 0           $self->UA->max_size(250_000);
638              
639             # Get our cookie from our the jar passed in.
640 0 0         if ($self->{UA}) {
641 0           $jar_jar = $self->{UA}->cookie_jar();
642              
643             # Or else create a new cookie.
644             } else {
645 0   0       $jar_jar = HTTP::Cookies->new
646             (file => $self->cookie_file || "$ENV{HOME}/spyderCookies",
647             autosave => 1,
648             max_cookie_size => 4096,
649             max_cookies_per_domain => 5, );
650             }
651              
652 0           $self->UA->cookie_jar($jar_jar);
653             }
654             #--------------------------
655             sub _install_exit_check {
656 0     0     my ($self, $arg) = @_;
657 0           my $class = ref $self;
658              
659 0 0 0       unless ( ref($arg) and ref($arg->{exit_on}) eq 'HASH' ) {
660 1     1   6 no strict "refs";
  1         1  
  1         310  
661 0           *{$class."::_exit_check"} =
662 0 0   0     sub { return 1 unless $self->queue_count;
663 0           return 0;
664 0           };
665 0           return;
666             }
667              
668             # checks can be: links => #, success => ratio, time => 10min...
669             # a piece of code we're going to build up to eval into method-hood
670 0           my $SUB = 'sub { my $self = shift; ' .
671             'return 1 unless $self->queue_count; ';
672             #------------------------------------------------------------
673 0 0         if ( $arg->{exit_on}{pages} ) {
674 0 0         print "Installing EXIT on links: $arg->{exit_on}{pages}\n"
675             if $VERBOSITY > 1;
676 0           $SUB .= ' return 1 if ' .
677             '$self->spydered >= ' .$arg->{exit_on}{pages} .';';
678             }
679             #------------------------------------------------------------
680 0 0         if ( $arg->{exit_on}{success} ) {
681             #set necessary obj value and add to sub code
682             }
683             #------------------------------------------------------------
684 0 0         if ( $arg->{exit_on}{time} ) {
685 0 0         print "Installing EXIT on time: $arg->{exit_on}{time}\n"
686             if $VERBOSITY > 1;
687              
688 0           my ($amount,$unit) =
689             $arg->{exit_on}{time} =~ /^(\d+)\W*(\w+?)s?$/;
690             # skip final "s" in case of hours, secs, mins
691              
692 0           my %times = ( hour => 3600,
693             min => 60,
694             sec => 1 );
695              
696 0           my $time_factor = 0;
697 0           for ( keys %times ) {
698 0 0         next unless exists $times{$unit};
699 0           $time_factor = $amount * $times{$unit};
700             }
701 0           $self->_exit_epoch($time_factor + $^T);
702              
703 0           $SUB .= q{
704             return 1 if $self->_exit_epoch < time();
705             };
706             }
707             #------------------------------------------------------------
708 0           $SUB .= '}';
709              
710 1     1   4 no strict "refs";
  1         2  
  1         748  
711 0           *{$class."::_exit_check"} = eval $SUB;
  0            
712             }
713             #--------------------------
714             sub _choose_courteously {
715 0     0     my $self = shift;
716              
717             # w/o the switch and $i-- it acts a bit more depth first. w/ it, it's
718             # basically hard head down breadth first
719 0 0         print "CHOOSING courteously!\n" if $VERBOSITY > 1;
720 0           for my $Q ( $self->_queues ) {
721 0 0         print "Looking for URL in $Q\n" if $VERBOSITY > 2;
722 0 0 0       next unless $self->{$Q} and @{$self->{$Q}} > 0;
  0            
723 0           my %seen;
724 0           my $total = scalar @{$self->{$Q}};
  0            
725 0           my $switch;
726 0           for ( my $i = 0; $i < @{$self->{$Q}}; $i++ ) {
  0            
727 0           my $enQ = $self->{$Q}[$i];
728 0           my ($url,$name) = ( $enQ->url, $enQ->name );
729              
730             # if we see one again, we've reshuffled as much as is useful
731 0 0         $seen{$url}++
732             and $switch = 1; # progress through to next Q
733              
734 0 0         return splice(@{$self->{$Q}},$i,1)
  0            
735             unless $self->_courtesy_call($enQ);
736              
737 0           my $fair_bump = int( log( $total - $i ) / log(1.5) );
738              
739 0           my $move_me_back = splice(@{$self->{$Q}},$i,1);
  0            
740 0           splice(@{$self->{$Q}},($i+$fair_bump),0,$move_me_back);
  0            
741 0 0         $i-- unless $switch;
742             }
743             }
744             # we couldn't pick one courteously
745             } # end of _choose_courteously()
746             #--------------------------
747             sub _just_choose {
748 0     0     my $self = shift;
749 0 0         print "CHOOSING first up!\n" if $VERBOSITY > 1;
750              
751 0           my $enQ;
752 0           for my $Q ( $self->_queues ) {
753 0 0         next unless ref($self->{$Q}) eq 'ARRAY';
754 0           $enQ = shift @{$self->{$Q}};
  0            
755 0           last;
756             }
757 0           my $tax = $self->_courtesy_call($enQ);
758 0 0         if ( $VERBOSITY > 4 ) {
759 0           print ' QUEUE: ';
760 0 0         print join("-:-", @{$self->{_courtesy_Queue}}), "\n"
  0            
761             if $self->{_courtesy_Queue};
762             }
763 0           my $sleep = int(rand($self->sleep_base)) + $tax;
764              
765 0 0         if ( $VERBOSITY ) {
766 0 0         printf "COURTESY NAP %d second%s ",
767             $sleep, $sleep == 1 ?'':'s';
768 0 0         printf "(Domain recently seen: %d time%s)\n",
769             $tax, $tax == 1 ?'':'s';
770             }
771 0           sleep $sleep; # courtesy to websites but human-ish w/ random
772 0           $self->slept($sleep);
773 0           return $enQ;
774             }
775             #--------------------------
776             sub _courtesy_call {
777 0     0     my ($self,$enQ) = @_;
778 0 0         return 0 unless $enQ;
779 0           my $domain = $enQ->domain;
780              
781 0 0         print 'COURTESY check: ', $domain, "\n" if $VERBOSITY > 5;
782              
783             # yes, we have seen it in the last whatever GETs
784 0           my $seen = 0;
785 0           $seen = scalar grep { $_ eq $domain }
  0            
786 0           @{$self->{_courtesy_Queue}};
787 0 0         $seen = 10 if $seen > 10;
788 0           return $seen;
789             }
790             #--------------------------
791             sub _queues { # Q9 is purely for trash so it's not returned here
792             return wantarray ?
793 0 0   0     ( 0 .. 9 ) :
794             '0';
795             }
796             #--------------------------
797             sub _snip {
798 0 0   0     my $self = shift if ref($_[0]);
799 0           my ( @text ) = @_;
800 0           s/^\s+//, s/\s+$//, s/\s+/ /g for @text;
801 0 0         return wantarray ? @text : shift @text;
802             }
803             #--------------------------
804             # Spyder ENDS
805             #--------------------------
806             }# WWW::Spyder privacy ends
807              
808              
809             #--------------------------
810             package WWW::Spyder::Enqueue;
811             #--------------------------
812             {
813 1     1   5 use Carp;
  1         2  
  1         116  
814             #---------------------------------------------------------------------
815 1         6 use overload( q{""} => '_stringify',
816 1     1   6 fallback => 1 );
  1         2  
817             #---------------------------------------------------------------------
818             # 0 -->> URL
819             # 1 -->> name, if any, of link URL was got from
820             # 2 -->> domain
821             #--------------------------
822             sub new {
823 0     0     my ( $caller, $url, $name ) = @_;
824 0   0       my $class = ref($caller) || $caller;
825 0 0         croak "Here I am. " if ref $url;
826 0 0         return undef unless $url;
827 0 0         if ( length($url) > 512 ) { # that's toooo long, don't you think?
828 0           $url = substr($url,0,512);
829             }
830 0 0 0       if ( $name and length($name) > 512 ) {
831 0           $name = substr($name,0,509) . '...';
832             }
833 0 0         $name = '-' unless $name; # need this to find a bug later
834 0           my ( $domain ) = $url =~ m,^[^:]+:/+([^/]+),;
835 0           bless [ $url, $name, lc($domain) ], $class;
836             }
837             #--------------------------
838             sub url {
839 0     0     return $_[0]->[0];
840             }
841             #--------------------------
842             sub name {
843 0     0     return $_[0]->[1];
844             }
845             #--------------------------
846             sub domain {
847 0     0     return $_[0]->[2];
848             }
849             #--------------------------
850             sub _stringify {
851 0     0     return $_[0]->[0];
852             }
853             #--------------------------
854             }#privacy for WWW::Spyder::Enqueue ends
855              
856              
857             #--------------------------
858             package WWW::Spyder::Page;
859             #--------------------------
860 1     1   279 use strict;
  1         1  
  1         27  
861 1     1   3 use warnings;
  1         2  
  1         22  
862 1     1   15 use Carp;
  1         1  
  1         95  
863             {
864             sub new {
865 0     0     my ( $caller, %arg ) = @_;
866 0   0       my $class = ref($caller) || $caller;
867 0           my $self = bless {}, $class;
868              
869 0           while ( my ( $method, $val ) = each %arg ) {
870              
871 1     1   4 no strict "refs";
  1         1  
  1         22  
872 1     1   4 no warnings;
  1         1  
  1         410  
873 0           my $attribute = '_' . $method;
874              
875 0 0         if ( ref $val eq 'ARRAY' ) {
876 0           *{"$class::$method"} = sub {
877 0     0     my($self,$arg) = @_;
878 0 0         return @{$self->{$attribute}} unless $arg;
  0            
879 0           push(@{$self->{$attribute}}, @{$arg});
  0            
  0            
880 0           };
881             } else {
882 0           *{"$class::$method"} = sub {
883 0     0     my($self,$arg) = @_;
884             # get if already set and deref if needed
885 0 0 0       if ( not $arg and exists $self->{$attribute} ) {
886 0           return ref($self->{$attribute}) eq 'SCALAR' ?
887 0 0         ${$self->{$attribute}} : $self->{$attribute};
888             }
889 0 0         $self->{$attribute} = $arg if $arg; #only set one time!
890 0           };
891             }
892 0           $self->$method($val);
893             }
894 0           return $self;
895             }
896             #--------------------------
897             sub links {
898 0     0     my ( $self ) = @_;
899 0           return map {$_->url} @{$self->{_pages_enQs}};
  0            
  0            
900             }
901             #--------------------------
902             sub next_link {
903 0     0     my ( $self ) = @_;
904 0           shift @{$self->{_pages_enQs}};
  0            
905             }
906             #--------------------------
907             }#privacy for ::Page ends
908              
909              
910             #--------------------------
911             package WWW::Spyder::Exclusions;
912             #--------------------------
913             {
914             # THIS PACKAGE IS NOT BEING USED
915             my %_domains = qw(
916             ad.doubleclick.net 1
917             ads.clickagents.com 1
918             );
919             my %_names = qw(
920              
921             );
922             #--------------------------
923             sub exclude_domain {
924 0     0     $_domains{+shift}++; # unary plus so "shift" isn't autoquoted
925             }
926             #--------------------------
927             sub excluded {
928 0     0     my $what = shift;
929 0 0         exists $_domains{$what} || $_names{$what};
930             }
931             #--------------------------
932             }#privacy ends
933              
934              
935             #--------------------------
936             package WWW::Spyder::Seed;
937             #--------------------------
938             {
939 1     1   11 use URI::Escape;
  1         2  
  1         48  
940 1     1   4 use Carp;
  1         1  
  1         270  
941              
942             my %engine_url =
943             (
944             google => 'http://www.google.com/search?q=',
945             yahoo => 1
946             );
947              
948             # should we exclude the search domain at this point? i think so because
949             # otherwise we've introduced dozens of erroneous links and the engine
950             # is gonna get hammered over time for it
951             #--------------------------
952             sub get_seed {
953              
954 0   0 0     my $engine = shift || croak "Must provide search engine! " .
955             join(', ', sort keys %engine_url) . "\n";
956              
957 0   0       my $query = shift || croak "Must provide query terms!\n";
958 0           $query = uri_escape($query);
959              
960 0 0         croak "$engine is not a valid choice!\n"
961             unless exists $engine_url{lc$engine};
962              
963 0           return $engine_url{lc$engine} . $query;
964             }
965              
966             } # Privacy for WWW::Spyder::Seed ends
967              
968             1;
969              
970             # Plain Old D'errrrr
971              
972             =pod
973              
974             =head1 NAME
975              
976             WWW::Spyder - a simple non-persistent web crawler.
977              
978             =head1 VERSION
979              
980             0.24
981              
982             =head1 SYNOPSIS
983              
984             A web spider that returns plain text, HTML, and other information per
985             page crawled and can determine what pages to get and parse based on
986             supplied terms compared to the text in links as well as page content.
987              
988             use WWW::Spyder;
989             # Supply your own LWP::UserAgent-compatible agent.
990             use WWW::Mechanize;
991              
992             my $start_url = "http://my-great-domain.com/";
993             my $mech = WWW::Mechanize->new(agent => "PreferredAgent/0.01");
994              
995             my $spyder = WWW::Spyder->new(
996             report_broken_links => 1,
997             seed => $start_url,
998             sleep_base => 5,
999             UA => $mech
1000             );
1001             while ( my $page = $spyder->crawl ) {
1002             # do something with the page...
1003             }
1004              
1005             =head1 METHODS
1006              
1007             =over 2
1008              
1009             =item * $spyder->new()
1010              
1011             Construct a new spyder object. Without at least the seed() set, or
1012             go_to_seed() turned on, the spyder isn't ready to crawl.
1013              
1014             $spyder = WWW::Spyder->new(shift||die"Gimme a URL!\n");
1015             # ...or...
1016             $spyder = WWW::Spyder->new( %options );
1017              
1018             Options include: sleep_base (in seconds), exit_on (hash of methods and
1019             settings), report_broken_links, image_checking (verifies the images
1020             pointed to by img src tags), disable_cnap (disables the courtesy nap
1021             when verbose output is enabled), and UA (you can pass in an instantiated
1022             LWP::UserAgent object via UA, i.e. UA => $ua_obj). Examples below.
1023              
1024             =item * $spyder->seed($url)
1025              
1026             Adds a URL (or URLs) to the top of the queues for crawling. If the
1027             spyder is constructed with a single scalar argument, that is considered
1028             the seed.
1029              
1030             =item * $spyder->bell([bool])
1031              
1032             This will print a bell ("\a") to STDERR on every successfully crawled
1033             page. It might seem annoying but it is an excellent way to know your
1034             spyder is behaving and working. A true value turns it on. Right now it
1035             can't be turned off.
1036              
1037             =item * $spyder->spyder_time([bool])
1038              
1039             Returns raw seconds since the spyder was created if given a true
1040             value, otherwise returns a string like "D day(s) HH:MM:SS."
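
For example (output shapes are illustrative):

    print $spyder->spyder_time, "\n";    # "0 days 00:02:17"
    print $spyder->spyder_time(1), "\n"; # raw seconds, e.g. 137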
1041              
1042             =item * $spyder->terms([list of terms to match])
1043              
1044             The more terms, the more the spyder is going to grasp at. If you give
1045             a straight list of strings, they will be turned into very open
1046             regexes. E.g.: "king" would match "sulking" and "kinglet" but not
1047             "King." It is case sensitive right now. If you want more specific
1048             matching or different behavior, pass your own regexes instead of
1049             strings.
1050              
1051             $spyder->terms( qr/\bkings?\b/i, qr/\bqueens?\b/i );
1052              
1053             terms() is only settable once right now, then it's a done deal.
1054              
1055             =item * $spyder->spyder_data()
1056              
1057             A comma-formatted number of kilobytes retrieved so far. B<Don't> give
1058             it an argument. It's a set/get routine.
1059              
1060             =item * $spyder->slept()
1061              
1062             Returns the total number of seconds the spyder has slept while
1063             running. Useful for getting accurate page/time counts (spyder
1064             performance) discounting the added courtesy naps.
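
For instance, a rough pages-per-second figure that discounts the naps
(a sketch; guard against a zero denominator early in a run):

    my $active = $spyder->spyder_time(1) - ($spyder->slept || 0);
    printf "%.2f pages/sec\n", $spyder->spydered / $active
        if $active > 0;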
1065              
1066             =item * $spyder->UA->...
1067              
1068             The user agent. It should be an L<LWP::UserAgent> or a well-behaved
1069             subclass like L<WWW::Mechanize>. Here are the initialized values you
1070             might want to tweak:
1071              
1072             $spyder->UA->timeout(30);
1073             $spyder->UA->max_size(250_000);
1074             $spyder->UA->agent('Mozilla/5.0');
1075              
1076             Changing the agent name can hurt your spyder because some servers won't
1077             return content unless it's requested by a "browser" they recognize.
1078              
1079             You should probably add your email with from() as well.
1080              
1081             $spyder->UA->from('bluefintuna@fish.net');
1082              
1083             =item * $spyder->cookie_file([local_file])
1084              
1085             They live in $ENV{HOME}/spyderCookies by default but you can set your
1086             own file if you prefer or want to save different cookie files for
1087             different spyders.
1088              
1089             =item * $spyder->get_broken_links
1090              
1091             Returns a reference to a list of broken link URLs if report_broken_links
1092             was enabled in the constructor.
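
A minimal sketch (the seed URL and page limit are placeholders; the
list is empty or undef until a broken link is actually seen):

    use WWW::Spyder;

    my $spyder = WWW::Spyder->new(
        seed                => 'http://my-great-domain.com/',
        report_broken_links => 1,
        exit_on             => { pages => 50 },
    );
    1 while $spyder->crawl;  # crawl until the exit condition trips
    print "Broken: $_\n" for @{ $spyder->get_broken_links || [] };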
1093              
1094             =item * $spyder->go_to_seed
1095              
1096             =item * $spyder->queue_count
1097              
1098             =item * $spyder->show_attributes
1099              
1100             =item * $spyder->spydered
1101              
1102             =item * $spyder->crawl
1103              
1104             Returns (and removes) a Spyder page object from the queue of spydered pages.
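
As the source above shows, crawl() also accepts an optional hash
reference; an exclude key holding an array reference of patterns
(strings or qr// regexes) makes matching URLs get skipped:

    my $opts = { exclude => [ qr/\.pdf$/i, 'doubleclick' ] };
    while ( my $page = $spyder->crawl($opts) ) {
        print $page->url, "\n";
    }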
1105              
1106             =back
1107              
1108             =head2 Spyder::Page methods
1109              
1110             =over 6
1111              
1112             =item * $page->title
1113              
1114             =item * $page->text
1115              
1116             =item * $page->raw
1117              
1118             =item * $page->url
1119              
1120             =item * $page->domain
1121              
1122             =item * $page->link_name
1123              
1124             =item * $page->link
1125              
1126             =item * $page->description
1127              
1128             =item * $page->pages_enQs
1129              
1130             =back
1131              
1132             =head2 Weird courteous behavior
1133              
1134             Courtesy didn't use to be weird, but that's another story. You will
1135             probably notice that the courtesy routines force a sleep when a
1136             recently seen domain is the only choice for a new link. The sleep is
1137             partially randomized. This is to prevent the spyder from being
1138             recognized in weblogs as a robot.
1139              
1140             =head2 The web and courtesy
1141              
1142             B<Please>, I beg of thee, exercise the most courtesy you can. Don't
1143             let impatience get in the way. Bandwidth and server traffic are
1144             C<$MONEY> for real. The web is an extremely disorganized and corrupted
1145             database at the root but companies and individuals pay to keep it
1146             available. The less pain you cause by banging away on a webserver with
1147             a web agent, the more welcome the next web agent will be.
1148              
1149             B<Note>: Google seems to be excluding generic LWP agents now. See, I
1150             told you so. A single parallel robot can really hammer a major server,
1151             even someone with as big a farm and as much bandwidth as Google.
1152              
1153             =head2 VERBOSITY
1154              
1155             =over 2
1156              
1157             =item * $spyder->verbosity([1-6]) -OR-
1158              
1159             =item * $WWW::Spyder::VERBOSITY = ...
1160              
1161             Set it from 1 to 6 right now to get varying amounts of extra info to
1162             STDOUT. It's an uneven scale and will be straightened out pretty soon.
1163             If kids have a preference for sending the info to STDERR, I'll do
1164             that. I might anyway.
1165              
1166             =back
1167              
1168             =head1 SAMPLE USAGE
1169              
1170             =head2 See "spyder-mini-bio" in this distribution
1171              
1172             It's an extremely simple, but fairly cool pseudo bio-researcher.
1173              
1174             =head2 Simple continually crawling spyder:
1175              
1176             In the following code snippet:
1177              
1178             use WWW::Spyder;
1179              
1180             my $spyder = WWW::Spyder->new( shift || die"Give me a URL!\n" );
1181              
1182             while ( my $page = $spyder->crawl ) {
1183              
1184             print '-'x70,"\n";
1185             print "Spydering: ", $page->title, "\n";
1186             print " URL: ", $page->url, "\n";
1187             print " Desc: ", $page->description || 'n/a', "\n";
1188             print '-'x70,"\n";
1189             while ( my $link = $page->next_link ) {
1190             printf "%22s ->> %s\n",
1191             length($link->name) > 22 ?
1192             substr($link->name,0,19).'...' : $link->name,
1193             length($link) > 43 ?
1194             substr($link,0,40).'...' : $link;
1195             }
1196             }
1197              
1198             as long as unique URLs are being found in the pages crawled, the
1199             spyder will never stop.
1200              
1201             Each "crawl" returns a page object which gives the following methods
1202             to get information about the page.
1203              
1204             =over 2
1205              
1210             =item * $page->title
1211              
1212             Page's Title if there is one.
1213              
1214             =item * $page->text
1215              
1216             The parsed plain text out of the page. Uses HTML::Parser and tries to
1217             ignore non-readable stuff like comments and scripts.
1218              
1219             =item * $page->url
1220              
1221             =item * $page->domain
1222              
1223             =item * $page->raw
1224              
1225             The content returned by the server. Should be HTML.
1226              
1227             =item * $page->description
1228              
1229             The META description of the page if there is one.
1230              
1231             =item * $page->links
1232              
1233             Returns a list of the URLs in the page. Note: next_link() will shift
1234             the available list of links() each time it's called.
1235              
1236             =item * $link = $page->next_link
1237              
1238             next_link() destructively returns the next URI-ish object in the page.
1239             They are objects with three accessors.
1240              
1241             =back
1242              
1243             =over 6
1244              
1245             =item * $link->url
1246              
1247             This is also overloaded so that interpolating "$link" will get the
1248             URL just as the method does.
1249              
1250             =item * $link->name
1251              
1252             =item * $link->domain
1253              
1254             =back
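
Because these objects stringify to their URL, the explicit accessor
and plain interpolation are interchangeable:

    while ( my $link = $page->next_link ) {
        print $link->url, "\n"; # explicit accessor
        print "$link\n";        # overloaded stringification, same URL
    }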
1255              
1256             =head2 Spyder that will give up the ghost...
1257              
1258             The following spyder is initialized to stop crawling when I<either> of
1259             its conditions is met: 10 minutes pass or 300 pages are crawled.
1260              
1261             use WWW::Spyder;
1262              
1263             my $url = shift || die "Please give me a URL to start!\n";
1264              
1265             my $spyder = WWW::Spyder->new
1266             (seed => $url,
1267             sleep_base => 10,
1268             exit_on => { pages => 300,
1269             time => '10min', },);
1270              
1271             while ( my $page = $spyder->crawl ) {
1272              
1273             print '-'x70,"\n";
1274             print "Spydering: ", $page->title, "\n";
1275             print " URL: ", $page->url, "\n";
1276             print " Desc: ", $page->description || '', "\n";
1277             print '-'x70,"\n";
1278             while ( my $link = $page->next_link ) {
1279             printf "%22s ->> %s\n",
1280             length($link->name) > 22 ?
1281             substr($link->name,0,19).'...' : $link->name,
1282             length($link) > 43 ?
1283             substr($link,0,40).'...' : $link;
1284             }
1285             }
1286              
1287             =head2 Primitive page reader
1288              
1289             use WWW::Spyder;
1290             use Text::Wrap;
1291              
1292             my $url = shift || die "Please give me a URL to start!\n";
1293             @ARGV or die "Please also give me a search term.\n";
1294             my $spyder = WWW::Spyder->new;
1295             $spyder->seed($url);
1296             $spyder->terms(@ARGV);
1297              
1298             while ( my $page = $spyder->crawl ) {
1299             print '-'x70,"\n * ";
1300             print $page->title, "\n";
1301             print '-'x70,"\n";
1302             print wrap('','', $page->text);
1303             sleep 60;
1304             }
1305              
1306             =head1 TIPS
1307              
1308             If you are going to do anything important with it, implement some
1309             signal blocking to prevent accidental problems and tie your gathered
1310             information to a DB_File or some such.
1311              
1312             You might want to load C<POSIX> and call C<nice(20)>. It should top the nice off
1313             at your system's max and prevent your spyder from interfering with
1314             your system.
1315              
1316             You might want to set $| = 1.
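
A sketch tying those tips together (the database file name, signal
choice, and seed URL are arbitrary):

    use WWW::Spyder;
    use POSIX ();
    use DB_File;

    POSIX::nice(20); # tops the nice off at the system's max
    $| = 1;

    tie my %crawled, 'DB_File', 'spydered.db';
    $SIG{INT} = sub { untie %crawled; exit }; # leave the DB consistent

    my $spyder = WWW::Spyder->new( seed => 'http://my-great-domain.com/' );
    while ( my $page = $spyder->crawl ) {
        $crawled{ $page->url } = $page->title || '';
    }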
1317              
1318             =head1 PRIVATE METHODS
1319              
1320             =head2 are private but hack away if you're inclined
1321              
1322             =head1 TO DO
1323              
1324             I<Spyder> is conceived to live in a future namespace as a servant class
1325             for a complex web research agent with simple interfaces to
1326             pre-designed grammars for research reports; or self-designed
1327             grammars/reports (might be implemented via Parse::FastDescent if that
1328             lazy-bones Conway would just find another 5 hours in the paltry 32
1329             hour day he's presently working).
1330              
1331             I'd like the thing to be able to parse RTF, PDF, and perhaps even
1332             resource sections of image files but that isn't on the radar right
1333             now.
1334              
1335             The tests should work differently. Currently they ask for outside
1336             resources without checking if there is either an open way to do it or
1337             if the user approves of it. Bad form all around.
1338              
1339             =head1 TO DOABLE BY 1.0
1340              
1341             Add 2-4 sample scripts that are a bit more useful.
1342              
1343             There are many functions that should be under the programmer's control
1344             and not buried in the spyder. They will emerge soon. I'd like to put
1345             in hooks to allow the user to keep(), toss(), or exclude() URLs, link
1346             names, and domains while crawling.
1347              
1348             Clean up some redundant, sloppy, and weird code. Probably change or
1349             remove the AUTOLOAD.
1350              
1351             Put in a go_to_seed() method and a subclass, ::Seed, with rules to
1352             construct query URLs by search engine. It would be the autostart or the
1353             fallback for perpetual spyders that run out of links. It would hit a
1354             given or default search engine with the I<Spyder>'s terms as the query.
1355             Obviously this would only work with terms() defined.
1356              
1357             Implement auto-exclusion for failure vs. success rates on names as well
1358             as domains (maybe URI suffixes too).
1359              
1360             Turn the length of the courtesy queue into the breadth/depth setting?
1361             Make it automatically adjusting...?
1362              
1363             Exclude consistently found link names from term strength sorting?
1364             E.g., "privacy policy," "read more," "copyright..."
1365              
1366             Fix some image tag parsing problems and add area tag parsing.
1367              
1368             Configuration for user:password by domain.
1369              
1370             ::Page objects become reusable so that a spyder only needs one.
1371              
1372             ::Enqueue objects become indexed so they are nixable from anywhere.
1373              
1374             Expand exit_on routines to size, slept time, dwindling success ratio,
1375             and maybe more.
1376              
1377             Make methods to set "skepticism" and "effort" which will influence the
1378             way the terms are used to keep, order, and toss URLs.
1379              
1380             =head1 BE WARNED
1381              
1382             This module already does some extremely useful things, but it's in its
1383             infancy and is conceived to live in a different namespace, perhaps
1384             becoming more private as a subservient part of a parent class. This
1385             may never happen but it's the idea. So don't put this into
1386             production code yet. I am endeavoring to keep its interface constant
1387             either way. That said, it could change completely.
1388              
1389             =head2 Also!
1390              
1391             This module saves cookies to the user's home directory. There will be
1392             more control over cookies in the future, but that's how it is right
1393             now. They live in $ENV{HOME}/spyderCookie.
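                 If you want them elsewhere, one workaround is to build your own
                 L<LWP::UserAgent> with its own jar and hand it to the constructor
                 via the C<UA> argument. A sketch, assuming the passed C<UA> is
                 used as-is; the file path is a placeholder:

                     use LWP::UserAgent;
                     use HTTP::Cookies;

                     my $ua = LWP::UserAgent->new;
                     $ua->cookie_jar( HTTP::Cookies->new(
                         file     => "$ENV{HOME}/.my-spyder-cookies",
                         autosave => 1,
                     ) );

                     my $spyder = WWW::Spyder->new( seed => $url, UA => $ua );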
1394              
1395             =head2 Anche!
1396              
1397             Robot Rules aren't respected. I<Spyder> endeavors to be polite as far
1398             as server hits are concerned, but doesn't take "no" for an answer right
1399             now. I want to add this, and not just by domain, but by page settings.
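                 Until that lands, you can pre-filter seeds yourself with
                 L<WWW::RobotRules>. A sketch; the agent name and URLs are
                 placeholders:

                     use WWW::RobotRules;
                     use LWP::Simple qw(get);

                     my $rules  = WWW::RobotRules->new('MySpyder/0.1');
                     my $robots = 'http://example.com/robots.txt';
                     my $txt    = get($robots);
                     $rules->parse($robots, $txt) if defined $txt;

                     # only seed the spyder with URLs robots.txt permits
                     $spyder->seed($url) if $rules->allowed($url);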
1400              
1401             =head1 UNDOCUMENTED FEATURES
1402              
1403             A.k.a. Bugs. Don't be ridiculous! Bugs in B<WWW::Spyder>?!
1404              
1405             There is a bug that is causing image C<src> attributes to be retrieved
1406             as links, I think, but I haven't tracked it down yet. I also think the
1407             plain text parsing has some problems which will be remedied shortly.
1408              
1409             If you are building more than one spyder in the same script, they are
1410             going to share the same exit_on parameters because it's a
1411             self-installing method. This will not always be so.
1412              
1413             See the B<TODO> file for more open and past issues.
1414              
1415             Let me know if you find any others. If you find one that is
1416             platform-specific, please send a patch or suggestion because I might
1417             not have any idea how to fix it.
1418              
1419             =head1 WHY C<Spyder>?
1420              
1421             I didn't want to use the more appropriate I<Spider> because I think
1422             there is a better one out there somewhere in the zeitgeist and the
1423             namespace future of I<Spyder> is uncertain. It may end up a
1424             semi-private part of a bigger family. And I may be King of Kenya
1425             someday. One's got to dream.
1426              
1427             If you like I<Spyder>, have feedback, wishlist usage, or better
1428             algorithms/implementations for any part of it, please let me know!
1429              
1430             =head1 THANKS TO
1431              
1432             Most all y'all. Especially Lincoln Stein, Gisle Aas, The Conway,
1433             Raphael Manfredi, Gurusamy Sarathy, and plenty of others.
1434              
1435             =head1 COMPARE WITH (PROBABLY PREFER)
1436              
1437             L<WWW::Mechanize>, L<WWW::Robot>, L<WWW::SimpleRobot>, L<LWP::RobotUA>,
1438             L<LWP::UserAgent>, and other kith and kin.
1439              
1440             =head1 LICENCE AND COPYRIGHT
1441              
1442             Copyright (c) 2001-2008, Ashley Pond V.
1443             All rights reserved.
1444              
1445             This module is free software; you can redistribute it and/or
1446             modify it under the same terms as Perl itself. See L<perlartistic>.
1447              
1448             =head1 DISCLAIMER OF WARRANTY
1449              
1450             BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
1451             FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
1452             OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
1453             PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
1454             EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1455             WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
1456             ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
1457             YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
1458             NECESSARY SERVICING, REPAIR, OR CORRECTION.
1459              
1460             IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
1461             WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
1462             REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
1463             LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
1464             OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
1465             THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
1466             RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
1467             FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
1468             SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
1469             SUCH DAMAGES.
1470              
1471             =cut