File Coverage

blib/lib/WWW/Monitor/Task.pm
Criterion Covered Total %
statement 21 219 9.5
branch 0 62 0.0
condition 0 15 0.0
subroutine 7 26 26.9
pod 19 19 100.0
total 47 341 13.7


line stmt bran cond sub pod time code
1             #WWW/monitor.pm. Written in 2007 by Yaron Kahanoitch. This
2             # source code has been placed in the public domain by the author.
3             # Please be kind and preserve the documentation.
4              
5              
6             package WWW::Monitor::Task;
7              
8              
9             #use 5.008;
10 1     1   20573 use warnings;
  1         2  
  1         28  
11 1     1   6 use strict;
  1         2  
  1         30  
12 1     1   958 use HTTP::Response;
  1         31425  
  1         32  
13 1     1   956 use HTTP::Request;
  1         689  
  1         22  
14 1     1   6 use HTTP::Headers;
  1         1  
  1         20  
15 1     1   4 use HTTP::Status;
  1         2  
  1         314  
16 1     1   1268 use HTML::TreeBuilder;
  1         4  
  1         13  
17             #use Carp;
18              
19              
20              
21             our(@ISA, @EXPORT, @EXPORT_OK, $VERSION);
22              
23             $VERSION = 0.24;
24              
25             @ISA = qw(
26             Exporter
27             );
28             @EXPORT = qw ();
29             @EXPORT_OK = qw ();
30              
31             our $HASH_SEPARATOR = "\n";
32             our $HASH_KEY_PREFIX = "__HASH_KEY__";
33              
34             =head1 NAME
35              
36             WWW::Monitor::Task - A Task class for monitoring single web page
37             against a cached version.
38              
39             =head1 VERSION
40              
41             Version 0.1
42              
43             =cut
44              
45             =head1 Description
46              
47             This class is responsible for tracking a single web page and reporting
48             changes. This class should be considered as a private asset of
49             L. For details please refer to
50              
51             =head1 EXPORT
52              
53             =head1 FUNCTIONS
54              
55             =head2 new
56              
57             A constructor.
58              
59             =cut
60              
61             sub new {
62 0     0 1   my $this = shift;
63 0           my %arg;
64 0 0         unless (@_ % 2) {
65 0           %arg = @_;
66             } else {
67 0           carp ("Parameters for WWW::Monitor::Task should be given as pair of 'OPTION'=>'VAL'");
68             }
69 0   0       my $class = ref($this) || $this;
70 0           my $self = {};
71 0 0         carp ("Url is not given") unless exists $arg{URL};
72 0           $self->{url} = $arg{URL};
73 0           $self->{cache} = $arg{CACHE};
74 0           bless($self, $class);
75             }
76              
77             =head2 run ( mechanize, carrier, )
78              
79             Executes Task. Parameters:
80              
81             mechanize - Web mechanize object.
82              
83             L assumes that the given object implements or
84             inherits WWW::mechnize abstraction. See
85             L.
86              
87             carrier- Object which will conduct the notification; see L for details
88              
89             cache - optional - A cache class.
90              
91             =cut
92              
93             sub run {
94 0     0 1   my $self = shift;
95 0           $self->{error} = "";
96 0           my ($mechanize,$carrier) = (shift,shift);
97 0           my $cache = "";
98 0 0         if (@_) { $cache = shift;}
  0            
99 0           my $url_i = $self->{url};
100 0 0         $self->{cache} = $cache if ($cache);
101 0           my $responses = {};
102              
103             #Get Url data. Output data is stored in the hash ref $responses.
104 0 0         $self->get_url_data($mechanize,$url_i,$responses) or return 0;
105              
106             #Compares Pages list with cache.
107 0           my ($url_keys_for_comapre,$old_pages_to_compare,$new_pagets_to_compare,$missing_pages,$added_pages,$existsInCache) = $self->sync_cache($url_i,$responses);
108              
109             # if a page does not exist in cache we don't want to notify this
110 0 0         return 1 unless ($existsInCache);
111              
112             #Activate Notification.
113 0           $self->be_notified($carrier,$url_i,$missing_pages,$added_pages,$old_pages_to_compare,$new_pagets_to_compare,$url_keys_for_comapre);
114 0           return 1;
115             }
116              
117             =head2 be_notified
118              
119             (Private method)
120             Tests if a page has changed. If yes, notification call back is being called.
121              
122             =cut
123              
124             sub be_notified {
125 0     0 1   my $self = shift;
126 0           my $notify_ind = 0;
127 0           my ($carrier,$url,$missing_pages,$added_pages,$old_pages_to_compare,$new_pages_to_compare,$url_keys_for_comapre) = @_;
128 0           my $cache = $self->{cache};
129 0           my $ret = 1;
130             #Extract textual information from missing pages.
131 0           $self->{missing_parts} = $missing_pages;
132 0           my $notify_ind1 = values(%$missing_pages);
133              
134             #Extract added information from added pages.
135 0           $self->{added_parts} = $added_pages;
136 0           my $notify_ind2 = values(%$added_pages);
137            
138 0           my $index = 0;
139             #Go over on all pages that exists in cache and perform textual comparison
140 0           $self->{changed} = {};
141 0 0         if (@$old_pages_to_compare) {
142 0           while ($index < scalar(@$old_pages_to_compare)) {
143 0           my $t1 = $self->format_html($old_pages_to_compare->[$index]);
144 0           my $t2 = $self->format_html($new_pages_to_compare->[$index]);
145            
146 0 0         if ($$t1 ne $$t2) {
147 0           my $tmp = [$old_pages_to_compare->[$index], $new_pages_to_compare->[$index] ];
148 0           $self->{changed}{$url_keys_for_comapre->[$index]} = $tmp;
149 0           $cache->set($url_keys_for_comapre->[$index],$new_pages_to_compare->[$index]->as_string);
150 0           $notify_ind = 1;
151             }
152 0           ++$index;
153             }
154             }
155              
156              
157             #If notification is required, perform it.
158 0 0 0       if ($notify_ind or $notify_ind1 or $notify_ind2) {
      0        
159 0           $self->{time1} = HTTP::Date::time2str($self->validity($url));
160 0           $self->{time2} = HTTP::Date::time2str(time());
161 0           $self->store_validity($url,time());
162 0           return $carrier->notify($url,$self);
163 0           } else { return 1;}
164             }
165              
166             =head2 is_html
167              
168             (Private method)
169             Return true if page is html
170              
171             =cut
172              
173             sub is_html {
174 0     0 1   my $self = shift;
175 0           my $response = shift;
176 0           return $response->header('Content-Type') =~ m%^text/html%;
177             }
178              
179             =head2 missing_parts
180              
181             Return hash reference which includes parts that exists only in old cached version. Every entry in the returned list is a reference to HTTP::REsponse object.
182              
183             =cut
184              
185             sub missing_parts {
186 0     0 1   my $self = shift;
187 0           return $self->{missing_parts};
188             }
189              
190             =head2 added_parts
191              
192             Return hash reference which includes parts that exists only in the new cached version.Every entry in the returned list is a reference to HTTP::REsponse object.
193              
194             =cut
195              
196             sub added_parts {
197 0     0 1   my $self = shift;
198 0           return $self->{added_parts};
199             }
200              
201             =head2 old_version_time_stamp
202              
203             Return the time when the url was previously cached. Time is returned in seconds since epoch.
204              
205             =cut
206              
207             sub old_version_time_stamp {
208 0     0 1   my $self = shift;
209 0           return $self->{time1};
210             }
211              
212             =head2 new_version_time_stamp
213              
214             Return the time when the url was queried. Time is returned in seconds since epoch.
215              
216             =cut
217              
218             sub new_version_time_stamp {
219 0     0 1   my $self = shift;
220 0           return $self->{time2};
221             }
222              
223             =head2 changed_parts
224              
225             Return a list that consists of all changed parts.
226              
227             =cut
228              
229             sub changed_parts {
230 0     0 1   my $self = shift;
231 0           return keys %{$self->{changed}};
  0            
232             }
233              
234             =head2 get_old_new_pair [ urls key ]
235              
236             Return a list of two elements. The first one is the old cached version and the second one is the new version.
237             The given url key must be one of the keys returned by changed_parts method.
238             Each of the pair two pairs is a reference to L object.
239              
240              
241              
242             =cut
243              
244             sub get_old_new_pair {
245 0     0 1   my $self = shift;
246 0           my $url_key = shift;
247 0 0         if (exists $self->{changed}{$url_key}) {
248 0           return @{$self->{changed}{$url_key}};
  0            
249             } else {
250 0           return 0;
251             }
252             }
253              
254             =head2 format_html [ leftmargin, rightmargin]
255              
256             Return a textual version of HTML
257             left and right margins set the margin for the returned data.
258              
259             =cut
260              
261             sub format_html {
262 0     0 1   my $self = shift;
263 0           my $response_ref = shift;
264 0           my $leftmargin = 0;
265 0           my $rightmargin = 120;
266              
267 0 0         if (@_) {
268 0           $leftmargin = shift;
269 0           $rightmargin = shift;
270             }
271            
272 0           my $reftype = ref($response_ref);
273 0 0 0       if (($reftype ne 'REF') and $self->is_html($response_ref)) {
    0          
274 0           my $tree = HTML::TreeBuilder->new->parse($response_ref->content);
275 0           my $formatter = HTML::FormatText->new(leftmargin => $leftmargin, rightmargin => $rightmargin);
276 0           my $ret = $formatter->format($tree);
277 0           return \$ret;
278             } elsif ($reftype eq 'REF') { #Backward compatibility case to ver 0.126
279 0           my $tree = HTML::TreeBuilder->new->parse($response_ref);
280 0           my $formatter = HTML::FormatText->new(leftmargin => $leftmargin, rightmargin => $rightmargin);
281 0           my $ret = $formatter->format($tree);
282 0           return \$ret;
283             } else { #We have non html data
284 0           my $content = $response_ref->content;
285 0           return \$content;
286             }
287             }
288              
289             =head2 get_hash_cache_key
290              
291             (Private method)
292             Return a hash key that stores information about the entire visible part or the URL.
293              
294             =cut
295              
296             sub get_hash_cache_key {
297 0     0 1   my $self = shift;
298 0           my $url = shift;
299 0           return $HASH_KEY_PREFIX.$url;
300             }
301              
302             =head2 get_cache_hash
303              
304             (Private Method)
305             Returns all urls which were last cached.
306             return true if the url was previously hashed.
307              
308             =cut
309              
310             sub get_cache_hash {
311 0     0 1   my ($self,$url,$is_cached_site) = @_;
312 0           my $cache = $self->{cache};
313 0           my $ret = {};
314 0           $$is_cached_site = 1;
315 0           my $hash_key = $self->get_hash_cache_key($url);
316 0 0         $cache->exists($hash_key) or do { $$is_cached_site = 0;return 0;};
  0            
  0            
317 0           foreach $hash_key (split($HASH_SEPARATOR, $cache->get($hash_key))) {
318 0           my $tmp = $cache->get($hash_key);
319 0           my $tmp2 = HTTP::Response->parse( $tmp );
320 0 0         if ($tmp2) {
321 0           $ret->{$hash_key} = $tmp2;
322             } else { #Backward compatibility to version 0.126
323 0           $ret->{$hash_key} = \$tmp;
324             }
325             }
326 0           return $ret;
327             }
328              
329             =head2 store_validity
330              
331             (Private method)
332             Store current time in the main hash key
333              
334             =cut
335              
336             sub store_validity {
337 0     0 1   my ($self,$url) = (@_);
338 0           my $cache = $self->{cache};
339 0           my $hash_key = $self->get_hash_cache_key($url);
340 0 0         $cache->set_validity($hash_key,time()) if ($cache->exists($hash_key));
341 0           return 1;
342            
343             }
344              
345             =head2 validity
346              
347             (private method)
348             Retreive date validity of per stores url
349              
350             =cut
351              
352             sub validity {
353 0     0 1   my ($self,$url) = (@_);
354 0           my $cache = $self->{cache};
355 0           my $hash_key = $self->get_hash_cache_key($url);
356 0 0         if ($cache->exists($hash_key)) {
357 0           return $cache->validity($hash_key);
358             }
359 0           return 0;
360             }
361              
362             =head2 store_cache_hash
363              
364             Store General information of a web address, including all frames and dates.
365              
366             =cut
367              
368             sub store_cache_hash {
369 0     0 1   my ($self,$url,$data,$added_data,$deleted_data) = (@_);
370 0           my $cache = $self->{cache};
371 0           my $hash_key = $self->get_hash_cache_key($url);
372 0           my $header = join($HASH_SEPARATOR,keys %$data);
373 0           $cache->set($hash_key,join($HASH_SEPARATOR,keys %$data));
374 0           while (my ($key,$value) = each %$added_data) {
375 0           $cache->set($key,$value->as_string);
376 0           $cache->set_validity($key,time());
377             }
378 0           while (my ($key2,$value2) = each %$deleted_data) {
379 0           $cache->purge($key2,$value2);
380             }
381 0           return 1;
382             }
383              
384             =head2 sync_cache
385              
386             (Private method)
387              
388             =cut
389              
390              
391             #sync_cache (Privatre method) takes newly retrieved data, and stores and compresses it with
392             # the cache data. That is, It returns as follows:
393             # might_be_changed - Urls that are included in the retrieved pages and are in the cache.
394             # Those pages are potentialy changed, and therefore should be examinated by HTML comparison.
395             # deleted_data - Pages which exist in the cache and not in the new set.
396             # added_data - Pages which exist only in the new version.
397             # In addition, the sub purges all deleted pages from cache and stores the added pages.
398             # Due to performance reasons, all the "might_be_changed" pages are not cached.
399             # This is left for the caller to do.
400             sub sync_cache {
401 0     0 1   my ($self,$url,$new_data_http) = @_;
402 0           my $cache = $self->{cache};
403 0           my $is_cached_site;
404 0           my $old_data = $self->get_cache_hash($url,\$is_cached_site);
405 0           my ($added_data,$deleted_data) = ({},{});
406 0           my @old_pages_to_compare;
407             my @new_pages_to_compare;
408 0           my @url_keys_for_comapre;
409 0           my $index_new = 0;my $index_old = 0;
  0            
410 0           my @new_keys = sort (keys %$new_data_http);
411 0 0         my @old_keys = ($old_data)?(sort(keys %$old_data)):();
412             # print "Scalars: ", scalar(@new_keys), "==",scalar(@old_keys),"\n";
413 0   0       while ($index_new < scalar(@new_keys) and $index_old < scalar(@old_keys)) {
414 0 0         if ($new_keys[$index_new] eq $old_keys[$index_old]) {
415 0 0         if ($new_data_http->{$new_keys[$index_new]}->code() != RC_NOT_MODIFIED) {
416 0           push @old_pages_to_compare, $old_data->{ $old_keys[$index_old]};
417 0           my $a_response = $new_data_http->{$new_keys[$index_new]};
418 0           push @new_pages_to_compare, $a_response;
419 0           push @url_keys_for_comapre,$new_keys[$index_new];
420             }
421 0           ++$index_old;++$index_new;next;
  0            
  0            
422             }
423 0 0         if ($new_keys[$index_new] lt $old_keys[$index_old]) {
424 0           my $a_response = $new_data_http->{$new_keys[$index_new]};
425 0           $added_data->{$new_keys[$index_new]} = $a_response;
426 0           ++$index_new;
427 0           next;
428             }
429 0           $deleted_data->{$old_keys[$index_old]} = $old_data->{$old_keys[$index_old]};
430 0           ++$index_old;next;
  0            
431             }
432 0           while ($index_new < scalar(@new_keys)) {
433 0           my $a_response = $new_data_http->{$new_keys[$index_new]};
434 0           $added_data->{$new_keys[$index_new]} = $a_response;
435 0           ++$index_new;
436             }
437 0           while ($index_old < scalar(@old_keys)) {
438 0           $deleted_data->{$old_keys[$index_old]} = $old_data->{$old_keys[$index_old]};
439 0           ++$index_old;
440             }
441             # print "Goota cache\n";
442 0 0         $self->store_cache_hash($url,$new_data_http,$added_data,$deleted_data) or die ("Cannot store $url in cache");
443 0           return (\@url_keys_for_comapre,\@old_pages_to_compare,\@new_pages_to_compare,$deleted_data,$added_data,$is_cached_site);
444             }
445              
446             =head2 get_url_data
447              
448             (Private method)
449              
450             =cut
451              
452             # get_url_data recurses over all pages which construct a given web page--including all type
453             # of included frames and dynamic pages--and retrieves them into a given hash reference
454             # $response.
455             sub get_url_data {
456 0     0 1   my $self = shift;
457 0           my $mechanize = shift;
458 0           my $url = shift;
459 0           my $responses = shift;
460 0           my $cache = $self->{cache};
461 0           my $r = HTTP::Request->new('GET',$url);
462             # Only allow "identity" for the time being
463 0           $r->header( 'Accept-Encoding', 'identity' );
464 0 0         if ($cache->exists($url)) {
465 0           my $validity = $cache->validity($url);
466 0 0         $r->header('If-Modified-Since'=>HTTP::Date::time2str($cache->validity($url))) if ($validity);
467             }
468 0           my $response = $mechanize->request( $r );
469            
470 0 0         if ($response->code() == 304) {
    0          
471 0           $response = HTTP::Response->parse($cache->get($url));
472 0           $mechanize->_update_page($r,$response);
473             } elsif(!($self->{status} = $response->is_success())) {
474 0           $self->{error} = $response->status_line;
475 0           return 0;
476             }
477 0           $responses->{$url} = $response;
478 0           my $frames = [];
479 0           my $output = $mechanize->find_all_links( tag_regex => qr/^([ia]?frame)$/i);
480 0 0         push @$frames,@$output if ($output);
481 0           $output = $mechanize->find_all_links( tag_regex => qr/meta/);
482 0 0         push @$frames,@$output if ($output);
483              
484 0           foreach my $link (@$frames) {
485 0 0         next unless ($link->url_abs =~ m%^http.*//%);
486 0 0         unless (exists $responses->{$link->url_abs()}) {
487 0 0         $self->get_url_data($mechanize,$link->url_abs(),$responses) or return 0;
488             }
489             }
490 0           return 1;
491             }
492              
493             =head2 success
494              
495             return true upon success of the last run execution.
496              
497             =cut
498              
499             sub success {
500 0     0 1   my $self = shift;
501 0           return $self->{status};
502             }
503              
504              
505             =head1 AUTHOR
506              
507             Yaron Kahanovitch, C<< >>
508              
509             =head1 BUGS
510              
511             Please report any bugs or feature requests to
512             C, or through the web interface at
513             L.
514             I will be notified, and then you'll automatically be notified of progress on
515             your bug as I make changes.
516              
517             =head1 SUPPORT
518              
519             You can find documentation for this module with the perldoc command. perldoc WWW::Monitor
520             You can also look for information at:
521              
522             =over 4
523              
524             =item * AnnoCPAN: Annotated CPAN documentation
525              
526             L
527              
528             =item * CPAN Ratings
529              
530             L
531              
532             =item * RT: CPAN's request tracker
533              
534             L
535              
536             =item * Search CPAN
537              
538             L
539              
540             =back
541              
542             =head1 ACKNOWLEDGMENTS
543              
544             =head1 COPYRIGHT & LICENSE
545              
546             Copyright 2007 Yaron Kahanovitch, all rights reserved.
547              
548             This program is free software; you can redistribute it and/or modify it
549             under the same terms as Perl itself.
550              
551             1; # End of WWW::Monitor::Task