File Coverage

blib/lib/XML/RSS/TimingBot.pm
Criterion Covered Total %
statement 341 363 93.9
branch 81 122 66.3
condition 42 75 56.0
subroutine 52 54 96.3
pod 11 20 55.0
total 527 634 83.1


line stmt bran cond sub pod time code
1              
2             require 5;
3             package XML::RSS::TimingBot; # Time-stamp: "2004-05-19 12:50:53 ADT"
4 7     7   63099 use LWP::UserAgent::Determined ();
  7         577536  
  7         2333  
5             @ISA = ('LWP::UserAgent::Determined');
6              
7 7     7   86 use strict;
  7         13  
  7         229  
8 7     7   42 use vars qw($VERSION);
  7         21  
  7         470  
9             $VERSION = '2.03';
10              
11 7     7   11294 use LWP::Debug ();
  7         5637  
  7         150  
12 7     7   9004 use XML::RSS::Timing ();
  7         44709  
  7         1131  
13              
14 7 50   7   87206 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
15              
16             die "Where's _elem?!!?" unless __PACKAGE__->can('_elem');
17             #--------------------------------------------------------------------------
18             # Some incidental accessors:
19 9     9 1 138 sub minAge { shift->_elem( 'minAge' , @_) }
20 9     9 1 953 sub maxAge { shift->_elem( 'maxAge' , @_) }
21 0     0 0 0 sub min_age { shift->minAge(@_) } #alias
22 0     0 0 0 sub max_age { shift->maxAge(@_) } #alias
23              
24 60     60 0 256 sub rss_semaphore_file { shift->_elem( 'rss_semaphore_file' , @_) }
25              
26             #==========================================================================
27              
28             sub feed_get_last_modified {
29             # This is not an epochtime, it's a string that we probably got
30             # from the remote server.
31             # As says RFC2616 section 14.25 : <<
32             # Note: When handling an If-Modified-Since header field, some
33             # servers will use an exact date comparison function, rather than a
34             # less-than function, for deciding whether to send a 304 (Not
35             # Modified) response. To get best results when sending an If-
36             # Modified-Since header field for cache validation, clients are
37             # advised to use the exact date string received in a previous Last-
38             # Modified header field whenever possible.
39             # >> (among many other wise things)
40             # Example value: "Sun, 04 Apr 2004 11:58:04 GMT"
41              
42 46     46 1 289 my($self, $url) = @_;
43 46         68 DEBUG and print "Getting lastmod value for $url ...\n";
44 46         120 return $self->get_datum($url, 'lastmodified');
45             }
46              
47             sub feed_get_next_update { # this is an epochtime
48 47     47 1 86 my($self, $url) = @_;
49 47         65 DEBUG and print "Getting next-update value for $url ...\n";
50 47         152 return $self->get_datum($url, 'nextupdate');
51             }
52              
53             sub feed_get_etag { # this is a string
54             # See RFC 2616, "3.11 Entity Tags"
55 16     16 1 483 my($self, $url) = @_;
56 16         22 DEBUG and print "Getting etag value for $url ...\n";
57 16         50 return $self->get_datum($url, 'etag');
58             }
59              
60             sub feed_set_last_modified {
61 12     12 1 49 my($self, $url, $last_modified_time) = @_;
62 12         22 DEBUG and print "Setting lastmod for $url to $last_modified_time\n";
63 12         57 $self->set_datum($url, 'lastmodified', $last_modified_time);
64             }
65              
66             sub feed_set_next_update {
67 12     12 1 26 my($self, $url, $next_update_time) = @_;
68 12         22 DEBUG and print "Setting next-update for $url to $next_update_time\n";
69 12         42 $self->set_datum($url, 'nextupdate', $next_update_time);
70             }
71              
72             sub feed_set_etag {
73 6     6 1 16 my($self, $url, $etag) = @_;
74 6         12 DEBUG and print "Setting etag for $url to $etag\n";
75 6         19 $self->set_datum($url, 'etag', $etag);
76             }
77              
78             #--------------------------------------------------------------------------
79              
80             sub new {
81 30     30 1 13336 my $self = shift->SUPER::new(@_);
82 30         58250 $self->_rssagent_init();
83 30         101 return $self;
84             }
85              
86             sub _rssagent_init {
87 30     30   69 my $self = shift;
88 30         261 $self->agent("XmlRssTimingBot/$VERSION (" . $self->agent . ")" );
89            
90 30 50       3622 $self->rss_semaphore_file(1) unless $^O =~ m/(?:Mac|MSWin)/;
91              
92             # Whatever needs doing here
93 30         255 return;
94             }
95              
96             #==========================================================================
97              
98             sub commit { # save all our new data for these various feeds we've been seeing
99 10     10 1 2564 my $self = shift;
100 10         45 return $self->_stupid_commit;
101             }
102              
103             sub datum_from_db {
104 48     48 0 100 my($self, $url, $varname) = @_;
105 48         158 return $self->_stupid_datum_from_db($url, $varname);
106             }
107              
108             #==========================================================================
109              
110 12     12 0 18 sub rssagent_just_before_real_request { # Override if you like
111             #my($self, $args_ref) = @_;
112             }
113 12     12 0 31 sub rssagent_just_after_real_request { # Override if you like
114             #my($self, $response, $args_ref) = @_;
115             }
116              
117             #==========================================================================
118              
119             sub simple_request {
120 28     28 1 60563 my($self, @args) = @_;
121 28         133 LWP::Debug::trace('simple_request()');
122              
123 28   100     561 LWP::Debug::debug("Trying simple_request with args: ["
124             . join(',', map $_||"''", @args) . "]");
125              
126 28         759 DEBUG and print( "Trying simple_request with args: ["
127             . join(',', map $_||"''", @args) . "]");
128              
129 28         50 my $resp;
130 28         108 my $maybe_response = $self->_rssagent_maybe_null_response($args[0]);
131 28 100       90 if( $maybe_response ) {
132 16         55 LWP::Debug::debug("Returning cached response");
133 16         111 return $maybe_response;
134             }
135              
136 12         64 $self-> _rssagent_add_header_conditions(@args);
137 12         54 $self-> rssagent_just_before_real_request(\@args);
138 12         87 $resp = $self->SUPER::simple_request(@args);
139 12         3105092 $self-> rssagent_just_after_real_request($resp, \@args);
140 12         65 $self-> _rssagent_response_consider($resp);
141              
142 12         47 LWP::Debug::debug("Returning uncached response");
143 12         97 return $resp;
144             }
145              
146             #==========================================================================
147              
148             sub _rssagent_maybe_null_response {
149             # Return a (virtual) response object if you want to block this request.
150             # Otherwise, to allow this request to actually happen, return a false value.
151              
152 28     28   56 my($self, $req) = @_;
153 28         82 LWP::Debug::trace('_rssagent_maybe_null_response()');
154 28         283 my $url = $req->uri;
155              
156 28         334 my $not_until = $self->feed_get_next_update($url);
157 28 100       433 unless(defined $not_until) {
158 11         240 LWP::Debug::debug( "No restrictions on when to get $url");
159 11         227 DEBUG > 1 and print "No restrictions on when to get $url\n";
160 11         33 return undef;
161             }
162              
163 17         64 my $now = $self->now;
164 17 100       74 if($not_until >= $now) {
165 16         82 LWP::Debug::debug( "It's now $now, but I shouldn't look at $url until $not_until.");
166 16         143 DEBUG > 1 and print "It's now $now, but I shouldn't look at $url until $not_until.\n";
167 16         63 return $self->_rss_agent_null_response($req, $not_until);
168             }
169              
170             # Else give the all-clear
171 1         2 return undef;
172             }
173              
174             #==========================================================================
175              
176             sub _rssagent_response_consider {
177 12     12   33 my($self, $response) = @_;
178             # Possibly extract the RSS-timing content from this response.
179              
180 12         76 LWP::Debug::trace('_maybe_null_response()');
181 12         84 my $code = $response->code;
182 12 100       174 unless($code eq '200') { # or $code eq '304' ?
183 1         7 LWP::Debug::debug('Not trying to find RSS content in this $code response');
184 1         7 return;
185             }
186              
187 11         60 my $url = $self->_url_from_response( $response );
188              
189 11 100       519 return unless $self->_looks_like_rss( $url, $response );
190              
191 9         50 my $now = $self->now();
192              
193 9         158 my $time_string_from_resp = $self->_time_string_from_resp($response, $now);
194 9 50       70 $self->feed_set_last_modified($url, $time_string_from_resp)
195             if defined $time_string_from_resp;
196            
197 9         154 my $etag = $self->_etag_from_resp($response);
198 9 100       54 $self->feed_set_etag($url, $etag) if defined $etag;
199            
200 9 50       113 $self->_ponder_next_update($url, $now, $response) unless $code eq '304';
201 9         24 return;
202             }
203              
204             #==========================================================================
205              
206             sub _looks_like_rss {
207 11     11   25 my($self, $url, $response) = @_;
208 11         45 LWP::Debug::trace('_ponder_next_update');
209 11         47 my $content;
210              
211             # Look for rss/rdf in the first 2000 bytes
212             # TODO: support Atom here? Does anyone ever use sy:* stuff in Atom?
213              
214 11         68 my $c = $response->content_ref;
215 11 50 33     270 unless( $c and $$c ) {
216 0         0 LWP::Debug::debug("Content from $url is apparently null.");
217 0         0 print "NULL!\n";
218             # so it's sure not RSS!
219 0         0 return 0;
220             }
221              
222 11 100       138 if ( $$c =~ m{^[^\x00]{0,2000}?(?:
223 9         82 LWP::Debug::debug("Content from $url looks like RSS/RDF.");
224 9         127 return 1;
225             }
226            
227 2         19 LWP::Debug::debug("Content from $url doesn't look like RSS/RDF.");
228 2         30 return 0;
229             }
230              
231             #==========================================================================
232              
233             sub _ponder_next_update {
234 9     9   24 my($self, $url, $now, $response) = @_;
235 9         77 LWP::Debug::trace('_ponder_next_update');
236 9         74 my $content = $response->content;
237 9         188 $content =~ s///sg; # kill XML comments
238              
239 9 50       66 unless( $content =~ m{^[^\x00]{0,2000}?
240             # Make super-sure that the our apparent start-tag wasn't just in a comment!
241 0         0 LWP::Debug::debug("Content from $url doesn't look like RSS/RDF.");
242 0         0 return;
243             }
244            
245 9   33     144 my $timing = $self->{"_rss_timing_obj"} || XML::RSS::Timing->new;
246 9         2280 $timing->use_exceptions(0);
247 9 50       77 $timing->minAge( $self->minAge ) if defined $self->minAge;
248 9 50       120 $timing->maxAge( $self->maxAge ) if defined $self->maxAge;
249              
250             # Note that we use our server-time, not the other server's time
251 9         197 $timing->last_polled($now);
252            
253 9         149 $self->_scan_xml_timing(\$content, $timing);
254              
255 9 50       50 if( $timing->complaints ) {
256 0         0 LWP::Debug::debug("Errors in this feed's timing fields:\n"
257             . map("* $_\n", $timing->complaints)
258             . "]... so ignoring it all.\n"
259             );
260 0         0 return;
261             }
262              
263             # Now actually learn...
264 9         148 my $next_update = $timing->nextUpdate();
265 9         1015 LWP::Debug::debug("Remembering not to poll $url until $next_update");
266 9         110 $self->feed_set_next_update( $url, $next_update );
267             # Now, we /could/ also slip this into the response as a faked-out
268             # "Expires" header value, except that 1) who the hell ever looks at
269             # those, and 2) "Expires" is the expiration time expressed against
270             # the REMOTE server's clock, whereas nextUpdate is expressed
271             # against OUR clock. So mixing these up would screw up all kinds of
272             # things in the unhappy event of clock skew combined with someone
273             # actually looking at a fake-o Expires value.
274              
275 9         169 return;
276             }
277              
278             #==========================================================================
279              
280             sub _scan_xml_timing {
281 93     93   42894 my($self, $contentref, $timingobj) = @_;
282 93 100 66     504 return unless $contentref and $$contentref;
283 92         96 DEBUG > 1 and print "# _scan_xml_timing << self <$self>; contentref <$contentref>; timingobj <$timingobj>\n";
284 92         322 $self->_scan_for_ttl( $contentref, $timingobj );
285 92         234 $self->_scan_for_skipDays( $contentref, $timingobj );
286 92         225 $self->_scan_for_skipHours( $contentref, $timingobj );
287 92         287 $self->_scan_for_updatePeriod( $contentref, $timingobj );
288 92         842 $self->_scan_for_updateFrequency( $contentref, $timingobj );
289 92         264 $self->_scan_for_updateBase( $contentref, $timingobj );
290 92         190 return;
291             }
292              
293 92     92   144 sub _scan_for_updateFrequency {my($s,$c,$t)=@_;$s->_scan_xml('updateFrequency', $c, $t) }
  92         198  
294 92     92   326 sub _scan_for_updatePeriod {my($s,$c,$t)=@_;$s->_scan_xml('updatePeriod', $c, $t) }
  92         281  
295 92     92   217 sub _scan_for_updateBase {my($s,$c,$t)=@_;$s->_scan_xml('updateBase', $c, $t) }
  92         201  
296 92     92   124 sub _scan_for_ttl {my($s,$c,$t)=@_;$s->_scan_xml('ttl', $c, $t) }
  92         233  
297 92     92   147 sub _scan_for_skipDays {my($s,$c,$t)=@_;$s->_scan_xml('skipDays' , $c, $t, 'day' ) }
  92         255  
298 92     92   237 sub _scan_for_skipHours {my($s,$c,$t)=@_;$s->_scan_xml('skipHours', $c, $t, 'hour') }
  92         270  
299              
300             #==========================================================================
301              
302             sub _etag_from_resp {
303 9     9   26 my($self, $response) = @_;
304 9         135 my $etag = $response->header('ETag');
305 9 50 66     610 return undef unless defined $etag
      66        
      33        
306             and length($etag)
307             and length($etag) < 251 # A good sanity limit, I think
308             and $etag !~ m/[\n\r]/ # Enforce this minimal sanity on content
309             ;
310 6         14 DEBUG and print "Using etag $etag for resp-obj $response\'s etag\n";
311 6         18 return $etag;
312             }
313              
314             sub _time_string_from_resp {
315 9     9   21 my($self, $response, $now) = @_;
316 9         127 require HTTP::Date;
317 9         94 foreach my $time_string (
318             $response->header('Last-Modified'),
319             $response->header('Date'),
320             HTTP::Date::time2str( $now ),
321             ) {
322             next unless # enforce minimal sanity on the value...
323 9 50 33     1088 defined $time_string
      33        
324             and $time_string =~ m/^[- \,\.\:a-zA-Z0-9]{4,40}$/s
325             and $time_string =~ m/[0-9A-Za-z]/;
326 9         18 DEBUG and print "Using time-string \"$time_string\" for resp-obj $response\'s lastmod\n";
327 9         28 return $time_string;
328             }
329 0         0 return undef;
330             }
331              
332             #==========================================================================
333              
334             sub _url_from_response {
335 11     11   26 my($self, $response) = @_;
336              
337 11         26 my $this_res = $response;
338 11         37 for(1 .. 30) { # get the original request's URL
339 11   50     52 $this_res = ($this_res->previous || last);
340             }
341 11         162 return $this_res->request->uri;
342             }
343              
344             #==========================================================================
345              
346             sub _rssagent_add_header_conditions {
347 12     12   28 my $self = shift;
348 12         52 $self->_rssagent_add_ifmod_header(@_);
349 12         50 $self->_rssagent_add_ifnonematch_header(@_);
350 12         18 return;
351             }
352              
353             sub _rssagent_add_ifmod_header {
354 12     12   28 my($self, $req) = @_;
355 12         43 LWP::Debug::trace('_rssagent_add_ifmod_header()');
356 12         73 my $url = $req->uri;
357              
358 12         119 my $lastmod = $self->feed_get_last_modified( $url );
359              
360 12 100 66     133 if(defined $lastmod and length $lastmod) {
361 1         4 LWP::Debug::debug("Setting If-Modified-Since on get-$url to $lastmod");
362 1         7 DEBUG and print "Setting If-Modified-Since on get-$url to $lastmod\n";
363 1         9 $req->header('If-Modified-Since' => $lastmod);
364             } else {
365 11         47 LWP::Debug::debug("I see no last-polled time for $url");
366 11         86 DEBUG and print "I see no last-polled time for $url\n";
367             }
368 12         65 return;
369             }
370              
371             sub _rssagent_add_ifnonematch_header {
372 12     12   23 my($self, $req) = @_;
373 12         35 LWP::Debug::trace('_rssagent_add_ifnonematch_header()');
374 12         68 my $url = $req->uri;
375              
376 12         117 my $etag = $self->feed_get_etag( $url );
377              
378 12 100 66     137 if(defined $etag and length $etag) {
379 1         4 LWP::Debug::debug("Setting If-None-Match on get-$url to $etag");
380 1         7 DEBUG and print "Setting If-None-Match on get-$url to $etag\n";
381 1         3 $req->header('If-None-Match' => $etag);
382             } else {
383 11         50 LWP::Debug::debug("I see no etag for $url");
384 11         91 DEBUG and print "I see no etag for $url\n";
385             }
386 12         55 return;
387             }
388              
389             #==========================================================================
390              
391             sub _rss_agent_null_response {
392 17     17   12631 my($self, $request, $not_until) = @_;
393 17         820 require HTTP::Response;
394 17         85 require HTTP::Date;
395 17         76 require HTTP::Status;
396              
397 17         57 my $now_str = HTTP::Date::time2str( $self->now );
398 17         361 my $not_until_str = HTTP::Date::time2str( $not_until - 1);
399             # The -1 is because "Expires" means the last moment when it's still
400             # good, and not_until is the first moment when we can check. Q.E.D.
401            
402 17         324 my $response = HTTP::Response->new(
403             HTTP::Status::RC_NOT_MODIFIED() => "Not Modified (" . __PACKAGE__
404             . " says it won't change until after $not_until_str)"
405             );
406 17         1082 my $h = $response->headers;
407 17         159 $h->header( "Client-Date" => $now_str);
408 17         11391 $h->header( "Date" => $now_str);
409 17         964 $h->header( "Expires" => $not_until_str);
410 17         846 $response->request($request);
411              
412 17         231 return $response;
413             }
414              
415             #==========================================================================
416              
417             sub now {
418             # This is here just so we can change what we mean by 'now', when we're
419             # running tests. Trust me, it's handy.
420 43 100 66 43 0 334 return $_[0]->{'_now_hack'} if ref $_[0] and defined $_[0]->{'_now_hack'};
421 42         171 return time();
422             }
423              
424             ###########################################################################
425             #
426             # get_datum and set_datum implement the caching that both speeds things up
427             # and allows a layer of indirection so that changes don't happen until
428             # we commit
429              
430             sub get_datum {
431 109     109 0 187 my($self, $url, $varname) = @_;
432 109         604 $url =~ s{\s+}{}g;
433 109 50       497 return unless length $url;
434              
435             # First look in our dirty cache
436 109   100     566 my $for_db = ($self->{'rsstimingbot_for_db'} ||= {});
437 109 100 66     331 if( $for_db->{$url} and exists $for_db->{$url}{$varname} ) {
438 15         133 DEBUG > 6 and print " Found $varname for $url in dirty cache\n";
439 15         68 return $for_db->{$url}{$varname};
440             }
441              
442             # then look in our has-been-read-from-disk cache
443 94   100     627 my $from_db = ($self->{'rsstimingbot_from_db'} ||= {});
444 94 100 100     623 if( $from_db->{$url} and exists $from_db->{$url}{$varname} ) {
445 46         115 DEBUG > 6 and print " Found $varname for $url in clean cache\n";
446 46         260 return $from_db->{$url}{$varname};
447             }
448              
449             # and finally, as a last resort, actually fetch from the real DB
450             return(
451 48         1003 $from_db->{$url}{$varname} = $self->datum_from_db($url, $varname)
452             );
453             }
454              
455             sub set_datum {
456 30     30 0 79 my($self, $url, $varname, $value) = @_;
457 30         96 $url =~ s{\s+}{}g;
458 30 50       189 return unless length $url;
459 30         235 $self->{'rsstimingbot_from_db' }{$url}{$varname}
460             = $self->{'rsstimingbot_for_db'}{$url}{$varname}
461             = $value;
462             # And upon commit, we'll save all of 'rsstimingbot_for_db' to database
463             }
464              
465             ###########################################################################
466             #
467             # Our lame default storage methods, in case you didn't override
468             # commit and datum_from_db
469              
470             sub _stupid_datum_from_db {
471 48     48   321 my($self, $url, $varname) = @_;
472 48         225 my $dbfile = $self->_stupid___url2dbfile($url);
473 48         221 DEBUG > 1 and print " DB: Getting datum $varname for $url ...\n";
474              
475 48 100       2382 return undef unless -e $dbfile;
476              
477 23         78 my $unlocker = $self->_stupid_lock();
478              
479 23 50       762 open( STUPID_DB, $dbfile)
480             or die "Can't read-open $dbfile for $url : $!\nAborting";
481 23         51 my @f;
482 23         120 local $/ = "\n";
483 23   50     83 my $from_db = ($self->{'rsstimingbot_from_db'} ||= {});
484 23         29 DEBUG > 8 and print " Reading DB file $dbfile...\n";
485 23         413 while() {
486 95         134 chomp;
487 95         471 @f = split ' ', $_, 3;
488             # Yup, just three space-separated fields: "url varname value"
489 95 50       241 if( @f >= 2 ) {
490 95         117 DEBUG > 9 and print " Datum read {$f[0]} {$f[1]} {$f[2]}\n";
491 95 50       786 $from_db->{ $f[0] }->{ $f[1] } =
492             defined($f[2]) ? $f[2] : ""; # because of split's behavior
493             }
494             }
495 23         313 close(STUPID_DB);
496 23 50       87 $unlocker and $unlocker->();
497 23         29 DEBUG > 8 and print " Done reading DB file $dbfile\n";
498            
499 23 100 100     252 return $from_db->{$url}{$varname}
500             if $from_db->{$url} and exists $from_db->{$url}{$varname};
501 9         178 return undef;
502             }
503              
504             # See XML::RSS::TimingBotDBI for a better example of a commit method
505              
506             sub _stupid_commit { # write all our dirty cache to DB files
507 10     10   18 my $self = shift;
508 10   50     45 my $for_db = $self->{'rsstimingbot_for_db'} || return;
509 10         13 DEBUG > 1 and print " I see ", scalar(keys %$for_db), " url-data in $self to save\n";
510 10 100       45 return unless %$for_db;
511            
512 7         13 my %path2mods;
513 7         40 foreach my $url (sort keys %$for_db) {
514 7         29 my $dbfile = $self->_stupid___url2dbfile($url);
515 7         17 foreach my $varname (sort keys %{ $for_db->{$url} }) {
  7         44  
516 13         34 my $value = $for_db->{$url}{$varname};
517 13 50       156 $path2mods{$dbfile}{ "$url $varname" } = defined($value) ? $value : '';
518             }
519             }
520 7         13 DEBUG > 7 and print " Committing to ", scalar(keys %path2mods),
521             " database files...\n";
522              
523 7         24 my $unlocker = $self->_stupid_lock();
524              
525 7         66 foreach my $dbfile (keys %path2mods) {
526 7         35 $self->_stupid___mod_db( $dbfile, $path2mods{$dbfile} );
527 7         39 $path2mods{$dbfile} = undef; # potentially free up some memory
528             }
529 7         26 DEBUG > 7 and print " Done committing all database files\n";
530              
531 7 50       33 $unlocker and $unlocker->();
532 7         25 %$for_db = ();
533 7         56 return;
534             }
535              
536             #
537             # And now some very internal stuff:
538             #
539              
540             sub _stupid_lock {
541 30     30   41 my $self = shift;
542 30         82 my $file = $self->rss_semaphore_file;
543 30 50       339 return unless $file;
544              
545 30 50       164 $file = $self->_stupid___url2dbfile("http://lock.nowhere.int/lock")
546             if $file eq '1';
547              
548 30         45 DEBUG > 2 and
549             print "About to request (maybe wait for!) an exclusive lock on $file\n";
550              
551 30         105 return $self->_getsem($file);
552             }
553              
554              
555             sub _stupid___my_db_path {
556 85     85   131 my $self = shift;
557             return $self->{'_dbpath'}
558             || $ENV{'TIMINGBOTPATH'}
559             || $ENV{'APPDATA'}
560             || $ENV{'HOME'}
561 85   0     7114 || do { require File::Spec; File::Spec->curdir; }
562             }
563              
564             sub _stupid___url2dbfile {
565 85     85   136 my($self, $url) = @_;
566 85         540 require File::Spec;
567 85         135 DEBUG > 2 and print " Pondering filespec for url $url\n";
568 85         289 my $url_stem = lc $url;
569 85         695 $url_stem =~ s{^http://(?:www\.)?}{}s;
570 85         345 $url_stem =~ s{\.(?:xml|rss|rdf)$}{}s;
571 85         541 $url_stem =~ s/\W+//sg; # includes killing all whitespace
572 85 50       229 $url_stem = 'misc' unless length $url_stem;
573 85 100       309 $url_stem = substr($url_stem,0,30) if length($url) > 30; # truncate
574            
575 85         439 my $rssdir = File::Spec->catfile( $self->_stupid___my_db_path, 'rssdata');
576 85 50       3104 if( -e $rssdir ) {
577 85         291 DEBUG > 12 and print " RSSdir $rssdir exists already.\n";
578             } else {
579 0 0       0 if( mkdir($rssdir, 0777) ) {
580 0         0 DEBUG > 1 and print " Successfully created RSSdir $rssdir\n";
581             } else {
582 0         0 die "Can't mkdir $rssdir: $!\nAborting";
583             }
584             }
585 85         16229 my $path = File::Spec->catfile( $rssdir, $url_stem );
586 85         205 DEBUG > 6 and print "# Path to stupid DB: $path\n";
587 85         410 return $path;
588             }
589              
590             # . . . . . . . . . . . . . . . . . . . . . . . . .
591              
592             sub _stupid___mod_db {
593 7     7   17 my($self, $dbfile, $to_write) = @_;
594              
595 7 100 66     142 if( -e $dbfile and -s _ ) {
596 4         7 DEBUG > 9 and print "Reading db $dbfile ...\n";
597 4 50       135 open( STUPID_DB, $dbfile)
598             or die "Can't read-open $dbfile : $!\nAborting";
599 4         10 my @f;
600 4         82 local $/ = "\n";
601 4         115 while() {
602 10         19 chomp;
603 10         54 @f = split ' ', $_, 3; # yup, just three space-separated fields
604 10 50 66     163 $to_write->{ "$f[0] $f[1]" } = defined($f[2]) ? $f[2] : ""
    100          
605             if @f >= 2 and ! exists $to_write->{ "$f[0] $f[1]" };
606             }
607 4         64 close(STUPID_DB);
608             } else {
609 3         5 DEBUG > 9 and print "No db $dbfile to read, so just writing new.\n";
610             }
611              
612 7         13 DEBUG > 8 and print " Saving DB file $dbfile (", scalar(keys %$to_write), " entries)\n";
613              
614 7 50       626 open( STUPID_DB, ">$dbfile" ) or die "Can't write-open $dbfile: $!\nAborting";
615 7         17 my $value;
616 7         48 foreach my $key (sort keys %$to_write) {
617 20 50       66 next unless defined( $value = $to_write->{$key} );
618 20         43 $value =~ tr/\n\r//d; # Enforce sanity
619 20         138 print STUPID_DB "$key $value\n";
620             }
621 7         369 close(STUPID_DB);
622 7         13 DEBUG > 8 and print " Done saving DB file $dbfile\n";
623 7         19 return;
624             }
625              
626             ############################################################################
627              
628             sub _getsem {
629 30     30   55 my($self, $file, $be_nonblocking) = @_;
630             # Lock this semaphore file. Returns the unlocker sub!!
631             #
632             # To have the lock be non-blocking, specify a true second parameter.
633             # In that case, returns false if can't get the lock. Unlocker
634             # sub otherwise.
635            
636 30 50 33     345 unless(defined $file and length $file) {
637 0         0 require Carp;
638 0         0 Carp::confess("Filename argument to _getsem must be contentful!")
639             }
640            
641 30 50       2062894 open(my $fh, ">$file") or Carp::croak("Can't write-open $file\: $!");
642             #chmod 0666, $file; # yes, make it world-writeable. or at least try.
643            
644 30 50       110 if($be_nonblocking) { # non-blocking!
645 0 0       0 eval { flock($fh, 2 | 4) } # Exclusive + NONblocking
  0         0  
646             or return; # couldn't get a lock.
647             } else { # normal case: Exclusive, Blocking
648 30         320 eval { flock($fh, 2) } # Exclusive
649 30 50       66 or do { require Carp; Carp::confess("Can't exclusive-block lock $file: $!") };
  0         0  
  0         0  
650             # should never just fail -- should queue up forever
651             }
652            
653 30 50       449 unless( print $fh "I am a lowly " , __PACKAGE__ , " semaphore file\cm\cj" ) {
654 0         0 require Carp;
655 0         0 Carp::confess("Can't write to $file\: $!");
656             }
657            
658             return(
659             sub {
660 30 50   30   73 if($fh) { # So we can call multiple times
661 30         33 DEBUG > 1 and print "Releasing lock on $file\n";
662 30         1693 close($fh); # Presumably will never fail!
663             # Will release the lock.
664 30         66 undef $fh;
665 30         117 return 1;
666             } else {
667 0         0 return '';
668             }
669             }
670 30         355 );
671             # Now, I /could/ just have this work by returning the globref --
672             # then when all the references to the glob go to 0, the FH
673             # closes, and the lock is released. However, this /relies/ on
674             # the timing of garbage collection.
675             }
676              
677              
678             ###########################################################################
679             #
680             # XML HELL BEGINS HERE
681             #
682              
683             sub _scan_xml {
684 552     552   941 my($self, $tag, $c, $timing, $subtag) = @_;
685 552 50       2216 die "Crazy tag \"$tag\"!!"
686             unless $tag =~ m/^[a-zA-Z_][a-zA-Z_0-9]*$/s; # sanity
687 552 50 33     3468 die "Contentref has to be a scalar ref" unless $c and ref($c) eq 'SCALAR';
688 552 50 33     3631 die "Timing object has to be an object!" unless $timing and ref($timing)
      33        
689             and ref($timing) ne 'SCALAR';
690              
691 552         752 my $method = $tag;
692              
693 552         513 DEBUG > 5 and print "# _scan_xml << self <$self>; timingobj <$timing>\n# tag <$tag>; subtag ",
694             defined($subtag) ? "<$subtag>" : "(nil)", "\n",
695             "# Content {\n$$c\n# }\n",
696             ;
697              
698 552 100       1042 unless(defined $subtag) { # common case: just someval
699 368 100       29155 if( $$c =~
700             m{
701             <
702             (?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace
703             $tag\b
704             .*? # optional attributes and whitespace and junk
705             >
706             \s*
707             ([^<>\s"]+)
708             \s*
709            
710             (?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace
711             $tag
712             \s* # just the optional (and rare) whitespace
713             >
714             }sx
715             ) {
716 94         196 my $it = $1;
717 94         361 LWP::Debug::debug("Content has $method value: \"$it\"");
718 94         386 DEBUG > 2 and print(" Content has $method value: \"$it\"!!\n");
719 94         494 $timing->$method( $it );
720             } else {
721 274         1290 LWP::Debug::debug("Content has no $method value");
722 274         1086 DEBUG > 2 and print(" Content has no $method value\n");
723             }
724 368         3609 return;
725             }
726              
727             # Else it's a tag and subtaggy thing
728 184 50       637 die "Crazy subtag \"$tag\"!!"
729             unless $subtag =~ m/^[a-zA-Z_][a-zA-Z_0-9]*$/s; # sanity
730              
731 184 100       17183 if( $$c =~
732             m{
733             <
734             (?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace
735             $tag
736             \b.*?>
737             \s*
738             (
739             (?:
740             <
741             (?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace
742             $subtag
743             \b.*?>
744             \s*
745             [^<>\s"]+
746             \s*
747            
748             (?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace
749             $subtag
750             \s*>
751             \s*
752             )+
753             )
754            
755             (?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace
756             $tag
757             \s*>
758             }sx
759             ) {
760 32         80 my $there = $1;
761 32         38 DEBUG > 3 and print " $method+subtag valuecluster \"$there\"\n";
762 32         649 my(@them) = ( $there =~
763             # Our previous RE made sure that this is a very simply-structured
764             # area, so we can get away with just this simple regexp:
765             m{
766             >
767             \s*
768             ([^<>\s"]+)
769             \s*
770            
771             }xsg
772             );
773 32         183 LWP::Debug::debug("Content $method+$subtag values: @them");
774 32         103 DEBUG > 2 and print(" Content $method+$subtag values: (",
775             join(q<,>, map("\"$_\" ", @them)), ") !!\n");
776 32         190 $timing->$method( @them ); # yes, we call the method, not a submethod
777             } else {
778 152         619 LWP::Debug::debug("Content has no $method+$subtag values");
779 152         524 DEBUG > 2 and print(" Content has no $method+$subtag values\n");
780             }
781              
782             # Qvia sicvt exaltantvr Caeli a Terra,
783             # sic exaltatae svnt viae Meae a viis vestris,
784             # et cogitationes Meae a cogitationibvs vestris!
785             # VGABVGA!
786              
787 184         1423 return;
788             }
789              
790             # EndHell.
791             ###########################################################################
792            
793             1;
794             __END__