| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
require 5; |
|
3
|
|
|
|
|
|
|
package XML::RSS::TimingBot; # Time-stamp: "2004-05-19 12:50:53 ADT" |
|
4
|
7
|
|
|
7
|
|
63099
|
use LWP::UserAgent::Determined (); |
|
|
7
|
|
|
|
|
577536
|
|
|
|
7
|
|
|
|
|
2333
|
|
|
5
|
|
|
|
|
|
|
@ISA = ('LWP::UserAgent::Determined'); |
|
6
|
|
|
|
|
|
|
|
|
7
|
7
|
|
|
7
|
|
86
|
use strict; |
|
|
7
|
|
|
|
|
13
|
|
|
|
7
|
|
|
|
|
229
|
|
|
8
|
7
|
|
|
7
|
|
42
|
use vars qw($VERSION); |
|
|
7
|
|
|
|
|
21
|
|
|
|
7
|
|
|
|
|
470
|
|
|
9
|
|
|
|
|
|
|
$VERSION = '2.03'; |
|
10
|
|
|
|
|
|
|
|
|
11
|
7
|
|
|
7
|
|
11294
|
use LWP::Debug (); |
|
|
7
|
|
|
|
|
5637
|
|
|
|
7
|
|
|
|
|
150
|
|
|
12
|
7
|
|
|
7
|
|
9004
|
use XML::RSS::Timing (); |
|
|
7
|
|
|
|
|
44709
|
|
|
|
7
|
|
|
|
|
1131
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
7
|
50
|
|
7
|
|
87206
|
BEGIN { *DEBUG = sub () {0} unless defined &DEBUG } |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
die "Where's _elem?!!?" unless __PACKAGE__->can('_elem'); |
|
17
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
|
18
|
|
|
|
|
|
|
# Some incidental accessors: |
|
19
|
9
|
|
|
9
|
1
|
138
|
sub minAge { shift->_elem( 'minAge' , @_) } |
|
20
|
9
|
|
|
9
|
1
|
953
|
sub maxAge { shift->_elem( 'maxAge' , @_) } |
|
21
|
0
|
|
|
0
|
0
|
0
|
sub min_age { shift->minAge(@_) } #alias |
|
22
|
0
|
|
|
0
|
0
|
0
|
sub max_age { shift->maxAge(@_) } #alias |
|
23
|
|
|
|
|
|
|
|
|
24
|
60
|
|
|
60
|
0
|
256
|
sub rss_semaphore_file { shift->_elem( 'rss_semaphore_file' , @_) } |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
#========================================================================== |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub feed_get_last_modified { |
|
29
|
|
|
|
|
|
|
# This is not an epochtime, it's a string that we probably got |
|
30
|
|
|
|
|
|
|
# from the remote server. |
|
31
|
|
|
|
|
|
|
# As says RFC2616 section 14.25 : << |
|
32
|
|
|
|
|
|
|
# Note: When handling an If-Modified-Since header field, some |
|
33
|
|
|
|
|
|
|
# servers will use an exact date comparison function, rather than a |
|
34
|
|
|
|
|
|
|
# less-than function, for deciding whether to send a 304 (Not |
|
35
|
|
|
|
|
|
|
# Modified) response. To get best results when sending an If- |
|
36
|
|
|
|
|
|
|
# Modified-Since header field for cache validation, clients are |
|
37
|
|
|
|
|
|
|
# advised to use the exact date string received in a previous Last- |
|
38
|
|
|
|
|
|
|
# Modified header field whenever possible. |
|
39
|
|
|
|
|
|
|
# >> (among many other wise things) |
|
40
|
|
|
|
|
|
|
# Example value: "Sun, 04 Apr 2004 11:58:04 GMT" |
|
41
|
|
|
|
|
|
|
|
|
42
|
46
|
|
|
46
|
1
|
289
|
my($self, $url) = @_; |
|
43
|
46
|
|
|
|
|
68
|
DEBUG and print "Getting lastmod value for $url ...\n"; |
|
44
|
46
|
|
|
|
|
120
|
return $self->get_datum($url, 'lastmodified'); |
|
45
|
|
|
|
|
|
|
} |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub feed_get_next_update { # this is an epochtime |
|
48
|
47
|
|
|
47
|
1
|
86
|
my($self, $url) = @_; |
|
49
|
47
|
|
|
|
|
65
|
DEBUG and print "Getting next-update value for $url ...\n"; |
|
50
|
47
|
|
|
|
|
152
|
return $self->get_datum($url, 'nextupdate'); |
|
51
|
|
|
|
|
|
|
} |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub feed_get_etag { # this is a string |
|
54
|
|
|
|
|
|
|
# See RFC 2616, "3.11 Entity Tags" |
|
55
|
16
|
|
|
16
|
1
|
483
|
my($self, $url) = @_; |
|
56
|
16
|
|
|
|
|
22
|
DEBUG and print "Getting etag value for $url ...\n"; |
|
57
|
16
|
|
|
|
|
50
|
return $self->get_datum($url, 'etag'); |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub feed_set_last_modified { |
|
61
|
12
|
|
|
12
|
1
|
49
|
my($self, $url, $last_modified_time) = @_; |
|
62
|
12
|
|
|
|
|
22
|
DEBUG and print "Setting lastmod for $url to $last_modified_time\n"; |
|
63
|
12
|
|
|
|
|
57
|
$self->set_datum($url, 'lastmodified', $last_modified_time); |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub feed_set_next_update { |
|
67
|
12
|
|
|
12
|
1
|
26
|
my($self, $url, $next_update_time) = @_; |
|
68
|
12
|
|
|
|
|
22
|
DEBUG and print "Setting next-update for $url to $next_update_time\n"; |
|
69
|
12
|
|
|
|
|
42
|
$self->set_datum($url, 'nextupdate', $next_update_time); |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub feed_set_etag { |
|
73
|
6
|
|
|
6
|
1
|
16
|
my($self, $url, $etag) = @_; |
|
74
|
6
|
|
|
|
|
12
|
DEBUG and print "Setting etag for $url to $etag\n"; |
|
75
|
6
|
|
|
|
|
19
|
$self->set_datum($url, 'etag', $etag); |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub new { |
|
81
|
30
|
|
|
30
|
1
|
13336
|
my $self = shift->SUPER::new(@_); |
|
82
|
30
|
|
|
|
|
58250
|
$self->_rssagent_init(); |
|
83
|
30
|
|
|
|
|
101
|
return $self; |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
sub _rssagent_init { |
|
87
|
30
|
|
|
30
|
|
69
|
my $self = shift; |
|
88
|
30
|
|
|
|
|
261
|
$self->agent("XmlRssTimingBot/$VERSION (" . $self->agent . ")" ); |
|
89
|
|
|
|
|
|
|
|
|
90
|
30
|
50
|
|
|
|
3622
|
$self->rss_semaphore_file(1) unless $^O =~ m/(?:Mac|MSWin)/; |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Whatever needs doing here |
|
93
|
30
|
|
|
|
|
255
|
return; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
#========================================================================== |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub commit { # save all our new data for these various feeds we've been seeing |
|
99
|
10
|
|
|
10
|
1
|
2564
|
my $self = shift; |
|
100
|
10
|
|
|
|
|
45
|
return $self->_stupid_commit; |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
sub datum_from_db { |
|
104
|
48
|
|
|
48
|
0
|
100
|
my($self, $url, $varname) = @_; |
|
105
|
48
|
|
|
|
|
158
|
return $self->_stupid_datum_from_db($url, $varname); |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
#========================================================================== |
|
109
|
|
|
|
|
|
|
|
|
110
|
12
|
|
|
12
|
0
|
18
|
sub rssagent_just_before_real_request { # Override if you like |
|
111
|
|
|
|
|
|
|
#my($self, $args_ref) = @_; |
|
112
|
|
|
|
|
|
|
} |
|
113
|
12
|
|
|
12
|
0
|
31
|
sub rssagent_just_after_real_request { # Override if you like |
|
114
|
|
|
|
|
|
|
#my($self, $response, $args_ref) = @_; |
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#========================================================================== |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub simple_request { |
|
120
|
28
|
|
|
28
|
1
|
60563
|
my($self, @args) = @_; |
|
121
|
28
|
|
|
|
|
133
|
LWP::Debug::trace('simple_request()'); |
|
122
|
|
|
|
|
|
|
|
|
123
|
28
|
|
100
|
|
|
561
|
LWP::Debug::debug("Trying simple_request with args: [" |
|
124
|
|
|
|
|
|
|
. join(',', map $_||"''", @args) . "]"); |
|
125
|
|
|
|
|
|
|
|
|
126
|
28
|
|
|
|
|
759
|
DEBUG and print( "Trying simple_request with args: [" |
|
127
|
|
|
|
|
|
|
. join(',', map $_||"''", @args) . "]"); |
|
128
|
|
|
|
|
|
|
|
|
129
|
28
|
|
|
|
|
50
|
my $resp; |
|
130
|
28
|
|
|
|
|
108
|
my $maybe_response = $self->_rssagent_maybe_null_response($args[0]); |
|
131
|
28
|
100
|
|
|
|
90
|
if( $maybe_response ) { |
|
132
|
16
|
|
|
|
|
55
|
LWP::Debug::debug("Returning cached response"); |
|
133
|
16
|
|
|
|
|
111
|
return $maybe_response; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
|
|
|
|
|
|
|
|
136
|
12
|
|
|
|
|
64
|
$self-> _rssagent_add_header_conditions(@args); |
|
137
|
12
|
|
|
|
|
54
|
$self-> rssagent_just_before_real_request(\@args); |
|
138
|
12
|
|
|
|
|
87
|
$resp = $self->SUPER::simple_request(@args); |
|
139
|
12
|
|
|
|
|
3105092
|
$self-> rssagent_just_after_real_request($resp, \@args); |
|
140
|
12
|
|
|
|
|
65
|
$self-> _rssagent_response_consider($resp); |
|
141
|
|
|
|
|
|
|
|
|
142
|
12
|
|
|
|
|
47
|
LWP::Debug::debug("Returning uncached response"); |
|
143
|
12
|
|
|
|
|
97
|
return $resp; |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
#========================================================================== |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub _rssagent_maybe_null_response { |
|
149
|
|
|
|
|
|
|
# Return a (virtual) response object if you want to block this request. |
|
150
|
|
|
|
|
|
|
# Otherwise, to allow this request to actually happen, return a false value. |
|
151
|
|
|
|
|
|
|
|
|
152
|
28
|
|
|
28
|
|
56
|
my($self, $req) = @_; |
|
153
|
28
|
|
|
|
|
82
|
LWP::Debug::trace('_rssagent_maybe_null_response()'); |
|
154
|
28
|
|
|
|
|
283
|
my $url = $req->uri; |
|
155
|
|
|
|
|
|
|
|
|
156
|
28
|
|
|
|
|
334
|
my $not_until = $self->feed_get_next_update($url); |
|
157
|
28
|
100
|
|
|
|
433
|
unless(defined $not_until) { |
|
158
|
11
|
|
|
|
|
240
|
LWP::Debug::debug( "No restrictions on when to get $url"); |
|
159
|
11
|
|
|
|
|
227
|
DEBUG > 1 and print "No restrictions on when to get $url\n"; |
|
160
|
11
|
|
|
|
|
33
|
return undef; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
|
|
163
|
17
|
|
|
|
|
64
|
my $now = $self->now; |
|
164
|
17
|
100
|
|
|
|
74
|
if($not_until >= $now) { |
|
165
|
16
|
|
|
|
|
82
|
LWP::Debug::debug( "It's now $now, but I shouldn't look at $url until $not_until."); |
|
166
|
16
|
|
|
|
|
143
|
DEBUG > 1 and print "It's now $now, but I shouldn't look at $url until $not_until.\n"; |
|
167
|
16
|
|
|
|
|
63
|
return $self->_rss_agent_null_response($req, $not_until); |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Else give the all-clear |
|
171
|
1
|
|
|
|
|
2
|
return undef; |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
#========================================================================== |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub _rssagent_response_consider { |
|
177
|
12
|
|
|
12
|
|
33
|
my($self, $response) = @_; |
|
178
|
|
|
|
|
|
|
# Possibly extract the RSS-timing content from this response. |
|
179
|
|
|
|
|
|
|
|
|
180
|
12
|
|
|
|
|
76
|
LWP::Debug::trace('_maybe_null_response()'); |
|
181
|
12
|
|
|
|
|
84
|
my $code = $response->code; |
|
182
|
12
|
100
|
|
|
|
174
|
unless($code eq '200') { # or $code eq '304' ? |
|
183
|
1
|
|
|
|
|
7
|
LWP::Debug::debug('Not trying to find RSS content in this $code response'); |
|
184
|
1
|
|
|
|
|
7
|
return; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
|
|
187
|
11
|
|
|
|
|
60
|
my $url = $self->_url_from_response( $response ); |
|
188
|
|
|
|
|
|
|
|
|
189
|
11
|
100
|
|
|
|
519
|
return unless $self->_looks_like_rss( $url, $response ); |
|
190
|
|
|
|
|
|
|
|
|
191
|
9
|
|
|
|
|
50
|
my $now = $self->now(); |
|
192
|
|
|
|
|
|
|
|
|
193
|
9
|
|
|
|
|
158
|
my $time_string_from_resp = $self->_time_string_from_resp($response, $now); |
|
194
|
9
|
50
|
|
|
|
70
|
$self->feed_set_last_modified($url, $time_string_from_resp) |
|
195
|
|
|
|
|
|
|
if defined $time_string_from_resp; |
|
196
|
|
|
|
|
|
|
|
|
197
|
9
|
|
|
|
|
154
|
my $etag = $self->_etag_from_resp($response); |
|
198
|
9
|
100
|
|
|
|
54
|
$self->feed_set_etag($url, $etag) if defined $etag; |
|
199
|
|
|
|
|
|
|
|
|
200
|
9
|
50
|
|
|
|
113
|
$self->_ponder_next_update($url, $now, $response) unless $code eq '304'; |
|
201
|
9
|
|
|
|
|
24
|
return; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
#========================================================================== |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub _looks_like_rss { |
|
207
|
11
|
|
|
11
|
|
25
|
my($self, $url, $response) = @_; |
|
208
|
11
|
|
|
|
|
45
|
LWP::Debug::trace('_ponder_next_update'); |
|
209
|
11
|
|
|
|
|
47
|
my $content; |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# Look for rss/rdf in the first 2000 bytes |
|
212
|
|
|
|
|
|
|
# TODO: support Atom here? Does anyone ever use sy:* stuff in Atom? |
|
213
|
|
|
|
|
|
|
|
|
214
|
11
|
|
|
|
|
68
|
my $c = $response->content_ref; |
|
215
|
11
|
50
|
33
|
|
|
270
|
unless( $c and $$c ) { |
|
216
|
0
|
|
|
|
|
0
|
LWP::Debug::debug("Content from $url is apparently null."); |
|
217
|
0
|
|
|
|
|
0
|
print "NULL!\n"; |
|
218
|
|
|
|
|
|
|
# so it's sure not RSS! |
|
219
|
0
|
|
|
|
|
0
|
return 0; |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
|
|
222
|
11
|
100
|
|
|
|
138
|
if ( $$c =~ m{^[^\x00]{0,2000}?(?:
|
|
223
|
9
|
|
|
|
|
82
|
LWP::Debug::debug("Content from $url looks like RSS/RDF."); |
|
224
|
9
|
|
|
|
|
127
|
return 1; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
2
|
|
|
|
|
19
|
LWP::Debug::debug("Content from $url doesn't look like RSS/RDF."); |
|
228
|
2
|
|
|
|
|
30
|
return 0; |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
#========================================================================== |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
sub _ponder_next_update { |
|
234
|
9
|
|
|
9
|
|
24
|
my($self, $url, $now, $response) = @_; |
|
235
|
9
|
|
|
|
|
77
|
LWP::Debug::trace('_ponder_next_update'); |
|
236
|
9
|
|
|
|
|
74
|
my $content = $response->content; |
|
237
|
9
|
|
|
|
|
188
|
$content =~ s///sg; # kill XML comments |
|
238
|
|
|
|
|
|
|
|
|
239
|
9
|
50
|
|
|
|
66
|
unless( $content =~ m{^[^\x00]{0,2000}?
|
|
240
|
|
|
|
|
|
|
# Make super-sure that the our apparent start-tag wasn't just in a comment! |
|
241
|
0
|
|
|
|
|
0
|
LWP::Debug::debug("Content from $url doesn't look like RSS/RDF."); |
|
242
|
0
|
|
|
|
|
0
|
return; |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
|
|
245
|
9
|
|
33
|
|
|
144
|
my $timing = $self->{"_rss_timing_obj"} || XML::RSS::Timing->new; |
|
246
|
9
|
|
|
|
|
2280
|
$timing->use_exceptions(0); |
|
247
|
9
|
50
|
|
|
|
77
|
$timing->minAge( $self->minAge ) if defined $self->minAge; |
|
248
|
9
|
50
|
|
|
|
120
|
$timing->maxAge( $self->maxAge ) if defined $self->maxAge; |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Note that we use our server-time, not the other server's time |
|
251
|
9
|
|
|
|
|
197
|
$timing->last_polled($now); |
|
252
|
|
|
|
|
|
|
|
|
253
|
9
|
|
|
|
|
149
|
$self->_scan_xml_timing(\$content, $timing); |
|
254
|
|
|
|
|
|
|
|
|
255
|
9
|
50
|
|
|
|
50
|
if( $timing->complaints ) { |
|
256
|
0
|
|
|
|
|
0
|
LWP::Debug::debug("Errors in this feed's timing fields:\n" |
|
257
|
|
|
|
|
|
|
. map("* $_\n", $timing->complaints) |
|
258
|
|
|
|
|
|
|
. "]... so ignoring it all.\n" |
|
259
|
|
|
|
|
|
|
); |
|
260
|
0
|
|
|
|
|
0
|
return; |
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Now actually learn... |
|
264
|
9
|
|
|
|
|
148
|
my $next_update = $timing->nextUpdate(); |
|
265
|
9
|
|
|
|
|
1015
|
LWP::Debug::debug("Remembering not to poll $url until $next_update"); |
|
266
|
9
|
|
|
|
|
110
|
$self->feed_set_next_update( $url, $next_update ); |
|
267
|
|
|
|
|
|
|
# Now, we /could/ also slip this into the response as a faked-out |
|
268
|
|
|
|
|
|
|
# "Expires" header value, except that 1) who the hell ever looks at |
|
269
|
|
|
|
|
|
|
# those, and 2) "Expires" is the expiration time expressed against |
|
270
|
|
|
|
|
|
|
# the REMOTE server's clock, whereas nextUpdate is expressed |
|
271
|
|
|
|
|
|
|
# against OUR clock. So mixing these up would screw up all kinds of |
|
272
|
|
|
|
|
|
|
# things in the unhappy event of clock skew combined with someone |
|
273
|
|
|
|
|
|
|
# actually looking at a fake-o Expires value. |
|
274
|
|
|
|
|
|
|
|
|
275
|
9
|
|
|
|
|
169
|
return; |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
#========================================================================== |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub _scan_xml_timing { |
|
281
|
93
|
|
|
93
|
|
42894
|
my($self, $contentref, $timingobj) = @_; |
|
282
|
93
|
100
|
66
|
|
|
504
|
return unless $contentref and $$contentref; |
|
283
|
92
|
|
|
|
|
96
|
DEBUG > 1 and print "# _scan_xml_timing << self <$self>; contentref <$contentref>; timingobj <$timingobj>\n"; |
|
284
|
92
|
|
|
|
|
322
|
$self->_scan_for_ttl( $contentref, $timingobj ); |
|
285
|
92
|
|
|
|
|
234
|
$self->_scan_for_skipDays( $contentref, $timingobj ); |
|
286
|
92
|
|
|
|
|
225
|
$self->_scan_for_skipHours( $contentref, $timingobj ); |
|
287
|
92
|
|
|
|
|
287
|
$self->_scan_for_updatePeriod( $contentref, $timingobj ); |
|
288
|
92
|
|
|
|
|
842
|
$self->_scan_for_updateFrequency( $contentref, $timingobj ); |
|
289
|
92
|
|
|
|
|
264
|
$self->_scan_for_updateBase( $contentref, $timingobj ); |
|
290
|
92
|
|
|
|
|
190
|
return; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
|
|
|
|
|
|
|
|
293
|
92
|
|
|
92
|
|
144
|
sub _scan_for_updateFrequency {my($s,$c,$t)=@_;$s->_scan_xml('updateFrequency', $c, $t) } |
|
|
92
|
|
|
|
|
198
|
|
|
294
|
92
|
|
|
92
|
|
326
|
sub _scan_for_updatePeriod {my($s,$c,$t)=@_;$s->_scan_xml('updatePeriod', $c, $t) } |
|
|
92
|
|
|
|
|
281
|
|
|
295
|
92
|
|
|
92
|
|
217
|
sub _scan_for_updateBase {my($s,$c,$t)=@_;$s->_scan_xml('updateBase', $c, $t) } |
|
|
92
|
|
|
|
|
201
|
|
|
296
|
92
|
|
|
92
|
|
124
|
sub _scan_for_ttl {my($s,$c,$t)=@_;$s->_scan_xml('ttl', $c, $t) } |
|
|
92
|
|
|
|
|
233
|
|
|
297
|
92
|
|
|
92
|
|
147
|
sub _scan_for_skipDays {my($s,$c,$t)=@_;$s->_scan_xml('skipDays' , $c, $t, 'day' ) } |
|
|
92
|
|
|
|
|
255
|
|
|
298
|
92
|
|
|
92
|
|
237
|
sub _scan_for_skipHours {my($s,$c,$t)=@_;$s->_scan_xml('skipHours', $c, $t, 'hour') } |
|
|
92
|
|
|
|
|
270
|
|
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
#========================================================================== |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
sub _etag_from_resp { |
|
303
|
9
|
|
|
9
|
|
26
|
my($self, $response) = @_; |
|
304
|
9
|
|
|
|
|
135
|
my $etag = $response->header('ETag'); |
|
305
|
9
|
50
|
66
|
|
|
610
|
return undef unless defined $etag |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
306
|
|
|
|
|
|
|
and length($etag) |
|
307
|
|
|
|
|
|
|
and length($etag) < 251 # A good sanity limit, I think |
|
308
|
|
|
|
|
|
|
and $etag !~ m/[\n\r]/ # Enforce this minimal sanity on content |
|
309
|
|
|
|
|
|
|
; |
|
310
|
6
|
|
|
|
|
14
|
DEBUG and print "Using etag $etag for resp-obj $response\'s etag\n"; |
|
311
|
6
|
|
|
|
|
18
|
return $etag; |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
sub _time_string_from_resp { |
|
315
|
9
|
|
|
9
|
|
21
|
my($self, $response, $now) = @_; |
|
316
|
9
|
|
|
|
|
127
|
require HTTP::Date; |
|
317
|
9
|
|
|
|
|
94
|
foreach my $time_string ( |
|
318
|
|
|
|
|
|
|
$response->header('Last-Modified'), |
|
319
|
|
|
|
|
|
|
$response->header('Date'), |
|
320
|
|
|
|
|
|
|
HTTP::Date::time2str( $now ), |
|
321
|
|
|
|
|
|
|
) { |
|
322
|
|
|
|
|
|
|
next unless # enforce minimal sanity on the value... |
|
323
|
9
|
50
|
33
|
|
|
1088
|
defined $time_string |
|
|
|
|
33
|
|
|
|
|
|
324
|
|
|
|
|
|
|
and $time_string =~ m/^[- \,\.\:a-zA-Z0-9]{4,40}$/s |
|
325
|
|
|
|
|
|
|
and $time_string =~ m/[0-9A-Za-z]/; |
|
326
|
9
|
|
|
|
|
18
|
DEBUG and print "Using time-string \"$time_string\" for resp-obj $response\'s lastmod\n"; |
|
327
|
9
|
|
|
|
|
28
|
return $time_string; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
0
|
|
|
|
|
0
|
return undef; |
|
330
|
|
|
|
|
|
|
} |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
#========================================================================== |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
sub _url_from_response { |
|
335
|
11
|
|
|
11
|
|
26
|
my($self, $response) = @_; |
|
336
|
|
|
|
|
|
|
|
|
337
|
11
|
|
|
|
|
26
|
my $this_res = $response; |
|
338
|
11
|
|
|
|
|
37
|
for(1 .. 30) { # get the original request's URL |
|
339
|
11
|
|
50
|
|
|
52
|
$this_res = ($this_res->previous || last); |
|
340
|
|
|
|
|
|
|
} |
|
341
|
11
|
|
|
|
|
162
|
return $this_res->request->uri; |
|
342
|
|
|
|
|
|
|
} |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
#========================================================================== |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
sub _rssagent_add_header_conditions { |
|
347
|
12
|
|
|
12
|
|
28
|
my $self = shift; |
|
348
|
12
|
|
|
|
|
52
|
$self->_rssagent_add_ifmod_header(@_); |
|
349
|
12
|
|
|
|
|
50
|
$self->_rssagent_add_ifnonematch_header(@_); |
|
350
|
12
|
|
|
|
|
18
|
return; |
|
351
|
|
|
|
|
|
|
} |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
sub _rssagent_add_ifmod_header { |
|
354
|
12
|
|
|
12
|
|
28
|
my($self, $req) = @_; |
|
355
|
12
|
|
|
|
|
43
|
LWP::Debug::trace('_rssagent_add_ifmod_header()'); |
|
356
|
12
|
|
|
|
|
73
|
my $url = $req->uri; |
|
357
|
|
|
|
|
|
|
|
|
358
|
12
|
|
|
|
|
119
|
my $lastmod = $self->feed_get_last_modified( $url ); |
|
359
|
|
|
|
|
|
|
|
|
360
|
12
|
100
|
66
|
|
|
133
|
if(defined $lastmod and length $lastmod) { |
|
361
|
1
|
|
|
|
|
4
|
LWP::Debug::debug("Setting If-Modified-Since on get-$url to $lastmod"); |
|
362
|
1
|
|
|
|
|
7
|
DEBUG and print "Setting If-Modified-Since on get-$url to $lastmod\n"; |
|
363
|
1
|
|
|
|
|
9
|
$req->header('If-Modified-Since' => $lastmod); |
|
364
|
|
|
|
|
|
|
} else { |
|
365
|
11
|
|
|
|
|
47
|
LWP::Debug::debug("I see no last-polled time for $url"); |
|
366
|
11
|
|
|
|
|
86
|
DEBUG and print "I see no last-polled time for $url\n"; |
|
367
|
|
|
|
|
|
|
} |
|
368
|
12
|
|
|
|
|
65
|
return; |
|
369
|
|
|
|
|
|
|
} |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub _rssagent_add_ifnonematch_header { |
|
372
|
12
|
|
|
12
|
|
23
|
my($self, $req) = @_; |
|
373
|
12
|
|
|
|
|
35
|
LWP::Debug::trace('_rssagent_add_ifnonematch_header()'); |
|
374
|
12
|
|
|
|
|
68
|
my $url = $req->uri; |
|
375
|
|
|
|
|
|
|
|
|
376
|
12
|
|
|
|
|
117
|
my $etag = $self->feed_get_etag( $url ); |
|
377
|
|
|
|
|
|
|
|
|
378
|
12
|
100
|
66
|
|
|
137
|
if(defined $etag and length $etag) { |
|
379
|
1
|
|
|
|
|
4
|
LWP::Debug::debug("Setting If-None-Match on get-$url to $etag"); |
|
380
|
1
|
|
|
|
|
7
|
DEBUG and print "Setting If-None-Match on get-$url to $etag\n"; |
|
381
|
1
|
|
|
|
|
3
|
$req->header('If-None-Match' => $etag); |
|
382
|
|
|
|
|
|
|
} else { |
|
383
|
11
|
|
|
|
|
50
|
LWP::Debug::debug("I see no etag for $url"); |
|
384
|
11
|
|
|
|
|
91
|
DEBUG and print "I see no etag for $url\n"; |
|
385
|
|
|
|
|
|
|
} |
|
386
|
12
|
|
|
|
|
55
|
return; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
#========================================================================== |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
sub _rss_agent_null_response { |
|
392
|
17
|
|
|
17
|
|
12631
|
my($self, $request, $not_until) = @_; |
|
393
|
17
|
|
|
|
|
820
|
require HTTP::Response; |
|
394
|
17
|
|
|
|
|
85
|
require HTTP::Date; |
|
395
|
17
|
|
|
|
|
76
|
require HTTP::Status; |
|
396
|
|
|
|
|
|
|
|
|
397
|
17
|
|
|
|
|
57
|
my $now_str = HTTP::Date::time2str( $self->now ); |
|
398
|
17
|
|
|
|
|
361
|
my $not_until_str = HTTP::Date::time2str( $not_until - 1); |
|
399
|
|
|
|
|
|
|
# The -1 is because "Expires" means the last moment when it's still |
|
400
|
|
|
|
|
|
|
# good, and not_until is the first moment when we can check. Q.E.D. |
|
401
|
|
|
|
|
|
|
|
|
402
|
17
|
|
|
|
|
324
|
my $response = HTTP::Response->new( |
|
403
|
|
|
|
|
|
|
HTTP::Status::RC_NOT_MODIFIED() => "Not Modified (" . __PACKAGE__ |
|
404
|
|
|
|
|
|
|
. " says it won't change until after $not_until_str)" |
|
405
|
|
|
|
|
|
|
); |
|
406
|
17
|
|
|
|
|
1082
|
my $h = $response->headers; |
|
407
|
17
|
|
|
|
|
159
|
$h->header( "Client-Date" => $now_str); |
|
408
|
17
|
|
|
|
|
11391
|
$h->header( "Date" => $now_str); |
|
409
|
17
|
|
|
|
|
964
|
$h->header( "Expires" => $not_until_str); |
|
410
|
17
|
|
|
|
|
846
|
$response->request($request); |
|
411
|
|
|
|
|
|
|
|
|
412
|
17
|
|
|
|
|
231
|
return $response; |
|
413
|
|
|
|
|
|
|
} |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
#========================================================================== |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
sub now { |
|
418
|
|
|
|
|
|
|
# This is here just so we can change what we mean by 'now', when we're |
|
419
|
|
|
|
|
|
|
# running tests. Trust me, it's handy. |
|
420
|
43
|
100
|
66
|
43
|
0
|
334
|
return $_[0]->{'_now_hack'} if ref $_[0] and defined $_[0]->{'_now_hack'}; |
|
421
|
42
|
|
|
|
|
171
|
return time(); |
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
########################################################################### |
|
425
|
|
|
|
|
|
|
# |
|
426
|
|
|
|
|
|
|
# get_datum and set_datum implement the caching that both speeds things up |
|
427
|
|
|
|
|
|
|
# and allows a layer of indirection so that changes don't happen until |
|
428
|
|
|
|
|
|
|
# we commit |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
sub get_datum { |
|
431
|
109
|
|
|
109
|
0
|
187
|
my($self, $url, $varname) = @_; |
|
432
|
109
|
|
|
|
|
604
|
$url =~ s{\s+}{}g; |
|
433
|
109
|
50
|
|
|
|
497
|
return unless length $url; |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# First look in our dirty cache |
|
436
|
109
|
|
100
|
|
|
566
|
my $for_db = ($self->{'rsstimingbot_for_db'} ||= {}); |
|
437
|
109
|
100
|
66
|
|
|
331
|
if( $for_db->{$url} and exists $for_db->{$url}{$varname} ) { |
|
438
|
15
|
|
|
|
|
133
|
DEBUG > 6 and print " Found $varname for $url in dirty cache\n"; |
|
439
|
15
|
|
|
|
|
68
|
return $for_db->{$url}{$varname}; |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# then look in our has-been-read-from-disk cache |
|
443
|
94
|
|
100
|
|
|
627
|
my $from_db = ($self->{'rsstimingbot_from_db'} ||= {}); |
|
444
|
94
|
100
|
100
|
|
|
623
|
if( $from_db->{$url} and exists $from_db->{$url}{$varname} ) { |
|
445
|
46
|
|
|
|
|
115
|
DEBUG > 6 and print " Found $varname for $url in clean cache\n"; |
|
446
|
46
|
|
|
|
|
260
|
return $from_db->{$url}{$varname}; |
|
447
|
|
|
|
|
|
|
} |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# and finally, as a last resort, actually fetch from the real DB |
|
450
|
|
|
|
|
|
|
return( |
|
451
|
48
|
|
|
|
|
1003
|
$from_db->{$url}{$varname} = $self->datum_from_db($url, $varname) |
|
452
|
|
|
|
|
|
|
); |
|
453
|
|
|
|
|
|
|
} |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
sub set_datum { |
|
456
|
30
|
|
|
30
|
0
|
79
|
my($self, $url, $varname, $value) = @_; |
|
457
|
30
|
|
|
|
|
96
|
$url =~ s{\s+}{}g; |
|
458
|
30
|
50
|
|
|
|
189
|
return unless length $url; |
|
459
|
30
|
|
|
|
|
235
|
$self->{'rsstimingbot_from_db' }{$url}{$varname} |
|
460
|
|
|
|
|
|
|
= $self->{'rsstimingbot_for_db'}{$url}{$varname} |
|
461
|
|
|
|
|
|
|
= $value; |
|
462
|
|
|
|
|
|
|
# And upon commit, we'll save all of 'rsstimingbot_for_db' to database |
|
463
|
|
|
|
|
|
|
} |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
########################################################################### |
|
466
|
|
|
|
|
|
|
# |
|
467
|
|
|
|
|
|
|
# Our lame default storage methods, in case you didn't override |
|
468
|
|
|
|
|
|
|
# commit and datum_from_db |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
sub _stupid_datum_from_db { |
|
471
|
48
|
|
|
48
|
|
321
|
my($self, $url, $varname) = @_; |
|
472
|
48
|
|
|
|
|
225
|
my $dbfile = $self->_stupid___url2dbfile($url); |
|
473
|
48
|
|
|
|
|
221
|
DEBUG > 1 and print " DB: Getting datum $varname for $url ...\n"; |
|
474
|
|
|
|
|
|
|
|
|
475
|
48
|
100
|
|
|
|
2382
|
return undef unless -e $dbfile; |
|
476
|
|
|
|
|
|
|
|
|
477
|
23
|
|
|
|
|
78
|
my $unlocker = $self->_stupid_lock(); |
|
478
|
|
|
|
|
|
|
|
|
479
|
23
|
50
|
|
|
|
762
|
open( STUPID_DB, $dbfile) |
|
480
|
|
|
|
|
|
|
or die "Can't read-open $dbfile for $url : $!\nAborting"; |
|
481
|
23
|
|
|
|
|
51
|
my @f; |
|
482
|
23
|
|
|
|
|
120
|
local $/ = "\n"; |
|
483
|
23
|
|
50
|
|
|
83
|
my $from_db = ($self->{'rsstimingbot_from_db'} ||= {}); |
|
484
|
23
|
|
|
|
|
29
|
DEBUG > 8 and print " Reading DB file $dbfile...\n"; |
|
485
|
23
|
|
|
|
|
413
|
while() { |
|
486
|
95
|
|
|
|
|
134
|
chomp; |
|
487
|
95
|
|
|
|
|
471
|
@f = split ' ', $_, 3; |
|
488
|
|
|
|
|
|
|
# Yup, just three space-separated fields: "url varname value" |
|
489
|
95
|
50
|
|
|
|
241
|
if( @f >= 2 ) { |
|
490
|
95
|
|
|
|
|
117
|
DEBUG > 9 and print " Datum read {$f[0]} {$f[1]} {$f[2]}\n"; |
|
491
|
95
|
50
|
|
|
|
786
|
$from_db->{ $f[0] }->{ $f[1] } = |
|
492
|
|
|
|
|
|
|
defined($f[2]) ? $f[2] : ""; # because of split's behavior |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
} |
|
495
|
23
|
|
|
|
|
313
|
close(STUPID_DB); |
|
496
|
23
|
50
|
|
|
|
87
|
$unlocker and $unlocker->(); |
|
497
|
23
|
|
|
|
|
29
|
DEBUG > 8 and print " Done reading DB file $dbfile\n"; |
|
498
|
|
|
|
|
|
|
|
|
499
|
23
|
100
|
100
|
|
|
252
|
return $from_db->{$url}{$varname} |
|
500
|
|
|
|
|
|
|
if $from_db->{$url} and exists $from_db->{$url}{$varname}; |
|
501
|
9
|
|
|
|
|
178
|
return undef; |
|
502
|
|
|
|
|
|
|
} |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
# See XML::RSS::TimingBotDBI for a better example of a commit method |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
sub _stupid_commit { # write all our dirty cache to DB files |
|
507
|
10
|
|
|
10
|
|
18
|
my $self = shift; |
|
508
|
10
|
|
50
|
|
|
45
|
my $for_db = $self->{'rsstimingbot_for_db'} || return; |
|
509
|
10
|
|
|
|
|
13
|
DEBUG > 1 and print " I see ", scalar(keys %$for_db), " url-data in $self to save\n"; |
|
510
|
10
|
100
|
|
|
|
45
|
return unless %$for_db; |
|
511
|
|
|
|
|
|
|
|
|
512
|
7
|
|
|
|
|
13
|
my %path2mods; |
|
513
|
7
|
|
|
|
|
40
|
foreach my $url (sort keys %$for_db) { |
|
514
|
7
|
|
|
|
|
29
|
my $dbfile = $self->_stupid___url2dbfile($url); |
|
515
|
7
|
|
|
|
|
17
|
foreach my $varname (sort keys %{ $for_db->{$url} }) { |
|
|
7
|
|
|
|
|
44
|
|
|
516
|
13
|
|
|
|
|
34
|
my $value = $for_db->{$url}{$varname}; |
|
517
|
13
|
50
|
|
|
|
156
|
$path2mods{$dbfile}{ "$url $varname" } = defined($value) ? $value : ''; |
|
518
|
|
|
|
|
|
|
} |
|
519
|
|
|
|
|
|
|
} |
|
520
|
7
|
|
|
|
|
13
|
DEBUG > 7 and print " Committing to ", scalar(keys %path2mods), |
|
521
|
|
|
|
|
|
|
" database files...\n"; |
|
522
|
|
|
|
|
|
|
|
|
523
|
7
|
|
|
|
|
24
|
my $unlocker = $self->_stupid_lock(); |
|
524
|
|
|
|
|
|
|
|
|
525
|
7
|
|
|
|
|
66
|
foreach my $dbfile (keys %path2mods) { |
|
526
|
7
|
|
|
|
|
35
|
$self->_stupid___mod_db( $dbfile, $path2mods{$dbfile} ); |
|
527
|
7
|
|
|
|
|
39
|
$path2mods{$dbfile} = undef; # potentially free up some memory |
|
528
|
|
|
|
|
|
|
} |
|
529
|
7
|
|
|
|
|
26
|
DEBUG > 7 and print " Done committing all database files\n"; |
|
530
|
|
|
|
|
|
|
|
|
531
|
7
|
50
|
|
|
|
33
|
$unlocker and $unlocker->(); |
|
532
|
7
|
|
|
|
|
25
|
%$for_db = (); |
|
533
|
7
|
|
|
|
|
56
|
return; |
|
534
|
|
|
|
|
|
|
} |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
# |
|
537
|
|
|
|
|
|
|
# And now some very internal stuff: |
|
538
|
|
|
|
|
|
|
# |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
sub _stupid_lock { |
|
541
|
30
|
|
|
30
|
|
41
|
my $self = shift; |
|
542
|
30
|
|
|
|
|
82
|
my $file = $self->rss_semaphore_file; |
|
543
|
30
|
50
|
|
|
|
339
|
return unless $file; |
|
544
|
|
|
|
|
|
|
|
|
545
|
30
|
50
|
|
|
|
164
|
$file = $self->_stupid___url2dbfile("http://lock.nowhere.int/lock") |
|
546
|
|
|
|
|
|
|
if $file eq '1'; |
|
547
|
|
|
|
|
|
|
|
|
548
|
30
|
|
|
|
|
45
|
DEBUG > 2 and |
|
549
|
|
|
|
|
|
|
print "About to request (maybe wait for!) an exclusive lock on $file\n"; |
|
550
|
|
|
|
|
|
|
|
|
551
|
30
|
|
|
|
|
105
|
return $self->_getsem($file); |
|
552
|
|
|
|
|
|
|
} |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
sub _stupid___my_db_path { |
|
556
|
85
|
|
|
85
|
|
131
|
my $self = shift; |
|
557
|
|
|
|
|
|
|
return $self->{'_dbpath'} |
|
558
|
|
|
|
|
|
|
|| $ENV{'TIMINGBOTPATH'} |
|
559
|
|
|
|
|
|
|
|| $ENV{'APPDATA'} |
|
560
|
|
|
|
|
|
|
|| $ENV{'HOME'} |
|
561
|
85
|
|
0
|
|
|
7114
|
|| do { require File::Spec; File::Spec->curdir; } |
|
562
|
|
|
|
|
|
|
} |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
sub _stupid___url2dbfile { |
|
565
|
85
|
|
|
85
|
|
136
|
my($self, $url) = @_; |
|
566
|
85
|
|
|
|
|
540
|
require File::Spec; |
|
567
|
85
|
|
|
|
|
135
|
DEBUG > 2 and print " Pondering filespec for url $url\n"; |
|
568
|
85
|
|
|
|
|
289
|
my $url_stem = lc $url; |
|
569
|
85
|
|
|
|
|
695
|
$url_stem =~ s{^http://(?:www\.)?}{}s; |
|
570
|
85
|
|
|
|
|
345
|
$url_stem =~ s{\.(?:xml|rss|rdf)$}{}s; |
|
571
|
85
|
|
|
|
|
541
|
$url_stem =~ s/\W+//sg; # includes killing all whitespace |
|
572
|
85
|
50
|
|
|
|
229
|
$url_stem = 'misc' unless length $url_stem; |
|
573
|
85
|
100
|
|
|
|
309
|
$url_stem = substr($url_stem,0,30) if length($url) > 30; # truncate |
|
574
|
|
|
|
|
|
|
|
|
575
|
85
|
|
|
|
|
439
|
my $rssdir = File::Spec->catfile( $self->_stupid___my_db_path, 'rssdata'); |
|
576
|
85
|
50
|
|
|
|
3104
|
if( -e $rssdir ) { |
|
577
|
85
|
|
|
|
|
291
|
DEBUG > 12 and print " RSSdir $rssdir exists already.\n"; |
|
578
|
|
|
|
|
|
|
} else { |
|
579
|
0
|
0
|
|
|
|
0
|
if( mkdir($rssdir, 0777) ) { |
|
580
|
0
|
|
|
|
|
0
|
DEBUG > 1 and print " Successfully created RSSdir $rssdir\n"; |
|
581
|
|
|
|
|
|
|
} else { |
|
582
|
0
|
|
|
|
|
0
|
die "Can't mkdir $rssdir: $!\nAborting"; |
|
583
|
|
|
|
|
|
|
} |
|
584
|
|
|
|
|
|
|
} |
|
585
|
85
|
|
|
|
|
16229
|
my $path = File::Spec->catfile( $rssdir, $url_stem ); |
|
586
|
85
|
|
|
|
|
205
|
DEBUG > 6 and print "# Path to stupid DB: $path\n"; |
|
587
|
85
|
|
|
|
|
410
|
return $path; |
|
588
|
|
|
|
|
|
|
} |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
# . . . . . . . . . . . . . . . . . . . . . . . . . |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub _stupid___mod_db { |
|
593
|
7
|
|
|
7
|
|
17
|
my($self, $dbfile, $to_write) = @_; |
|
594
|
|
|
|
|
|
|
|
|
595
|
7
|
100
|
66
|
|
|
142
|
if( -e $dbfile and -s _ ) { |
|
596
|
4
|
|
|
|
|
7
|
DEBUG > 9 and print "Reading db $dbfile ...\n"; |
|
597
|
4
|
50
|
|
|
|
135
|
open( STUPID_DB, $dbfile) |
|
598
|
|
|
|
|
|
|
or die "Can't read-open $dbfile : $!\nAborting"; |
|
599
|
4
|
|
|
|
|
10
|
my @f; |
|
600
|
4
|
|
|
|
|
82
|
local $/ = "\n"; |
|
601
|
4
|
|
|
|
|
115
|
while() { |
|
602
|
10
|
|
|
|
|
19
|
chomp; |
|
603
|
10
|
|
|
|
|
54
|
@f = split ' ', $_, 3; # yup, just three space-separated fields |
|
604
|
10
|
50
|
66
|
|
|
163
|
$to_write->{ "$f[0] $f[1]" } = defined($f[2]) ? $f[2] : "" |
|
|
|
100
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
if @f >= 2 and ! exists $to_write->{ "$f[0] $f[1]" }; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
4
|
|
|
|
|
64
|
close(STUPID_DB); |
|
608
|
|
|
|
|
|
|
} else { |
|
609
|
3
|
|
|
|
|
5
|
DEBUG > 9 and print "No db $dbfile to read, so just writing new.\n"; |
|
610
|
|
|
|
|
|
|
} |
|
611
|
|
|
|
|
|
|
|
|
612
|
7
|
|
|
|
|
13
|
DEBUG > 8 and print " Saving DB file $dbfile (", scalar(keys %$to_write), " entries)\n"; |
|
613
|
|
|
|
|
|
|
|
|
614
|
7
|
50
|
|
|
|
626
|
open( STUPID_DB, ">$dbfile" ) or die "Can't write-open $dbfile: $!\nAborting"; |
|
615
|
7
|
|
|
|
|
17
|
my $value; |
|
616
|
7
|
|
|
|
|
48
|
foreach my $key (sort keys %$to_write) { |
|
617
|
20
|
50
|
|
|
|
66
|
next unless defined( $value = $to_write->{$key} ); |
|
618
|
20
|
|
|
|
|
43
|
$value =~ tr/\n\r//d; # Enforce sanity |
|
619
|
20
|
|
|
|
|
138
|
print STUPID_DB "$key $value\n"; |
|
620
|
|
|
|
|
|
|
} |
|
621
|
7
|
|
|
|
|
369
|
close(STUPID_DB); |
|
622
|
7
|
|
|
|
|
13
|
DEBUG > 8 and print " Done saving DB file $dbfile\n"; |
|
623
|
7
|
|
|
|
|
19
|
return; |
|
624
|
|
|
|
|
|
|
} |
|
625
|
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
############################################################################ |
|
627
|
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
sub _getsem { |
|
629
|
30
|
|
|
30
|
|
55
|
my($self, $file, $be_nonblocking) = @_; |
|
630
|
|
|
|
|
|
|
# Lock this semaphore file. Returns the unlocker sub!! |
|
631
|
|
|
|
|
|
|
# |
|
632
|
|
|
|
|
|
|
# To have the lock be non-blocking, specify a true second parameter. |
|
633
|
|
|
|
|
|
|
# In that case, returns false if can't get the lock. Unlocker |
|
634
|
|
|
|
|
|
|
# sub otherwise. |
|
635
|
|
|
|
|
|
|
|
|
636
|
30
|
50
|
33
|
|
|
345
|
unless(defined $file and length $file) { |
|
637
|
0
|
|
|
|
|
0
|
require Carp; |
|
638
|
0
|
|
|
|
|
0
|
Carp::confess("Filename argument to _getsem must be contentful!") |
|
639
|
|
|
|
|
|
|
} |
|
640
|
|
|
|
|
|
|
|
|
641
|
30
|
50
|
|
|
|
2062894
|
open(my $fh, ">$file") or Carp::croak("Can't write-open $file\: $!"); |
|
642
|
|
|
|
|
|
|
#chmod 0666, $file; # yes, make it world-writeable. or at least try. |
|
643
|
|
|
|
|
|
|
|
|
644
|
30
|
50
|
|
|
|
110
|
if($be_nonblocking) { # non-blocking! |
|
645
|
0
|
0
|
|
|
|
0
|
eval { flock($fh, 2 | 4) } # Exclusive + NONblocking |
|
|
0
|
|
|
|
|
0
|
|
|
646
|
|
|
|
|
|
|
or return; # couldn't get a lock. |
|
647
|
|
|
|
|
|
|
} else { # normal case: Exclusive, Blocking |
|
648
|
30
|
|
|
|
|
320
|
eval { flock($fh, 2) } # Exclusive |
|
649
|
30
|
50
|
|
|
|
66
|
or do { require Carp; Carp::confess("Can't exclusive-block lock $file: $!") }; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
650
|
|
|
|
|
|
|
# should never just fail -- should queue up forever |
|
651
|
|
|
|
|
|
|
} |
|
652
|
|
|
|
|
|
|
|
|
653
|
30
|
50
|
|
|
|
449
|
unless( print $fh "I am a lowly " , __PACKAGE__ , " semaphore file\cm\cj" ) { |
|
654
|
0
|
|
|
|
|
0
|
require Carp; |
|
655
|
0
|
|
|
|
|
0
|
Carp::confess("Can't write to $file\: $!"); |
|
656
|
|
|
|
|
|
|
} |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
return( |
|
659
|
|
|
|
|
|
|
sub { |
|
660
|
30
|
50
|
|
30
|
|
73
|
if($fh) { # So we can call multiple times |
|
661
|
30
|
|
|
|
|
33
|
DEBUG > 1 and print "Releasing lock on $file\n"; |
|
662
|
30
|
|
|
|
|
1693
|
close($fh); # Presumably will never fail! |
|
663
|
|
|
|
|
|
|
# Will release the lock. |
|
664
|
30
|
|
|
|
|
66
|
undef $fh; |
|
665
|
30
|
|
|
|
|
117
|
return 1; |
|
666
|
|
|
|
|
|
|
} else { |
|
667
|
0
|
|
|
|
|
0
|
return ''; |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
} |
|
670
|
30
|
|
|
|
|
355
|
); |
|
671
|
|
|
|
|
|
|
# Now, I /could/ just have this work by returning the globref -- |
|
672
|
|
|
|
|
|
|
# then when all the references to the glob go to 0, the FH |
|
673
|
|
|
|
|
|
|
# closes, and the lock is released. However, this /relies/ on |
|
674
|
|
|
|
|
|
|
# the timing of garbage collection. |
|
675
|
|
|
|
|
|
|
} |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
########################################################################### |
|
679
|
|
|
|
|
|
|
# |
|
680
|
|
|
|
|
|
|
# XML HELL BEGINS HERE |
|
681
|
|
|
|
|
|
|
# |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
sub _scan_xml { |
|
684
|
552
|
|
|
552
|
|
941
|
my($self, $tag, $c, $timing, $subtag) = @_; |
|
685
|
552
|
50
|
|
|
|
2216
|
die "Crazy tag \"$tag\"!!" |
|
686
|
|
|
|
|
|
|
unless $tag =~ m/^[a-zA-Z_][a-zA-Z_0-9]*$/s; # sanity |
|
687
|
552
|
50
|
33
|
|
|
3468
|
die "Contentref has to be a scalar ref" unless $c and ref($c) eq 'SCALAR'; |
|
688
|
552
|
50
|
33
|
|
|
3631
|
die "Timing object has to be an object!" unless $timing and ref($timing) |
|
|
|
|
33
|
|
|
|
|
|
689
|
|
|
|
|
|
|
and ref($timing) ne 'SCALAR'; |
|
690
|
|
|
|
|
|
|
|
|
691
|
552
|
|
|
|
|
752
|
my $method = $tag; |
|
692
|
|
|
|
|
|
|
|
|
693
|
552
|
|
|
|
|
513
|
DEBUG > 5 and print "# _scan_xml << self <$self>; timingobj <$timing>\n# tag <$tag>; subtag ", |
|
694
|
|
|
|
|
|
|
defined($subtag) ? "<$subtag>" : "(nil)", "\n", |
|
695
|
|
|
|
|
|
|
"# Content {\n$$c\n# }\n", |
|
696
|
|
|
|
|
|
|
; |
|
697
|
|
|
|
|
|
|
|
|
698
|
552
|
100
|
|
|
|
1042
|
unless(defined $subtag) { # common case: just someval |
|
699
|
368
|
100
|
|
|
|
29155
|
if( $$c =~ |
|
700
|
|
|
|
|
|
|
m{ |
|
701
|
|
|
|
|
|
|
< |
|
702
|
|
|
|
|
|
|
(?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace |
|
703
|
|
|
|
|
|
|
$tag\b |
|
704
|
|
|
|
|
|
|
.*? # optional attributes and whitespace and junk |
|
705
|
|
|
|
|
|
|
> |
|
706
|
|
|
|
|
|
|
\s* |
|
707
|
|
|
|
|
|
|
([^<>\s"]+) |
|
708
|
|
|
|
|
|
|
\s* |
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
(?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace |
|
711
|
|
|
|
|
|
|
$tag |
|
712
|
|
|
|
|
|
|
\s* # just the optional (and rare) whitespace |
|
713
|
|
|
|
|
|
|
> |
|
714
|
|
|
|
|
|
|
}sx |
|
715
|
|
|
|
|
|
|
) { |
|
716
|
94
|
|
|
|
|
196
|
my $it = $1; |
|
717
|
94
|
|
|
|
|
361
|
LWP::Debug::debug("Content has $method value: \"$it\""); |
|
718
|
94
|
|
|
|
|
386
|
DEBUG > 2 and print(" Content has $method value: \"$it\"!!\n"); |
|
719
|
94
|
|
|
|
|
494
|
$timing->$method( $it ); |
|
720
|
|
|
|
|
|
|
} else { |
|
721
|
274
|
|
|
|
|
1290
|
LWP::Debug::debug("Content has no $method value"); |
|
722
|
274
|
|
|
|
|
1086
|
DEBUG > 2 and print(" Content has no $method value\n"); |
|
723
|
|
|
|
|
|
|
} |
|
724
|
368
|
|
|
|
|
3609
|
return; |
|
725
|
|
|
|
|
|
|
} |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# Else it's a tag and subtaggy thing |
|
728
|
184
|
50
|
|
|
|
637
|
die "Crazy subtag \"$tag\"!!" |
|
729
|
|
|
|
|
|
|
unless $subtag =~ m/^[a-zA-Z_][a-zA-Z_0-9]*$/s; # sanity |
|
730
|
|
|
|
|
|
|
|
|
731
|
184
|
100
|
|
|
|
17183
|
if( $$c =~ |
|
732
|
|
|
|
|
|
|
m{ |
|
733
|
|
|
|
|
|
|
< |
|
734
|
|
|
|
|
|
|
(?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace |
|
735
|
|
|
|
|
|
|
$tag |
|
736
|
|
|
|
|
|
|
\b.*?> |
|
737
|
|
|
|
|
|
|
\s* |
|
738
|
|
|
|
|
|
|
( |
|
739
|
|
|
|
|
|
|
(?: |
|
740
|
|
|
|
|
|
|
< |
|
741
|
|
|
|
|
|
|
(?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace |
|
742
|
|
|
|
|
|
|
$subtag |
|
743
|
|
|
|
|
|
|
\b.*?> |
|
744
|
|
|
|
|
|
|
\s* |
|
745
|
|
|
|
|
|
|
[^<>\s"]+ |
|
746
|
|
|
|
|
|
|
\s* |
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
(?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace |
|
749
|
|
|
|
|
|
|
$subtag |
|
750
|
|
|
|
|
|
|
\s*> |
|
751
|
|
|
|
|
|
|
\s* |
|
752
|
|
|
|
|
|
|
)+ |
|
753
|
|
|
|
|
|
|
) |
|
754
|
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
(?: [a-zA-Z_][-_\.a-zA-Z0-9]* \: )? # optional namespace |
|
756
|
|
|
|
|
|
|
$tag |
|
757
|
|
|
|
|
|
|
\s*> |
|
758
|
|
|
|
|
|
|
}sx |
|
759
|
|
|
|
|
|
|
) { |
|
760
|
32
|
|
|
|
|
80
|
my $there = $1; |
|
761
|
32
|
|
|
|
|
38
|
DEBUG > 3 and print " $method+subtag valuecluster \"$there\"\n"; |
|
762
|
32
|
|
|
|
|
649
|
my(@them) = ( $there =~ |
|
763
|
|
|
|
|
|
|
# Our previous RE made sure that this is a very simply-structured |
|
764
|
|
|
|
|
|
|
# area, so we can get away with just this simple regexp: |
|
765
|
|
|
|
|
|
|
m{ |
|
766
|
|
|
|
|
|
|
> |
|
767
|
|
|
|
|
|
|
\s* |
|
768
|
|
|
|
|
|
|
([^<>\s"]+) |
|
769
|
|
|
|
|
|
|
\s* |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
}xsg |
|
772
|
|
|
|
|
|
|
); |
|
773
|
32
|
|
|
|
|
183
|
LWP::Debug::debug("Content $method+$subtag values: @them"); |
|
774
|
32
|
|
|
|
|
103
|
DEBUG > 2 and print(" Content $method+$subtag values: (", |
|
775
|
|
|
|
|
|
|
join(q<,>, map("\"$_\" ", @them)), ") !!\n"); |
|
776
|
32
|
|
|
|
|
190
|
$timing->$method( @them ); # yes, we call the method, not a submethod |
|
777
|
|
|
|
|
|
|
} else { |
|
778
|
152
|
|
|
|
|
619
|
LWP::Debug::debug("Content has no $method+$subtag values"); |
|
779
|
152
|
|
|
|
|
524
|
DEBUG > 2 and print(" Content has no $method+$subtag values\n"); |
|
780
|
|
|
|
|
|
|
} |
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
# Qvia sicvt exaltantvr Caeli a Terra, |
|
783
|
|
|
|
|
|
|
# sic exaltatae svnt viae Meae a viis vestris, |
|
784
|
|
|
|
|
|
|
# et cogitationes Meae a cogitationibvs vestris! |
|
785
|
|
|
|
|
|
|
# VGABVGA! |
|
786
|
|
|
|
|
|
|
|
|
787
|
184
|
|
|
|
|
1423
|
return; |
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
# EndHell. |
|
791
|
|
|
|
|
|
|
########################################################################### |
|
792
|
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
1; |
|
794
|
|
|
|
|
|
|
__END__ |