File Coverage

blib/lib/XML/RSS/SimpleGen.pm
Criterion Covered Total %
statement 375 577 64.9
branch 153 306 50.0
condition 79 225 35.1
subroutine 43 71 60.5
pod 3 36 8.3
total 653 1215 53.7


line stmt bran cond sub pod time code
1              
2             require 5;
3             package XML::RSS::SimpleGen;
4 8     8   74457 use strict;
  8         20  
  8         310  
5 8     8   45 use Carp ();
  8         15  
  8         366  
6             require Exporter;
7 8         2470 use vars qw(
8             @EXPORT %EXPORT_TAGS @ISA $VERSION
9             %Entities %WinLameEntities %CommonEnts $MIME_Type
10             $DTD_url $DTD_pubid $Nativize_newlines $DWIM @Hidies $RSS_obj
11             $Sleepy %IsBlockMarkup $MaybeIndent $MaybeNL %PeriodAsSeconds
12             $NAMESPACE_SY $CHUNK_MINUTES %BadPorts
13             @Retry_delays $UserAgentString
14 8     8   41 );
  8         18  
15              
16             $VERSION = '11.11';
17 8 50   8   84467 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG; } # set DEBUG level
18              
19             @ISA = qw(Exporter);
20              
21             #$DTD_url ||= 'http://my.netscape.com/publish/formats/rss-0.91.dtd';
22             #$DTD_pubid ||= '-//Netscape Communications//DTD RSS 0.91//EN';
23             $NAMESPACE_SY ||= 'http://purl.org/rss/1.0/modules/syndication/';
24             $CHUNK_MINUTES = 10;
25              
26             $Nativize_newlines = 1 unless defined $Nativize_newlines;
27             $DWIM = 1 unless defined $DWIM;
28             $Sleepy =
29             $ENV{'MAILTO'} ? 4 # under crontab
30             : ($ENV{'TERM'} || $ENV{'REQUEST_METHOD'} || $ENV{'COMSPEC'} ) ? 0
31             # almost definitely not under crontab
32             : 4 unless defined $Sleepy;
33             @Retry_delays = (4, 10, 20, 40);
34             $MaybeIndent = ' ';
35             $MaybeNL = "\n";
36             #$MaybeNL = $MaybeIndent = ''; # terser, more grep-worthy
37              
38             foreach my $p ( # ports we'll refuse to do HTTP on
39             qw<0 1 7 9 11 13 15 17 19 20 21 22 23 25 37 42 43 53 70 79 95 101 102 103
40             104 107 109 110 111 113 115 117 119 123 135 137 138 139 143 389 443 512
41             513 514 515 517 518 526 530 531 532 540 556 6667
42             >) { $BadPorts{$p} = 1 unless defined $BadPorts{$p}; }
43              
44             $UserAgentString ||= "XmlRssSimpleGen/$VERSION";
45              
46             #..........................................................................
47 8     8   26 sub _hide {push @Hidies, @_ };
48             _hide qw(init import);
49              
50             #..........................................................................
51              
52              
53             =head1 NAME
54              
55             XML::RSS::SimpleGen - for writing RSS files
56              
57             =head1 SYNOPSIS
58              
59             # A complete screen-scraper and RSS generator here:
60            
61             use strict;
62             use XML::RSS::SimpleGen;
63             my $url = q;
64            
65             rss_new( $url, "eXile", "Moscow-based Alternative Newspaper" );
66             rss_language( 'en' );
67             rss_webmaster( 'xxxxx@yourdomain.com' );
68             rss_twice_daily();
69            
70             get_url( $url );
71            
72             while(
73             m{

\s*(.*?)\s*

\s*(.*?)
82             exit;
83              
84             =head1 DESCRIPTION
85              
86             This module is for writing RSS files, simply. It transparently handles
87             all the unpleasant details of RSS, like proper XML escaping, and also has
88             a good number of Do-What-I-Mean features, like not changing the modtime
89             on a written-out RSS file if the file content hasn't changed, and like
90             automatically removing any HTML tags from content you might pass in.
91              
92             This module isn't meant to have the full expressive power of RSS;
93             instead, it provides functions that are most commonly needed by
94             RSS-writing programs.
95              
96             =head1 INTERFACE
97              
98             This module provides a bunch of functions for starting an RSS feed in
99             memory, putting items into it, and saving it to disk (or printing it as
100             a string, as in a CGI). If you prefer an object-oriented interface
101             (obviously more useful if you're composing several feeds at once), then
102             you can use this module as a class whose methods are the same as the
103             function names minus "rss_". Except for this detail of the naming, the
104             functions and methods are the same, behave the same, and take the same
105             arguments.
106              
107             That is, this functional code:
108              
109             use XML::RSS::SimpleGen;
110             my $url = q;
111            
112             rss_new( $url, "eXile" );
113             rss_language( 'en' );
114             get_url( $url );
115             ...
116              
117             does the same work as this OO code:
118              
119             use XML::RSS::SimpleGen ();
120             my $url = q;
121             my $rss = XML::RSS::SimpleGen->new( $url, "eXile");
122             $rss->language( 'en' );
123             $rss->get_url( $url );
124             ...
125              
126             (Note that the function C doesn't have a leading "rss_",
127             so its method name is the same as its function name. It's the
128             one exception.)
129              
130             If this talk of objects puzzles you, see
131             L in the C dist, and/or see
132             the chapter "User's View of Object-Oriented Modules"
133             in my book I (L).
134             (The book is also useful as an extended discussion of screen-scraping.)
135              
136             Note: in the code below, I use the word "accessor" a lot, to refer
137             to a function or method that you can call two possible ways:
138             1) like C)> to set the "foo" attribute to the value I,
139             or 2) like C to return the value of the "foo" attribute.
140              
141              
142             =head1 FUNCTIONS
143              
144             =over
145              
146             =item C );>
147              
148             =item C );>
149              
150             =item C );>
151              
152             =item I C<< $rss = XML::RSS::SimpleGen->new(...); >>
153              
154             This function creates a new RSS feed in memory. This should be the first
155             C> function you call in your program. If you call it
156             again, it erases the current object (if any) and sets up a new one according
157             to whatever parameters you pass.
158              
159             The parameters are the full URL, the title, and the description of the
160             site (or page) that you're providing an RSS feed of. The description is
161             optional, but you should provide at least a URL and title.
162              
163             Examples:
164              
165             rss_new( $url, "eXile", "Moscow-based Alternative Newspaper" );
166              
167             rss_new( 'http://www.mybazouki.com/news/', "Bazouki News!" );
168              
169             (As a method, XML::RSS::SimpleGen->new simply returns a new
170             RSS object.)
171              
172             =cut
173              
174             sub new {
175 20     20 1 1012 my $class = shift;
176 20   33     120 $class = ref($class) || $class;
177 20         112 my $new = bless { 'items' => [] }, $class;
178 20         72 $new->init();
179              
180 20 100       99 @_ and $new->url(shift);
181 20 100       71 @_ and $new->title(shift);
182 20 50       57 @_ and $new->description(shift);
183              
184 20         65 $new->item_limit( 0 );
185 20         157 $new->retention( 32 * 24 * 60 * 60 ); # 32 days
186              
187 20 50       383 (-e "rss.css") ? $new->css("./rss.css")
188             : $new->css("http://www.interglacial.com/rss/rss.css");
189              
190 20 50       264 (-e "about_rss.html") ? $new->docs("./about_rss.html")
191             : $new->docs("http://www.interglacial.com/rss/about.html");
192              
193 20         63 return $new;
194             }
195              
196 20     20 0 37 sub init { return; } # override in subclass as necessary
197              
198             #..........................................................................
199             __PACKAGE__->_accessorize(
200             qw(
201             title description url language css xsl webMaster docs
202             item_limit ttl
203             retention
204             allow_duplicates
205             image_title image_link image_url image_width image_height image_description
206             )
207             );
208              
209             =item the accessor C)>
210              
211             This declares what language this RSS feed is in. It must be
212             an RFC3066-style language tags like "en", or "en-US", or "zh-TW".
213             (See I for a list.)
214             If you don't set the feed's language, it defaults to "en", for generic English.
215              
216             If you call this function without a parameter, it returns
217             the current value of the RSS feed's language. For example:
218              
219             print "I'm making an RSS feed for ", rss_language(), "!\n";
220              
221             The same is true for all the functions that I label as "accessors".
222              
223              
224             =item the accessor C)>
225              
226             This sets the maximum number of items that this feed will show.
227              
228             The default value is 0, meaning that there is no maximum.
229              
230             If you set it to a positive number I, then the feed will show only
231             the first I items that you declare with C. (Or, if you set
232             C, then the newest I items that you declare
233             with C.)
234              
235             If you set it to a negative number I<-N>, then the feed will show only
236             the last I items that you declare with C. (Or, if you set
237             C, then the oldest I items you declare with C,
238             which is unlikely to be useful!)
239              
240              
241             =item the accessor C)>
242              
243             This declares what email address you, the RSS generator manager, can be
244             reached at. Example:
245              
246             rss_webMaster( 'sburke@bazouki-news.int' );
247              
248             =cut
249              
250             #..........................................................................
251              
252             =item C )>
253              
254             This declares that you want this RSS feed to keep track of what items are
255             new, and to list them first when the RSS is emitted. To do this, the RSS
256             generator has to store information in a file, where it tracks its "history",
257             i.e., when was the first time it saw given URLs, and the most recent time
258             it saw given URLs.
259              
260             Typical usage is:
261              
262             rss_history_file( 'thisrssfeed.dat' );
263              
264             You should call C I you make any calls to
265             C.
266              
267             The history-file feature is meant for cases where your RSS-generator
268             program calls C on I link it sees, but only wants the
269             I links to appear in the RSS output. (This can be a good approach
270             if you're making an RSS feed of a page like
271             C where there's some new links (to the
272             recently added stories), but also links to some days-old stories, and
273             I links to some always-there things like "Archive Search" and
274             "Contact Us" pages.
275              
276             Once you call rss_history_file, the specified file is read in. The
277             in-memory history (stored in the RSS object) is updated as you
278             call C. But the file isn't updated until you call rss_save.
279              
280             (A do-what-I-mean side effect of calling C is that it
281             sets rss_item_limit to 25 if it is currently 0.)
282              
283             (Incidentally, if you're using rss_history_file as part of a CGI that
284             emits RSS data, instead of a program that just saves to an RSS file,
285             then things will get complicated. You'll need to call an internal method
286             to explicitly commit the history file to disk, and you'll need a
287             semaphore file to avoid race conditions. Email me for full info.)
288              
289             =cut
290              
291             sub history_file {
292 12     12 0 20 my $self = shift;
293 12 100       64 return $self->{'history_file'} unless @_; # read accession
294 3         7 my $file = $_[0];
295            
296 3 50 33     54 unless(defined $file and length $file and $file =~ m/\S/) {
      33        
297 0         0 DEBUG and print "Killing $self 's memory-history.\n";
298             # I don't know if this'd be actually useful for anything tho.
299 0         0 delete $self->{'history_file'};
300 0         0 delete $self->{'_first_seen' };
301 0         0 delete $self->{'_last_seen' };
302 0         0 return undef;
303             }
304            
305 3         4 DEBUG and print "Considering $file as $self 's history.\n";
306 3         15 $self->{'history_file'} = $file;
307 3         19 $self->_read_history_file;
308            
309 3 50 33     20 $self->item_limit( 25 ) if $DWIM and !$self->item_limit();
310              
311 3         12 return $file;
312             }
313              
314             #..........................................................................
315              
316              
317             =item C );>
318              
319             =item C );>
320              
321             =item C );>
322              
323             This adds a new item to the current feed. You will need to specify the
324             URL to add (and it should be a valid-looking URL, starting with
325             "I", and not containing any spaces). You may also specify
326             the title, but it's optional. And finally, you can optionally specify a
327             description. (You can remember this because it starts with the essential
328             item first, and progresses toward the most optional.)
329              
330             Leading and tailing whitespace is removed from whichever of I
331             title,> and I are defined values, and HTML is parsed out.
332              
333             A simple usage:
334              
335             rss_item(
336             "http://www.harpers.org/MostRecentWR.html",
337             "Harper's Magazine's Weekly Review"
338             );
339              
340             Although in practice, a typical call won't have string constants, but
341             will instead be like the example in the L,
342             namely:
343              
344             rss_item("$url$1", $2, $3);
345              
346             Incidentally, as a do-what-I-mean feature, if the first parameter
347             doesn't look like a URL but one of the others does, then this error is
348             silently forgiven. This is so you can occasionally slip up and forget
349             the order of the parameters.
350              
351             (In the unlikely event where you I to avoid the HTML-removal
352             features, you can do this by passing scalar-references instead of
353             normal strings, like so: C.)
354              
355             =cut
356              
357             sub item { # Add an item: (url, title, description)
358 11     11 0 38 my($self, @params) = @_;
359 11 50 33     117 if( grep defined($_) && length($_), @params ) {
360 11         16 push @{$self->{'items'}},
  11         89  
361             [
362             $self->_process_item_params(@params) # DWIM things happen here
363             ]
364             ;
365             DEBUG and print "Adding item ",
366 11         24 join("|", @{ $self->{'items'}[-1] }), "\n";
367              
368             # Update history...
369 11 100       48 if( $self->{'_first_seen'} ) {
370 6         20 my $url = $self->{'items'}[-1][0];
371 6 100       19 my $now =
372             $self->{'_virgin_item_timestamp'}
373             ? --$self->{'_virgin_item_timestamp'}
374             : time()
375             ;
376 6   66     35 $self->{'_first_seen'}{$url} ||= $now;
377 6         19 $self->{'_last_seen' }{$url} = $now;
378             }
379              
380             } else {
381 0         0 DEBUG and print "Not adding item -- empty params\n";
382             }
383 11         42 return $self;
384             }
385              
386             #..........................................................................
387              
388             =item C
389              
390             This returns the number of items you've declared. I anticipate that its
391             main usage will be something like:
392              
393             die "What, no objects found at $url ?!"
394             unless rss_item_count();
395              
396             or, maybe...
397              
398             exit unless rss_item_count();
399              
400             ...depending on how/whether you'd want to react to cases where you don't
401             see anything to put into an RSS feed.
402              
403             Note that the parens are optional, since this command takes no options
404             (just like Perl's C function).
405              
406             =cut
407              
408 1     1 0 3 sub item_count { scalar @{ $_[0]->{'items'} }; }
  1         7  
409 0     0 0 0 sub items_count { shift->item_count } # harmless alias
410             #..........................................................................
411              
412             sub _sort_items_newest_first {
413 3     3   7 my $self = shift;
414 3         6 my $seen = $self->{'_first_seen'};
415              
416 3 50       11 $seen or Carp::confess(
417             "Can't sort items by age unless you define a history file!");
418              
419 3   50     11 my $items = $self->{'items'} || die "NO ITEMS!?";
420              
421 3         3 if( DEBUG ) {
422             print "\nSorting ", scalar(@$items), " items, newest first. Age dump:\n";
423             foreach my $i (@$items) {
424             if( $i->[0] ) {
425             printf " %s : %s (%s)\n",
426             $i->[0], $seen->{$i->[0]} || 0, scalar localtime(
427             $seen->{$i->[0]} || 0)
428             ;
429             } else {
430             print " NOURL\n";
431             }
432             }
433             print " (no items)\n" unless @$items;
434             print "(End history dump.)\n\n";
435             }
436              
437 4 50 50     51 @$items =
      50        
438 3         20 sort { ($seen->{$b->[0] ||''} ||0)
439             <=> ($seen->{$a->[0] ||''} ||0)
440             or $a->[0] cmp $b->[0] # So it's at least predictable
441             }
442             @$items;
443 3         6 return;
444             }
445              
446             #..........................................................................
447              
448             =item C );>
449              
450             This declares that you want to declare a particular image as the logo for
451             this feed. Most feeds don't have such a thing, and most readers just ignore
452             it anyway, but if you want to declare it, this function is how. The
453             three parameters, which are all required, are: the image's URL, its
454             height in pixels, and its width in pixels.
455             According to various specs, the width should/must be between 1 and 144,
456             an the height should/must be between 1 and 400.
457              
458             A typical usage:
459              
460             rss_image("http://interglacial.com/rss/weebl.gif", 106, 140);
461              
462             Be careful not to mix up the height and width.
463              
464             =cut
465              
466              
467             sub image {
468 0     0 0 0 my($self, $url, $h, $w) = splice(@_,0,4);
469 0 0 0     0 Carp::croak "Usage: \$rss->image(\$url, \$h, \$w)"
      0        
      0        
      0        
470             unless $url and $h and $w
471             and $h =~ m/^\d{1,3}$/s
472             and $w =~ m/^\d{1,3}$/s
473             ;
474 0         0 $self->image_url($url);
475 0         0 $self->image_height($h);
476 0         0 $self->image_width( $w);
477              
478             # width must (should?) be between 1 and 144
479             # height must (should?) be between 1 and 400
480              
481 0 0       0 @_ and $self->image_link( shift);
482 0 0       0 @_ and $self->image_title( shift);
483 0 0       0 @_ and $self->image_description( shift);
484 0         0 return;
485             }
486              
487             #..........................................................................
488              
489             =item C );>
490              
491             =item C );>
492              
493             This saves the RSS date to the file you specify. If the RSS data hasn't
494             changed, the file (and its modtime) aren't altered. The optional I
495             parameter means that if ever the file exists, and its content hasn't changed
496             for that many days or longer, then the program should die with a warning
497             message. For example, in the case of a screen-scraper for a site that we
498             know should (in theory) change its content at least weekly, we might save
499             the RSS file with:
500              
501             rss_save("whatever.rss", 17);
502             # Scream if the feed is unchanged for 17 days.
503              
504             The seventeen there is gotten by assuming that just maybe the site
505             might skip two weeks for a vacation now and then, and might even put
506             out the pre-vacation issue a few days early -- but that if ever the program
507             notices that the data hasn't changed for 17 days, then it should emit
508             error messages. If you want to disable this feature on a one-time basis,
509             just change the modtime (like via C) on the F file.
510              
511             If you don't specify a C value, then this whole
512             complain-if-it's-old feature is disabled.
513              
514             =cut
515              
516             sub save {
517 3     3 0 8 my($self, $filename, $maxage) = @_;
518 3 50 33     29 Carp::croak("I need a filename to save to")
519             unless defined $filename and length $filename;
520              
521 3 50       7 $self->history_file and $self->_sort_items_newest_first;
522 3         15 $self->_write_history_file;
523              
524 3         13 my $new_content = $self->as_string;
525              
526 3         17 local *RSS;
527             # See if we can skip writing to disk
528 3 50 66     96 if(-e $filename and -f _ and -r _ and -s _) {
      66        
      33        
529 2         5 DEBUG and print "Comparing to $filename...\n";
530 2 50       93 open RSS, $filename
531             or Carp::confess "Can't read-open $filename: $!"; # insanity
532 2         13 local $/;
533 2         82 my $content = ;
534 2         24 close(RSS);
535            
536 2 50       6 if( $content eq $new_content ) {
537 0         0 DEBUG and print " $filename\'s content is the same.\n";
538 0         0 return $self->_check_age($filename, $maxage);
539             }
540            
541 2         5 my $new_content_without_builddate = $new_content;
542 2         6 foreach my $c ($new_content_without_builddate, $content) {
543 4         39 $c =~ s{.*?}{};
544 4         12 $c =~ s{.*?}{};
545 4         25 $c =~ tr{\n\r}{}s; # And smash newlines while we're at it...
546             }
547 2 50       7 if( $content eq $new_content_without_builddate ) {
548 0         0 DEBUG and print " $filename\'s content is the same, basically.\n";
549 0         0 return $self->_check_age($filename, $maxage);
550             }
551 2         7 DEBUG and print " $filename\'s content is stale. Overwriting.\n";
552             }
553 3         4 DEBUG and print "Writing to $filename ...\n";
554            
555             # OK, we have to actually write it. Let's do it fast.
556 3 50       242 open(RSS, ">$filename")
557             or Carp::confess "Can't write-open $filename: $!\nAborting";
558 3 50       48 print RSS $new_content
559             or Carp::confess "Can't print to $filename: $!\nAborting";
560 3 50       120 close(RSS)
561             or Carp::confess "Can't close $filename: $!\nAborting";
562 3         5 eval { chmod 0644, $filename };
  3         86  
563 3         4 DEBUG and print "Done writing to $filename\n";
564 3         73 return 1;
565             }
566              
567             #..........................................................................
568              
569             =item C
570              
571             This returns the RSS-XML data as a string. This function is called
572             internally by the rss_save function; but you might want to call it
573             explicitly, as in a CGI, where your CGI would probably end like this:
574              
575             print "Content-type: application/xml\n\n", rss_as_string();
576             exit;
577              
578             =cut
579              
580              
581             sub as_string {
582 21     21 0 86 my $self = $_[0];
583            
584 21         54 my $namespaces = $self->_namespaces_as_string;
585            
586 21 50 33     173 return join '' =>
    50          
    100          
587             qq{\n},
588             ($DTD_pubid and $DTD_url)
589             ? qq{\n} : (),
590              
591             $self->css ? ( # http://www.w3.org/TR/xml-stylesheet/
592             qq{
593             $self->xmlesc($self->css),
594             qq{"?>\n}
595             ) : (),
596             $self->xsl ? (
597             qq{
598             $self->xmlesc($self->xsl),
599             qq{"?>\n}
600             ) : (),
601             qq{},
602             $self->_modversion_comment,
603             $self->_various_comments,
604             $self->_metadata_as_xml,
605             $self->_items_as_xml,
606             "\n\n",
607             ;
608             }
609              
610             sub _namespaces_as_string {
611 21     21   28 my $self = $_[0];
612 21         157 my $ns = $self->{'_namespaces'};
613 21 50 33     82 return '' unless $ns and keys %$ns;
614             return
615 0         0 join '', "\n",
616             map qq{ xmlns:$_="$$ns{$_}"\n},
617             sort keys %$ns
618             ;
619             }
620              
621             #..........................................................................
622              
623             sub _metadata_as_xml {
624 21     21   32 my $self = $_[0];
625 21 100       49 my($url, $title, $description) = (
626             map defined($_) ? $_ : '',
627             $self->url, $self->title, $self->description,
628             );
629            
630 21 50       57 if($DWIM) {
631             # tolerate switching the description/title and the URL
632 21 50       123 if($url !~ m{^[a-z]+[a-z0-9]*:\S+$}s) {
633 0         0 DEBUG and print "URL \"$url\" looks fishy...\n";
634 0 0       0 if( $title =~ m{^[a-z]+[a-z0-9]*:\S+$}s) {
    0          
635 0         0 DEBUG and print "Swapping with title \"$title\"\n";
636 0         0 ($url, $title) = ($title, $url);
637 0         0 $self->url($url);
638 0         0 $self->title($title);
639             } elsif( $description =~ m{^[a-z]+[a-z0-9]*:\S+$}s) {
640 0         0 DEBUG and print "Swapping with description \"$description\"\n";
641 0         0 ($url, $description) = ($description, $url);
642 0         0 $self->url($url);
643 0         0 $self->description($description);
644             } else {
645 0   0     0 Carp::croak( "Invalid URL \"$url\" in metadata "
646             . ($title||$description||"???") . "\nAborting"
647             );
648             }
649             }
650             }
651              
652 21         52 my $lang = $self->language;
653 21 100       54 $lang and $lang =~ tr[_][\-]; # tolerate underscores
654 21 50 66     153 $lang = 'en' unless defined $lang
      66        
655             # From I18N::LangTags...
656             and $lang !~ m/^[iIxI]$/s
657             and $lang =~
658             m/^(?: # First subtag
659             [xiIX] | [a-zA-Z]{2,3}
660             )
661             (?: # Subtags thereafter
662             - # separator
663             [A-Za-z0-9]{1,8} # subtag
664             )*
665             $/xs
666             ;
667              
668 21   50     255 return join '', sprintf( qq{
      50        
      50        
      66        
      66        
      33        
669             %s
670             %s
671             %s
672             %s
673             %s
674             },
675             map( $self->html2text($_),
676             $url || '!!!!DummyURL!!!!',
677             $title || $description || $url || '!!!!DummyTitle!!!!',
678             $description || $title || $url || '!!!!DummyDescription!!!!',
679             ),
680             $lang,
681             $self->_date_rfc822(),
682             ),
683             grep $_,
684            
685             $self->{"skipHours"},
686             $self->{"skipDays"},
687             $self->_sy_update_as_xml,
688             (($self->{"ttl"} || '') =~ m/^(\d+)$/s) && "$1\n",
689             $self->webMaster &&
690             ("" . $self->html2text($self->webMaster) . "\n"),
691            
692             $self->docs &&
693             ("" . $self->html2text($self->docs) . "\n"),
694              
695             $self->_image_as_xml,
696             $self->{'more_metadata'}, # a hack for sneaking in more things
697             ;
698              
699             # ...with sanity-checkers
700              
701             }
702             #..........................................................................
703              
704             sub _process_item_params {
705 11     11   21 my $self = shift;
706             my($url, $title, $description) =
707 11 50 33     48 map {; (defined($_) && m/\S/) ? $_ : '' } # contentfulness filter
  21         158  
708             map $self->html2text($_), # here's where we de-htmlify things
709             @_;
710             ;
711            
712 11         29 for($url, $title, $description) {
713 33 50       98 if(ref $_ ) { } # Okay
    100          
714 21         43 elsif(defined $_) { s/^\s+//s; s/\s+$//s }
  21         56  
715 12         39 else { $_ = '' }
716             }
717            
718             # Tolerate switching the description/title and the URL
719 11 50 33     100 if($DWIM and $url !~ m{^[a-z]+[a-z0-9]*:\S+$}s) {
720 0         0 DEBUG and print "URL \"$url\" looks fishy...\n";
721 0 0       0 if( $title =~ m{^[a-z]+[a-z0-9]*:\S+$}s) {
    0          
722 0         0 DEBUG and print "Swapping with title \"$title\"\n";
723 0         0 ($url, $title) = ($title, $url);
724             } elsif( $description =~ m{^[a-z]+[a-z0-9]*:\S+$}s) {
725 0         0 DEBUG and print "Swapping with description \"$description\"\n";
726 0         0 ($url, $description) = ($title, $description);
727             } else {
728 0   0     0 Carp::confess( "Invalid URL \"$url\" in item "
729             . ($title||$description||"???") . "\nAborting"
730             );
731             }
732             }
733 11         61 return ($url, $title, $description);
734             }
735              
736             #..........................................................................
737              
738             sub _items_as_xml {
739 21     21   34 my $self = $_[0];
740 21         26 my @items;
741 21         39 my @xml_out = ($MaybeNL);
742              
743 21 50       47 if( $self->allow_duplicates ) {
744 0 0       0 @items = @{ $self->{'items'} || die "NO ITEMS!?!?" };
  0         0  
745             } else {
746 21         37 my %seen;
747 21 50       23 foreach my $i (@{ $self->{'items'} || die "NO ITEMS!?!?" }) {
  21         82  
748 17 100       58 if( $seen{ $i->[0] } ++ ) {
749 1         5 DEBUG and print "Removing duplicate item @$i\n";
750             } else {
751 16         43 push @items, $i;
752             }
753             }
754             }
755              
756 21   100     93 my $item_limit = int( $self->item_limit || 0 );
757 21 100       47 if($item_limit) {
758 6 50       21 if(@items > abs($item_limit)) {
759 0 0       0 if($item_limit > 0) { # like 4 to mean just the first 4
760 0         0 DEBUG and print " Killing all but the first $item_limit items.\n";
761 0         0 splice @items, $item_limit;
762             } else { # like -4 to mean just the last 4
763 0         0 DEBUG and print " Killing all but the last ", 0-$item_limit, " items.\n";
764 0         0 splice @items, 0, @items + $item_limit;
765             # So if item_limit is -3 and @items has 10 elements,
766             # then that number will be 7, i.e., to chop the first 7
767             # elements, leaving the last 3.
768             }
769             }
770             }
771 21         23 DEBUG and print scalar(@items), " items at hand:\n";
772            
773 21         39 foreach my $i (@items) {
774 16         38 my($url, $title, $description) = @$i;
775 16         16 DEBUG > 1 and print "I: (u $url) (t $title) (d $description)\n";
776 16 0 33     48 next unless $url or $title or $description;
      33        
777 16 100       203 push @xml_out,
    50          
    50          
778             "$MaybeNL",
779             (map "$MaybeIndent$_$MaybeNL",
780             length($title) ? "$title" : (),
781             length($url) ? "$url" : (),
782             length($description) ? "$description" : (),
783             ),
784             "$MaybeNL\n",
785             }
786 21         23 DEBUG and print "_items_as_xml returning {\n", join('', @xml_out), "}\n\n";
787            
788 21         320 return join '', @xml_out;
789             }
790              
791             #..........................................................................
792              
793             sub _image_as_xml {
794 21     21   31 my $self = shift;
795 21 0 33     53 return '' unless
      33        
796             $self->image_url and $self->image_height and $self->image_width;
797              
798             #
799              
800 0   0     0 return join '',
      0        
      0        
801             "\n",
802             map(" $_\n" =>
803             $self->xmlelem( 'title' => $self->image_title || $self->html2text($self->title)),
804             $self->xmlelem( 'url' => $self->image_url), # url of the image
805             $self->xmlelem( 'link' => $self->image_link || $self->html2text($self->url )),
806             $self->xmlelem( 'width' => $self->image_width),
807             $self->xmlelem( 'height' => $self->image_height),
808             $self->xmlelem( 'description'
809             => $self->image_description
810             || $self->html2text($self->description)),
811             ),
812             "\n",
813             ;
814             }
815              
816             #==========================================================================
817              
818              
819             =item C );>
820              
821             =item C<$content = get_url( I );>
822              
823             =item I C<< $content = $rss->get_url(...); >>
824              
825             =item I C<< $content->get_url(...); >>
826              
827             This tries to get the content of the given url, and returns it.
828              
829             This is quite like L's C function, but with some
830             additional features:
831              
832             =over
833              
834             =item * If it can't
835             get the URL's content at first, it will sleep for a few seconds and try
836             again, up to about five times. (This is to avoid the case of the URL
837             being temporarily inaccessible simply because the DNS is a bit slow,
838             or because the server is too busy.)
839              
840             =item * If it can't get the content, even after several retries,
841             it will abort the program (like a C). If you want to override this
842             behavior, then call it as C
843              
844             =item * If you call the function in void context (i.e., not using its
845             return value), then the function assigns the URL's content to C<$_>.
846             That's so you can write nice concise code like this:
847              
848             get_url $thatsite;
849             m/Top Stories Tonight/ or die "What, no top stories?";
850             while( m{(.*?)}g ) {
851             rss_item("$thatsite/$1", $2);
852             }
853              
854             =item * This returns the content of the URL not exactly as-is, but after
855             changing its newlines to native format. That is, if the contents of the
856             URL use CR-LF pairs to express newlines, then C changes these
857             to C<\n>'s before returning the content. (Similarly for old MacOS
858             newline format.) Clearly this is wrong in you're dealing with binary
859             data; in that case, use LWP::Simple's C directly.
860              
861             =item * Finally, as a resource-conversation measure, this function
862             will also try to call C a few times if it sees several quick
863             calls to itself coming from a program that seems to be running
864             under crontab. As most of my RSS-generators are crontabbed, I
865             find it very useful that I can have however many C's in
866             my crontabbed programs without worrying that they'll take even a
867             noticeable part of the server's bandwidth.
868              
869             =back
870              
871             =cut
872              
873             my $last_url_get_time;
874              
875             sub get_url ($) {
876 6 50   6 1 602 my $self = shift if @_ > 1; # work as a function or method
877              
878             # Go whip up an object unless one was given:
879 6 100 33     41 $self = ($RSS_obj ||= XML::RSS::SimpleGen::->new) unless defined $self;
880 6 50       26 $self = $self->new unless ref $self;
881            
882 6         13 my $url = $_[0];
883 6 50 33     81 Carp::croak "\"$url\" doesn't look like a URL!\nAborting"
884             unless defined($url) and $url =~ m{^[a-z]+[a-z0-9]*:\S+$}s;
885            
886 6 50 33     44 if($Sleepy and $last_url_get_time) {
887 0         0 my $delay = $Sleepy - (time() - $last_url_get_time);
888 0 0       0 if( $delay > 0 ) {
889 0         0 DEBUG and print "Last URLget was at $last_url_get_time, sleep $delay\n";
890 0         0 sleep $delay;
891             } else {
892 0         0 DEBUG and print "Last URLget was at $last_url_get_time, so no sleep.\n";
893             }
894             }
895            
896 6         9 my $content;
897 6         13 my $count = 0;
898 6         23 foreach my $delay (@Retry_delays, 0) {
899 14         87 delete $self->{'_loops'};
900 14         30 delete $self->{'_http_abort'};
901              
902 14         26 DEBUG and print "Getting $url ...\n";
903 14         29 ++$count;
904 14 100       86 if( defined(
905             $content = $self->_get($url)
906             )) {
907 4         1232688 DEBUG and print "OK, got it (", length($content), " bytes)\n";
908 4         18 last;
909             }
910            
911 10 50       5017467 if( $self->{'_http_abort'} ) {
912 0         0 DEBUG and print "Couldn't get it and won't retry (",
913             $self->{'_http_abort'}, ")\n";
914 0         0 last;
915             } else {
916 10         31 DEBUG and print "Odd, couldn't get it.\n";
917             }
918 10 100       45 if($delay) { DEBUG and print "Sleeping $delay sec...\n"; sleep $delay; }
  8         12  
  8         148001596  
919             }
920 6         30 $last_url_get_time = time();
921              
922 6 100       23 unless( defined $content ) {
923 2         8 my $headers = $self->{'_http_headers'};
924 2 50 33     46 if( $headers and $headers =~ s{^}{ }mg ) {
925 2         7 $headers = "\nResponse headers:\n$headers";
926             } else {
927 0         0 $headers = '';
928             }
929 2 50       11 $count .= ($count == 1) ? " try" : " tries";
930 2         303 Carp::croak("Couldn't get $url in $count$headers\nSo, aborting")
931             }
932            
933 4 50       18 if($Nativize_newlines) {
934 4         8 if("\n" eq "\cj") { # CR => LF CR LFLF... => LF LF => LF
935 4         829 $content =~ s/\cm\cj*/\n/g; # most efficient, I dare say
936             } elsif("\n" eq "\cm") { # CR => CR CR LFLF... => CR LF => CR
937             $content =~ s/(?:(?:\cm\cj+)|\cj)/\n/g;
938             } else {
939             $content =~ s/(?:\n|\r|\n\r)/\n/g;
940             }
941             }
942            
943 4 50       28 $_ = $content unless defined wantarray;
944 4         63 return $content;
945             }
946              
947             #==========================================================================
948             # Things below here are less and less interesting to the casual reader.
949             #==========================================================================
950              
951             sub _get {
952 15     15   32 my $self = shift;
953 15 100 100     133 if(
954             !$LWP::Simple::VERSION
955             and $_[0] =~ m{^http://([^ \n\r\t/:\@]+)(?::(\d+))?(/\S*)?$}
956             ) {
957 2         6 my $host = $1;
958 2         5 my $path = $3;
959 2   50     15 my $port = 0 + ($2 || 80);
960 2 50       11 if( $BadPorts{$port} ) {
961 0         0 DEBUG and print "We don't like port $port from $_[0]\n";
962 0         0 return undef;
963             }
964            
965 2 50       6 $path = "/" unless defined($path);
966 2 50       8 $self = $self->new unless ref($self); # need to be an object method
967 2         9 return $self->_trivial_http_get($host, $port, $path);
968             } else {
969 13         73 return $self->_lwp_get(@_);
970             }
971             }
972              
973             sub _lwp_get {
974 13     13   1149 require LWP::Simple;
975 13         79256 DEBUG and print "Calling LWP::Simple on $_[1]\n";
976 13         104 return LWP::Simple::get($_[1]);
977             }
978              
979             sub _trivial_http_get {
980 2     2   4 my($self, $host, $port, $path) = @_;
981 2         2 DEBUG > 1 and print "Getting HOST=$host, PORT=$port, PATH=$path\n";
982              
983 2 50       6 $self = $self->new unless ref($self); # need to be an object method
984 2   50     14 my $seen = ( $self->{'_loops'} ||= {} );
985              
986 2         867 require IO::Socket;
987 2         23200 local($^W) = 0;
988 2   50     21 my $sock = IO::Socket::INET->new(PeerAddr => $host,
989             PeerPort => $port,
990             Proto => 'tcp',
991             Timeout => 60) || return undef;
992 2         527171 $sock->autoflush;
993 2         120 my $netloc = $host;
994 2 50       11 $netloc .= ":$port" if $port != 80;
995 2         148 print $sock join("\015\012" =>
996             "GET $path HTTP/1.0",
997             "Host: $netloc",
998             "User-Agent: $UserAgentString",
999             "", "");
1000              
1001 2         5 my $buf = "";
1002 2         4 my $n;
1003 2         256758 while( $n = sysread($sock, $buf, 8*1024, length($buf)) ) {
1004 33         46 DEBUG > 10 and print " Got $n bytes...\n";
1005 33         459349 1;
1006             }
1007 2 50       16 return undef unless defined($n);
1008              
1009 2 50       35 if( $buf =~ m{^HTTP/\d+\.\d+\s+(\d+)[^\012]*\012} ) {
1010 2         15 my $code = $1;
1011 2         3 DEBUG and print " Got HTTP status: $code\n";
1012 2 100 66     29 if( $code =~ m/^30[1237]/s and $buf =~ m/\012Location:\s*(\S+)/ ) {
1013             # Redirection
1014 1         5 my $url = $1;
1015 1 50 33     23 if( $url =~ m/^(file|mailto):/ # protocols we hate
      33        
1016             or $seen->{$url}++
1017             or scalar(keys %$seen) > 20
1018             ) {
1019 0         0 DEBUG and print "I don't like the redirection response $url\n";
1020 0         0 $self->{'_http_abort'} = "Bad HTTP-redirection loop.";
1021 0         0 return undef;
1022             }
1023 1         13 return $self->_get($url); # Recurse!
1024             }
1025              
1026 1         183 $buf =~ s/(.+?)\015?\012\015?\012//s; # zap the header
1027 1         13 ( $self->{'_http_headers'} = $1 ) =~ tr/\015\012/\n/sd;
1028            
1029 1         6 DEBUG > 10 and print "Headers: $$self{'_http_headers'}\n";
1030            
1031 1 50 33     13 $self->{'_http_abort'} = "HTTP status $code"
1032             if $code eq '404' or $code eq '403';
1033 1 50       6 return undef unless $code =~ m/^2/;
1034            
1035 1         3 DEBUG and print "Returning ", length($buf), " bytes of content.\n";
1036             } else {
1037 0         0 DEBUG and print "Got a headerless response. Returning.\n";
1038             }
1039              
1040 1         133 return $buf;
1041             }
1042              
1043             #==========================================================================
1044              
1045             sub _read_history_file {
1046 3     3   8 my $self = $_[0];
1047 3         11 my $hf = $self->history_file;
1048 3 50 33     23 unless(defined $hf and length $hf) {
1049 0         0 DEBUG and print "No history_file defined for $self.\n";
1050 0         0 return;
1051             }
1052            
1053 3         6 my(%first, %last); # "last" in the sense of "most recently"
1054 3         14 $self->{'_first_seen'} = \%first;
1055 3         8 $self->{'_last_seen' } = \%last ;
1056              
1057 3 100 66     181 unless( -e $hf and -s _ ) {
1058 1 50       11 $self->{'_virgin_item_timestamp'} = time() if $DWIM;
1059 1         3 return 0;
1060             }
1061              
1062 2         7 my $now = time();
1063 2         6 my $forget_before;
1064            
1065             my $in;
1066             {
1067 2         4 local *IN;
  2         8  
1068 2 50       130 open(IN, $hf) or Carp::confess "Can't read-open $hf: $!"; # insane
1069 2         11 $in = *IN{IO};
1070             }
1071 2         4 local $_;
1072 2         5 my @f;
1073 2         87 while(<$in>) {
1074 3         12 tr/\n\r//d;
1075 3 50 33     57 next unless @f = split "\t", $_, 3 and defined($f[0]) and length($f[0]);
      33        
1076              
1077 3 50       16 $last{$f[0]} = $f[2] if $f[2];
1078              
1079 3 50       8 if( $f[1] ) {
1080 3         10 $first{$f[0]} = $f[1];
1081 3         30 DEBUG > 3 and print " Learning $f[0] first seen at $f[1]\n";
1082             } else {
1083 0         0 $first{$f[0]} = $now;
1084 0         0 DEBUG > 3 and print " Prelearning $f[0] first seen at $f[1]\n";
1085             # TODO: is this useful?
1086             }
1087             }
1088 2         84 close($in);
1089              
1090 2         13 return 1;
1091             }
1092              
1093             # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
1094              
1095             sub _write_history_file {
1096 3     3   5 my $self = $_[0];
1097 3         8 my $hf = $self->history_file;
1098 3 50 33     20 unless(defined $hf and length $hf) {
1099 0         0 DEBUG and print "No history_file defined for $self.\n";
1100 0         0 return;
1101             }
1102            
1103 3         7 my $first = $self->{'_first_seen'};
1104 3         6 my $last = $self->{'_last_seen' };
1105 3         21 my $now = time();
1106            
1107 3         7 my $out;
1108 3         4 DEBUG and print "Saving to history file $hf\n";
1109 3         4 DEBUG > 2 and printf "Time now: %s = %s\n", time(), scalar(localtime);
1110             {
1111 3         5 local *HF;
  3         8  
1112 3 50       425 open(HF, ">$hf") or Carp::confess "Can't write-open $hf: $!"; # insane
1113 3         61 $out = *HF{IO};
1114             }
1115             {
1116 3         6 my(@f, @x);
  3         5  
1117             {
1118 3         4 my %x;
  3         42  
1119 3         22 @x{keys %$first, keys %$last} = ();
1120 3         19 @x = sort keys %x;
1121             }
1122              
1123 3         5 my $forget_before;
1124             {
1125 3         6 my $r = $self->retention;
  3         12  
1126 3 50 33     27 if( $r and $r > 0 ) { $forget_before = time() - $r }
  3         9  
1127             }
1128 3         6 foreach my $key (@x) {
1129 6 50       22 next if $key =~ m/[\t\n\r]/;
1130 6 50 33     31 if( $forget_before and $last->{$key} < $forget_before ) {
1131 0         0 DEBUG and printf(
1132             " Forgetting %s as being too old since last-seen (%s = %s)\n",
1133             $key,
1134             $last->{$key},
1135             scalar(localtime($last->{$key})),
1136             );
1137 0         0 next;
1138             }
1139 6   50     88 print $out $key, "\t",
      50        
1140             $first->{$key}||'', "\t",
1141             $last->{$key}||'', "\n",
1142             ;
1143            
1144 6         17 if(DEBUG > 2) {
1145             printf " Key %s\n First seen: %s = %s\n Last seen: %s = %s\n",
1146             $key,
1147             $first->{$key}||'-',
1148             scalar(localtime( $first->{$key}||0 )),
1149             $last->{$key}||'-',
1150             scalar(localtime( $last->{$key}||0 )),
1151             }
1152              
1153             }
1154 3         195 close($out);
1155             }
1156 3         8 DEBUG and print "Done saving to $hf\n";
1157 3         8 return 1;
1158             }
1159              
1160             #-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
1161              
1162              
1163             =item C I C I C
1164             I C I C I C
1165              
1166             Calling one of these functions declares that this feed is usually
1167             generated at the same time(s) every day (or every week, in the case of
1168             C). And, where it's not just once a day/week, these multiple
1169             times a day are evenly spaced. These functions then set the feed's
1170             C, C, C, C,
1171             C, and C elements appropriately, so that RSS readers can know at
1172             at what times there could (or couldn't) be new content in this feed.
1173              
1174             In other words: use C if this feed is updated at about the
1175             same time every day and then again 12 hours later. Use C
1176             if this feed is updated at the same time daily, and then 8 hours later, and
1177             then 8 hours later. And use C if the feed updates
1178             at about I minutes past every even numbered hour, or every odd-numbered
1179             hour.
1180              
1181             Clearly I mean these functions to be used in programs that are crontabbed
1182             to run at particular intervals, as with a crontab line like one of these:
1183              
1184             52 * * * * ~/thingy # => rss_hourly
1185             52 23 * * * ~/thingy # => rss_daily
1186             52 4,16 * * * ~/thingy # => rss_twice_daily
1187             52 5,13,21 * * * ~/thingy # => rss_thrice_daily
1188             52 23 * * 3 ~/thingy # => rss_weekly
1189             52 */2 * * * ~/thingy # => rss_every_other_hour
1190              
1191             Clearly there aren't C> functions for all the scheduling
1192             possibilities programs -- if you have a program that has to run at
1193             6am, 8am, 1pm, and 4pm, there's no function for that. However, the above
1194             crontab lines (or with minor changes, like C<1,9,17> instead of C<5,13,21>)
1195             are just fine for almost every RSS feed I've run.
1196              
1197             An aside: I recommend running the programs at about 52 minutes past the hour,
1198             generally in series, like so:
1199              
1200             52 5,13,21 * * * ~/thingy ; ~/dodad ; ~/makething ; ~/gizmo
1201              
1202             However, your mileage may vary.
1203              
1204             Incidentally, these functions take no arguments, so the parentheses are
1205             optional. That is, these two lines do the same thing:
1206              
1207             rss_hourly;
1208             rss_hourly();
1209              
1210             =cut
1211              
1212             ## Some handy if somewhat arbitrary shortcuts:
1213             #
1214             sub hourly {
1215 0     0 0 0 my $self = $_[0];
1216 0         0 $self->updatePeriod('hourly');
1217 0         0 $self->ttl('60');
1218 0         0 return;
1219             }
1220              
1221             sub daily {
1222 0     0 0 0 my $self = $_[0];
1223 0         0 $self->updateHours();
1224 0         0 $self->updatePeriod('daily');
1225 0         0 $self->ttl( 24 * 60 );
1226 0         0 return;
1227             }
1228              
1229             sub twice_daily {
1230 0     0 0 0 my $self = $_[0];
1231 0         0 my $h = (gmtime( $self->_now_rounded_up ))[2];
1232 0         0 $self->updateHours( $h, ($h+12) % 24 );
1233 0         0 $self->updatePeriod('daily',2);
1234 0         0 $self->ttl( 12 * 60 );
1235 0         0 return;
1236             }
1237              
1238             sub thrice_daily {
1239 0     0 0 0 my $self = $_[0];
1240 0         0 my $h = (gmtime( $self->_now_rounded_up ))[2];
1241 0         0 $self->updateHours( $h, ($h+ 8) % 24, ($h+16) % 24 );
1242 0         0 $self->updatePeriod('daily',3);
1243 0         0 $self->ttl( 8 * 60 );
1244 0         0 return;
1245             }
1246              
1247             sub weekly {
1248 0     0 0 0 my $self = $_[0];
1249 0         0 $self->updateHours();
1250 0         0 $self->updateDays();
1251 0         0 $self->updatePeriod('weekly',1);
1252 0         0 $self->ttl( 7 * 24 * 60 );
1253 0         0 return;
1254             }
1255              
1256             {
1257             my(@odds, @evens);
1258             for(0 .. 23) { push @{ ($_ & 1) ? \@odds : \@evens }, $_ }
1259              
1260             sub every_other_hour {
1261 0     0 0 0 my $self = $_[0];
1262 0         0 my $h = (gmtime( $self->_now_rounded_up ))[2];
1263 0 0       0 $self->skipHours( ($h & 1) ? @evens : @odds);
1264 0         0 $self->updatePeriod('daily',12);
1265 0         0 $self->ttl( '120' );
1266 0         0 return;
1267             }
1268             }
1269             #..........................................................................
1270              
1271             my @day_names = (
1272             "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
1273             );
1274             my %is_day_name;
1275             @is_day_name{@day_names} = @day_names;
1276              
1277             #..........................................................................
1278              
1279             sub _now_rounded_up {
1280 0     0   0 my $self = $_[0];
1281 0   0     0 my $now = $_[0]{'_force_now'} # to be used only for testing purposes
1282             || time();
1283 0 0       0 if( $CHUNK_MINUTES ) {
1284 0         0 my $tweak = ($CHUNK_MINUTES * 60) - ($now % ($CHUNK_MINUTES * 60));
1285 0         0 DEBUG and print "Tweaking $now (", scalar(gmtime($now)),
1286             " GMT) by $tweak seconds\n so it'll be an even $CHUNK_MINUTES minutes: ";
1287 0         0 $now += $tweak;
1288 0         0 DEBUG and print "$now (", scalar(gmtime($now)), " GMT)\n";
1289             } else {
1290             # probably not a good idea
1291 0         0 DEBUG and print "Archoring timebase to right this moment.\n";
1292             }
1293 0         0 return $now;
1294             }
1295              
1296 0   0 0   0 sub _now { return $_[0]{'_force_now'} || time() }
1297              
1298             #..........................................................................
1299              
1300             =back
1301              
1302             =head2 MINOR FUNCTIONS
1303              
1304             These are functions that you probably won't need often, or at all. I include
1305             these for the sake of completeness, and so that advanced users might find them
1306             useful in some cases.
1307              
1308             =over
1309              
1310             =item C );>
1311              
1312             This function directly sets the C element's values to the
1313             specified GMT hour numbers.
1314              
1315             =cut
1316              
1317             sub skipHours {
1318 1     1 0 7 my($self, @in) = @_;
1319 1         3 my @skiphours;
1320 1         3 foreach my $hour (@in) {
1321 1 50       4 next unless defined $hour;
1322 1 50 33     7 Carp::croak "\"$hour\" isn't a valid hour number (must be 0-23)"
1323             unless $hour <= 24 and $hour >= 0;
1324 1 50       5 push @skiphours, ($hour == 24) ? 0 : (0 + $hour);
1325             # tolerate "24" to mean 0h.
1326             }
1327 1         1 DEBUG and print "Skipdays: @skiphours\n";
1328 1         9 $self->{'skipHours'} = join '',
1329             "", map( "$_", @skiphours), "\n"
1330             ;
1331 1         4 return;
1332             }
1333              
1334             #--------------------------------------------------------------------------
1335              
1336             =item C
1337              
1338             =item C );>
1339              
1340             This function is a wrapper around C -- you call
1341             C with a list of GMT hour numbers, and
1342             C will call C except I
1343             whatever hour numbers you specified.
1344              
1345             If you call with an empty list (i.e., C), then
1346             we uses C to find out the current hour (and rounds it up
1347             if it's after 50 minutes past), basically just as if you'd called:
1348              
1349             rss_updateHours( (gmtime(600+time()))[2] );
1350              
1351             =cut
1352              
1353             sub updateHours { # Just the complement of skipHours.
1354             # Feeding it () means updates only at this hour of day
1355             # (This is fine since you'd never mean to actually declare
1356             # a feed whose updateHours is never, i.e. whose skipHours
1357             # is all of 0-23)
1358 0     0 0 0 my $self = shift;
1359 0         0 my %hours;
1360 0 0       0 @hours{ @_ ? (map $_+0, @_) :
1361             ( gmtime( $self->_now_rounded_up ) )[2] } = ();
1362             # might as well count 10:57 as 11h, etc
1363 0         0 DEBUG and print "HOURS UPDATEY :", join(' ', sort keys %hours), "\n";
1364 0         0 $self->skipHours( grep !exists($hours{$_}), 0 .. 23 );
1365 0         0 return;
1366             }
1367              
1368             #..........................................................................
1369              
1370             =item C
1371              
1372             =item C );>
1373              
1374             =item C );>
1375              
1376             This function directly sets the C element's values to the
1377             specified weekdays. Note that this accepts either integers (like
1378             6 for Saturday, Sunday being either 0 or 7), or their exact
1379             English names.
1380              
1381             If you use the C field, consider that it refers to days
1382             figured by GMT, not local time. For example, if I say to skip Saturdays,
1383             that means Saturdays GMT, which in my timezone (Alaska) starts
1384             in the middle of Friday afternoon.
1385              
1386             =cut
1387              
1388             sub skipDays {
1389 3     3 0 16 my($self,@in) = @_;
1390 3         5 my @skipdays;
1391 3         5 foreach my $day (@in) {
1392 3 50       82 next unless defined $day;
1393 3 100       19 if($day =~ m/^[0-7]$/s) { # tolerate numeric day specs
1394 1 50       5 $day = 0 if $day == 7;
1395 1   33     5 $day = $day_names[$day]
1396             || Carp::croak "Day number \"$day\" is out of range (0-6)";
1397             } else {
1398 2 50       6 Carp::croak "\"$day\" isn't a valid day name (use 0-6 or one of: @day_names)"
1399             unless $is_day_name{$day};
1400             }
1401 3         19 push @skipdays, $day;
1402             };
1403            
1404 3         5 DEBUG and print "Skipdays: @skipdays\n";
1405 3         58 $self->{'skipDays'} = join '',
1406             "", map( "$_", @skipdays), "\n"
1407             ;
1408 3         8 return;
1409             }
1410              
1411             #..........................................................................
1412              
1413             =item C
1414              
1415             =item C );>
1416              
1417             =item C );>
1418              
1419             This function is a wrapper around C -- you call
1420             C with a list of GMT day names/numbers, and
1421             C will call C except I
1422             whatever days you specified.
1423              
1424             If you call with an empty list (i.e., C), then
1425             we uses C to find out the current day (GMT!), basically
1426             just as if you'd called:
1427              
1428             rss_updateDays( (gmtime(600+time()))[6] );
1429              
1430              
1431             =cut
1432              
1433             sub updateDays { # just the complement of skipDays
1434             # Feeding it () means updates only at this day of the week
1435             # (This is fine since you'd never mean to actually declare
1436             # a feed whose updateDays is never, i.e., whose updateDays
1437             # is all of Monday thru Sunday)
1438 0     0 0 0 my $self = shift;
1439 0         0 my %days;
1440             @days{
1441 0 0       0 map {;
1442 0 0       0 m/^[0-7]$/ ? $day_names[ ($_ == 7) ? 0 : $_ ]
    0          
    0          
1443             : $is_day_name{$_} ? $_
1444             : Carp::croak "\"$_\" isn't a valid day name (use 0-6 or one of: @day_names)"
1445             }
1446             @_ ? @_ : ( gmtime( $self->_now_rounded_up ) )[6]
1447             } = ();
1448 0         0 DEBUG and print "DAYS UPDATEY :", join(' ', sort keys %days), "\n";
1449 0         0 $self->skipDays( grep !exists($days{$_}), @day_names );
1450 0         0 return;
1451             }
1452              
1453              
1454             #--------------------------------------------------------------------------
1455              
1456             =item C );>
1457              
1458             This function directly sets the C element's value to the
1459             period specified. You must specify one of the strings:
1460             "yearly", "monthly", "weekly", "daily", "hourly".
1461             I advise using "weekly" only if you know what you're doing, and
1462             "yearly", "monthly" only if you I know what you're doing.
1463              
1464             =item C, I, I );>
1465              
1466             This is a shortcut for
1467             C<<
1468             rss_updatePeriod(I); rss_updateFrequency(I)
1469             >>
1470              
1471             =item C, I, I );>
1472              
1473             This is a shortcut for
1474             C<<
1475             rss_updatePeriod(I); rss_updateFrequency(I);
1476             rss_updateBase(I)
1477             >>
1478              
1479             =cut
1480              
1481             sub updatePeriod {
1482 0 0   0 0 0 return $_[0]->{'updatePeriod'} if @_ == 1;
1483             # Else we're a write accessor:
1484            
1485 0         0 my($self, $period) = @_;
1486 0 0       0 $period or Carp::confess(
1487             "What period? yearly/monthly/weekly/daily/hourly?");
1488              
1489 0 0       0 if($period =~ m/^(yearly|monthly|weekly|daily|hourly)$/s) {
1490 0         0 $self->{'updatePeriod'} = $period;
1491 0 0       0 $self->updateFrequency($_[2]) if @_ > 2;
1492 0 0       0 $self->updateBase($_[3]) if @_ > 3;
1493 0         0 $self->{'_namespaces'}{'sy'} = $NAMESPACE_SY;
1494 0         0 return $period;
1495             } else {
1496 0         0 Carp::confess(
1497             "What kind of period is \"$period\"? Use one of: yearly|monthly|weekly|daily|hourly");
1498             }
1499             }
1500              
1501             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1502              
1503             =item C );>
1504              
1505             =item C );>
1506              
1507             This function directly sets the C element's value to the
1508             moment specified. If you pass in an epoch time, it is converted to
1509             an ISO date string.
1510              
1511             =cut
1512              
1513             sub updateBase {
1514 7 50   7 0 19 return $_[0]->{'updateBase'} if @_ == 1;
1515             # Else we're a write accessor:
1516            
1517 7         14 my($self, $then) = @_;
1518 7 50       60 if( $then =~ m/^-?\d+$/s ) {
    50          
1519 0         0 $self->{'updateBase'} = $self->_epoch2isodate($then);
1520             } elsif ( $then =~ # basically ISO8601
1521             m<^
1522             \d\d\d\d # year
1523             (?:
1524             -\d\d # month
1525             (?:
1526             -\d\d # day
1527             (?:
1528             T\d\d:\d\d # hh:mm
1529             (?:
1530             :\d\d # seconds
1531             (?:
1532             \.\d+ # fractions of a second
1533             )?
1534             )?
1535             (?: # TZ offset
1536             Z # Zulu
1537             |
1538             (?: # or by offset:
1539             [-+]
1540             \d\d:\d\d # hh:mm, with leading '+' or '-'
1541             )
1542             )?
1543             )?
1544             )?
1545            
1546             )?
1547             # The month was optional, but that
1548             # makes "1994" ambiguous -- a year or an epoch time?
1549             # I doubt anyone wants to write 2000,
1550             # esp as the default is basically == 1970.
1551             $
1552             >sx
1553             ) {
1554 7         16 $self->{'updateBase'} = $then;
1555             } else {
1556 0         0 Carp::confess("What kind of updateBase is \"$then\"?");
1557             }
1558 7         48 return $self->{'updateBase'};
1559             }
1560              
1561             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1562             sub _epoch2isodate {
1563 0     0   0 my($self, $time) = @_;
1564 0         0 my(@timelist) = gmtime($time);
1565 0         0 ++$timelist[4];
1566 0         0 $timelist[5] += 1900;
1567 0         0 return sprintf("%04d-%02d-%02dT%02d:%02d+00:00",
1568             @timelist[ 5, 4, 3 , 2 ,1 ]);
1569            
1570             }
1571              
1572             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1573             %PeriodAsSeconds = (
1574             'hourly' => 60 * 60,
1575             'daily' => 60 * 60 * 24,
1576             'weekly' => 60 * 60 * 24 * 7,
1577             );
1578              
1579             sub _updateBase_init {
1580             # What to do? Peg it on now plus Interval, rounded up to next M-minute interval?
1581 0     0   0 my $self = $_[0];
1582 0         0 my $now = $self->_now_rounded_up;
1583            
1584 0         0 my $period_type = $self->updatePeriod;
1585 0 0 0     0 if( $period_type eq 'monthly' or $period_type eq 'yearly') {
1586 0         0 $now = $self->_epoch2isodate( $now );
1587 0 0       0 if($period_type eq 'yearly') {
    0          
1588 0 0       0 $now =~ s/^....-/2000-/s || die "SPORK93931!"; # sanity
1589             } elsif($period_type eq 'monthly') {
1590 0 0       0 $now =~ s/^....-..-/2000-01-/s || die "SPORK35286!"; # sanity
1591             } else {
1592 0         0 die "SPORK!!!!!94829045"; #sanity
1593             }
1594 0         0 DEBUG > 1and print "Using $now as updateBase.\n";
1595             } else {
1596             # It's a plain ol' interval.
1597 0         0 DEBUG > 1 and print "Slapping $now (", gmtime($now). " into 1970.\n";
1598 0         0 my $freq = $self->updateFrequency();
1599 0   0     0 my $interval_as_seconds = int(
1600             ($PeriodAsSeconds{$period_type} ||die "SPORK84258 on '$period_type'!")
1601             / $freq)
1602             ;
1603 0         0 DEBUG > 1 and print
1604             "So, $freq times $period_type is $interval_as_seconds seconds...\n";
1605 0         0 $now %= $interval_as_seconds;
1606 0         0 DEBUG > 1 and print "Using $now (", gmtime($now). " as updateBase.\n";
1607             }
1608 0         0 $self->updateBase($now);
1609 0         0 return;
1610             }
1611              
1612             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1613              
1614             =item the accessor C );>
1615              
1616             This function directly sets the C element's value to the
1617             value specified. The value has to be a nonzero positive integer.
1618              
1619             For example, this means that this feed updates at/by the start of every hour
1620             and 30 minutes past:
1621              
1622             rss_updateBase('2000-01-01T00:00-00:00');
1623             rss_updateFrequency(2);
1624             rss_updatePeriod('hourly'); # 2*hourly means "twice an hour"
1625              
1626             Recall that this can also be done with the
1627             the C, I, I )> shortcut, like so:
1628              
1629             rss_updateBase('hourly', 2, '2000-01-01T00:00-00:00');
1630              
1631             =cut
1632              
1633             sub updateFrequency { # simple accessor, plus the int() thing
1634 0 0   0 0 0 if(@_ > 1) {
1635 0   0     0 $_[0]->{'updateFrequency'} = int($_[1]) || 1;
1636             } else {
1637 0         0 $_[0]->{'updateFrequency'}
1638             }
1639             }
1640              
1641 0   0 0   0 sub _updateFrequency_init { $_[0]->{'updateFrequency'} ||= 1 }
1642              
1643             #--------------------------------------------------------------------------
1644              
1645             sub _sy_update_as_xml {
1646 21     21   32 my $self = $_[0];
1647 21 50       197 return '' unless $self->{'updatePeriod'};
1648 0         0 $self->_updateFrequency_init();
1649 0         0 $self->_updateBase_init();
1650 0         0 return sprintf(
1651              
1652             q{%s
1653             %s
1654             %s
1655             },
1656             $self->updateFrequency,
1657             $self->updatePeriod,
1658             $self->updateBase,
1659             );
1660             }
1661              
1662             #--------------------------------------------------------------------------
1663              
1664             # Aliases:
1665 0     0 0 0 sub skip_hours {shift->skipHours( @_)}
1666 0     0 0 0 sub skip_days {shift->skipDays( @_)}
1667 0     0 0 0 sub update_days {shift->updateDays( @_)}
1668 0     0 0 0 sub update_hours {shift->updateHours( @_)}
1669 0     0 0 0 sub update_period {shift->updatePeriod( @_)}
1670 0     0 0 0 sub update_base {shift->updateBase( @_)}
1671 0     0 0 0 sub update_frequency {shift->updateFrequency( @_)}
1672 0     0 0 0 sub webmaster {shift->webMaster( @_)}
1673              
1674              
1675             #--------------------------------------------------------------------------
1676              
1677             =item the accessor C)>
1678              
1679             If you are using an C)>, the history file will
1680             accrete a list of all URLs it has seen. But to keep this file from potentially
1681             getting immense, items that haven't been seen for a while are thrown out.
1682             The period of time a feed's items go unseen before each is forgotten is
1683             called that feed's B, and is expressed in seconds.
1684              
1685             The default retention value is 32 days (i.e., 32*24*60*60, the number
1686             of seconds in 32 days). If you wanted to change it to just a week,
1687             you would do this with C.
1688              
1689             As a special case, a zero or negative value for the retention means
1690             to never clear anything from the history file, no matter how long
1691             it has gone unseen.
1692              
1693             =cut
1694              
1695             #..........................................................................
1696              
1697             =item C );>
1698              
1699             Call this function if you want to add extra XML comments to this RSS file.
1700             For example, if you call this:
1701              
1702             rss_add_comment(
1703             "Our terms of use: http://wherever.int/rsstou.html",
1704             "Any questions? Ask jimmy@wherever.int",
1705             );
1706              
1707             ...then this RSS feed will contain this XML fairly early on in the file:
1708              
1709            
1710            
1711              
1712             =cut
1713              
1714             sub add_comment {
1715 0     0 0 0 my $self = shift;
1716 0         0 foreach my $c (@_) {
1717 0 0 0     0 push @{ $self->{'_comments'} }, $c
  0   0     0  
1718             if defined $c and length $c and $c =~ m/\S/;
1719             # no point in having contentless comments
1720             }
1721 0         0 return;
1722             }
1723              
1724             #--------------------------------------------------------------------------
1725              
1726             =item the accessor C )>
1727              
1728             This defines the given URL as being the XML-CSS stylesheet for this RSS
1729             feed. The default value is "./rss.css" if C<-e "rss.css"> is true, otherwise
1730             is the value http://www.interglacial.com/rss/rss.css
1731              
1732             =item the accessor C )>
1733              
1734             This defines the given URL as being the XML-XSL stylesheet for this RSS
1735             feed. The default value is none.
1736              
1737             =item The accessors
1738             C<<
1739             rss_url( I ), rss_title( I ), rss_description( I )
1740             >>
1741              
1742             These define this feed's URL, title, and description. These functions
1743             are just for completeness, since it's simpler to just specify any/all of
1744             these parameters in the call to C.
1745              
1746             =item the accessor C )>
1747              
1748             This sets the parameter of this RSS feed's C element, which
1749             suggests how long (in minutes, not seconds!) an RSS reader should wait after it polls
1750             a feed until it polls it again. For example, C would suggest
1751             that a reader should not poll this feed more often than every 90 minutes.
1752              
1753             (This element is somewhat obsolescent next to the newer and more
1754             informative C elements, but is included for backward
1755             compatability.)
1756              
1757             =item the accessor C )>
1758              
1759             This controls whether or not duplicate items are filtered out out the
1760             feed. By default this is I. Note that duplicates are detected only
1761             by their URL, so if you call this:
1762              
1763             rss_item('http://foo.int/donate', "Give!");
1764             rss_item('http://foo.int/donate', "We need money!");
1765             rss_save('begging.rss');
1766              
1767             ...then only the first will appear in the feed, since the second item
1768             has a URL that is already being saved in this feed. (However,
1769             C is still 2, because filtering out duplicates is
1770             something that only happens as the feed is saved.)
1771              
1772             =item the accessor C )>
1773              
1774             This sets the value of the not-generally-useful C RSS element.
1775             The default value is "./about_rss.html" if C<-e "about_rss.html"> is
1776             true, otherwise "http://www.interglacial.com/rss/about.html".
1777              
1778             =item the accessors
1779             C<<
1780             rss_image_url(I), rss_image_width(I),
1781             rss_image_height(I),
1782             rss_image_title(I), rss_image_link(I),
1783             rss_image_description(I)
1784             >>
1785              
1786             These are for manually setting the values of this feed's image element's
1787             subelements:
1788              
1789            
1790             (rss_image_url)
1791             (rss_image_width)
1792             (rss_image_height)
1793             (rss_image_title)
1794             (rss_image_link)
1795             (rss_image_description)
1796            
1797              
1798             You rarely need to call any of these C> functions --
1799             usually just calling C );> is enough.
1800              
1801             =cut
1802              
1803             #..........................................................................
1804              
1805             my @DoW = qw(Sun Mon Tue Wed Thu Fri Sat);
1806             my @Month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1807              
1808             sub _date_rfc822 {
1809 21     21   36 my $self = shift;
1810 21 0       129 my @timelist = (@_ == 0) ? gmtime : (@_ == 1) ? gmtime($_[0]) : @_;
    50          
1811 21         50 $timelist[5] += 1900;
1812              
1813 21         268 return sprintf(
1814             "%s, %02d %s %04d %02d:%02d:%02d GMT",
1815             $DoW[$timelist[6]],
1816             $timelist[3],
1817             $Month[$timelist[4]],
1818             @timelist[5, 2,1,0],
1819             );
1820             }
1821              
1822             sub _check_age {
1823 0     0   0 my($self, $filename, $maxage) = @_;
1824 0         0 my $age;
1825 0 0 0     0 if($filename and $maxage and $maxage < ($age = -M $filename)) {
      0        
1826 0   0     0 Carp::carp(
1827             sprintf "%s warning - %s is getting old -- it hasn't changed in %0.1f days!",
1828             ref($self) || $self,
1829             $filename,
1830             $age,
1831             );
1832             }
1833 0         0 return 0;
1834             }
1835              
1836             #..........................................................................
1837             sub _various_comments {
1838 21     21   32 my $self = $_[0];
1839 21         29 my $x;
1840             return
1841             join '',
1842 0 50       0 map {; $x = $_; $x =~ s/--/__/g; "\n"; }
  0         0  
  0         0  
  21         184  
1843 21         28 @{ $self->{'_comments'} || [] }
1844             ;
1845             }
1846              
1847             #..........................................................................
1848              
1849             sub _modversion_comment {
1850 21     21   29 my $self = $_[0];
1851 21         38 my $selfclass = ref($self);
1852 21   33     29 my $v = eval { $selfclass->VERSION } || $VERSION;
1853 21         123 return "\n\n";
1854             }
1855              
1856             #..........................................................................
1857              
1858             sub _accessorize { # A simple-minded method-maker
1859 8     8   115 no strict 'refs';
  8         18  
  8         704  
1860 8     8   19 foreach my $attrname (@_) {
1861 152 100       355 next if $attrname =~ m/::/; # a hack
1862 144         690 *{caller() . '::' . $attrname} = sub {
1863 8     8   49 use strict;
  8         17  
  8         20237  
1864 406 50 66 406   2489 unless((@_ == 1 or @_ == 2) and ref $_[0] ) {
      33        
1865 0         0 $Carp::CarpLevel = 1;
1866 0         0 Carp::croak(
1867             "Accessor usage: \$obj->$attrname() or \$obj->$attrname(\$new_value)"
1868             );
1869             }
1870 406 100       1934 (@_ == 1) ? $_[0]->{$attrname}
1871             : ($_[0]->{$attrname} = $_[1]);
1872 144         545 };
1873             }
1874             # Ya know, they say accessories make the ensemble!
1875 8         21 return;
1876             }
1877              
1878             #..........................................................................
1879              
1880             sub xmlesc {
1881 35     35 0 816 my $self = shift;
1882 35         82 my $out = join '', @_;
1883 35         112 $out =~ s<([^\x20\x21\x23\x24\x25\x27-\x3b\x3d\x3f-\x7e])>
1884 24 100       158 <$CommonEnts{$1} || ('&#'.(ord($1)).';') >seg;
1885 35         144 return $out;
1886             }
1887              
1888             sub xmlelem {
1889 0     0 0 0 my $self = shift;
1890 0         0 my $gi = shift;
1891 0         0 return "<$gi>" . $self->xmlesc(@_) . "";
1892             }
1893              
1894             #..........................................................................
1895              
1896             sub html2text {
1897              
1898             # A rudimentary HTML tokenizer, adapted and simplified
1899             # from Gisle Aas's HTML::Parser v2.25...
1900              
1901 136     136 0 314 local $_;
1902 136         166 my($self, @out);
1903 136         355 ($self, $_) = @_;
1904              
1905 136 50       333 return undef unless defined $_;
1906 136 50       293 return $$_ if ref($_) eq 'SCALAR'; # Pass a scalar ref to bypass!
1907 136 100       356 return '' unless length $_;
1908              
1909 135 100 100     2551 if( m/^[\x00-\x7e]+$/s and ! m/[<>&]/s) {
1910             # Most common case: a simple string needing no HTML parsing.
1911             # So just compact and trim whitespace, and move on.
1912 111         229 tr/\n\r\f\t / /s;
1913 111         166 s/ $//s;
1914 111         148 s/^ //s;
1915 111         590 return $_;
1916             }
1917            
1918 24         59 while( length($_) ) {
1919 86 100       547 if ( s@^([^&<]+)@@s) { # Any plaintext
    100          
    50          
    100          
    50          
    100          
    100          
    50          
1920 43         53 DEBUG > 5 and print ":: Plaintext {$1}\n";
1921 43         151 push @out, $1;
1922             } elsif ( s@^
1923             (
1924             &\#
1925             (?:
1926             (?: # dec
1927             ([0-9]{1,7})
1928             )|(?: # or hex
1929             [xX]([0-9a-fA-F]{1,7})
1930             )
1931             )
1932             )
1933             \b
1934             ;?
1935             @@sx
1936             ) {
1937 30         31 DEBUG > 5 and print ":: Numeric ent {$1}",
1938             defined($2) ? " (dec $2)"
1939             : defined($3) ? " (hex $3)"
1940             : " (weird!)",
1941             "\n";
1942              
1943 30 100 100     218 if( defined $3 and exists $WinLameEntities{hex($3)} ) {
    100 100        
1944             # it's a winlame ent, in hex
1945 2         3 DEBUG > 6 and print " Correcting that to &#$WinLameEntities{hex($3)};\n";
1946 2         50 push @out, \"&#$WinLameEntities{hex($3)};" ;
1947             } elsif( defined $2 and exists $WinLameEntities{0 + $2} ) {
1948             # it's a winlame ent, in decimal
1949 2         2 DEBUG > 6 and print " Correcting that to &#$WinLameEntities{0 + $2};\n";
1950 2         12 push @out, \"&#$WinLameEntities{0 + $2};" ;
1951             } else {
1952             # it's a normal entity
1953 26         29 DEBUG > 6 and print " Passing it thru as \"$1;\"\n";
1954 26         98 push @out, \"$1;";
1955             }
1956            
1957              
1958             } elsif ( s@^&([A-Za-z][A-Za-z0-9]{0,10})\b(;?)@@s) {
1959              
1960 0         0 DEBUG > 5 and print ":: Named entity {$1}\n";
1961 0 0       0 if(exists $Entities{$1}) {
    0          
1962 0         0 push @out, \ $Entities{$1};
1963             } elsif(exists $Entities{ ucfirst(lc($1)) }) {
1964 0         0 DEBUG > 6 and print "::: Treating as if it were &",
1965             ucfirst(lc($1)), ";\n";
1966 0         0 push @out, \ $Entities{ ucfirst(lc($1)) };
1967             } else {
1968 0         0 DEBUG > 6 and print "::: Unresolvable! Treating as plaintext\n";
1969 0   0     0 push @out, "&$1" . ($2 || '');
1970             }
1971              
1972              
1973             # Things we just ignore:
1974             } elsif( s@^()@@s ) { # "Netscape" comments
1975 2         5 DEBUG > 5 and print ":: Simple comment {$1}\n";
1976             } elsif( s@^(<[!?][^>]*>)@@s ) { # doctype, PI, or a weird comment
1977 0         0 DEBUG > 5 and print ":: Directive/PI {$1}\n";
1978             } elsif( s@^@@ ) {
1979             # End-tag, or a simple start-tag
1980 6 100       27 push @out, ' ' if $IsBlockMarkup{lc $1};
1981 6         27 DEBUG > 5 and print ":: Simple tag {$1}",
1982             $IsBlockMarkup{lc $1} ? " +breaky": (), "\n";
1983              
1984             } elsif ( # complex start-tag
1985             s@
1986             ^
1987             (
1988             <
1989             ([a-zA-Z][a-zA-Z0-9\.\:\_\-]{0,72}) # the G.I.
1990             (?:
1991             \s+
1992             [a-zA-Z][a-zA-Z0-9\.\:\_\-]{0,72}
1993             (?: # Attributes can be valueless!
1994             \s*
1995             =
1996             \s*
1997             (?: # Attribute value...
1998             (?:
1999             "[^"]*" # "foo bar"
2000             )|(?:
2001             '[^']*' # 'foo bar'
2002             )|(?:
2003             [^<>\n\r\t ]{0,90} # foobarbaz
2004             # 90 is my sane length-limit on unquoted attrvalues
2005             )
2006             )
2007             )?
2008             )* # any nubmer of attribs
2009             \s* # optional ending whitespace
2010             /? # optional emptiness flag
2011             >
2012             )
2013             @@x
2014             ) {
2015 4 100       19 push @out, ' ' if $IsBlockMarkup{lc $2};
2016 4         18 DEBUG > 5 and print ":: Complex start-tag {$2}",
2017             $IsBlockMarkup{lc $2} ? " +breaky": (), "\n";
2018            
2019             # Gets called only on aberrant input, I think:
2020             } elsif( s@^(.)@@s ) {
2021 1         1 DEBUG > 5 and print ":: Last-resort single character {$1}\n";
2022 1         6 push @out, $1;
2023            
2024             } else {
2025 0         0 die "IMPOSSIBLE {$_}\n !!! "; # sanity-check
2026             }
2027             }
2028            
2029 24         96 foreach my $x (@out) {
2030 79 100       234 if( ref $x ) {
2031 30         59 $x = $$x;
2032             } else { # do necessary escaping
2033 49         93 $x =~ tr/\n\r\f\t / /s; # smash whitespace
2034 49 100       212 $x = $self->xmlesc($x) unless $x =~ m/^[a-zA-Z0-9_'",\. ]*$/s
2035             # Dodge calling on the most common case, i.e., text
2036             # that really doesn't need any excuse at all.
2037             }
2038             }
2039              
2040 24         207 return join '', @out;
2041             }
2042              
2043             #..........................................................................
2044              
2045             $MIME_Type = 'application/rss+xml';
2046              
2047             %WinLameEntities = (
2048             do {;
2049             my $c = 0x80;
2050             map {; $c++ => (hex($_) || 0x007e) }
2051             qw(
2052             20ac 0000 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 00 017d 0000
2053             0000 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 00 017e 0178
2054             )# | # |
2055             });
2056              
2057             _init_common_ents();
2058              
2059             sub _init_common_ents {
2060             %CommonEnts = (
2061             '&' => '&',
2062             '<' => '<',
2063             '>' => '>',
2064             '"' => '"',
2065 256         507 ( map {; chr($_) => " " } 0 .. 31 ),
2066 8     8   23 ( map {; chr($_) => "&#$WinLameEntities{$_};" } keys %WinLameEntities ),
  256         1338  
2067             );
2068 8         217 return;
2069             }
2070              
2071             %Entities = (
2072             # HTML entities gotten from the HTML DTDs
2073            
2074             'Aacute' => 'Á', 'aacute' => 'á', 'Acirc' => 'Â',
2075             'acirc' => 'â', 'acute' => '´', 'AElig' => 'Æ',
2076             'aelig' => 'æ', 'Agrave' => 'À', 'agrave' => 'à',
2077             'alefsym' => 'ℵ', 'Alpha' => 'Α', 'alpha' => 'α',
2078             'amp' => '&', 'and' => '∧', 'ang' => '∠',
2079             'apos' => ''', 'Aring' => 'Å', 'aring' => 'å',
2080             'asymp' => '≈', 'Atilde' => 'Ã', 'atilde' => 'ã',
2081             'Auml' => 'Ä', 'auml' => 'ä', 'bdquo' => '„',
2082             'Beta' => 'Β', 'beta' => 'β', 'brvbar' => '¦',
2083             'bull' => '•', 'cap' => '∩', 'Ccedil' => 'Ç',
2084             'ccedil' => 'ç', 'cedil' => '¸', 'cent' => '¢',
2085             'Chi' => 'Χ', 'chi' => 'χ', 'circ' => 'ˆ',
2086             'clubs' => '♣', 'cong' => '≅', 'copy' => '©',
2087             'crarr' => '↵', 'cup' => '∪', 'curren' => '¤',
2088             'dagger' => '†', 'Dagger' => '‡', 'darr' => '↓',
2089             'dArr' => '⇓', 'deg' => '°', 'Delta' => 'Δ',
2090             'delta' => 'δ', 'diams' => '♦', 'divide' => '÷',
2091             'Eacute' => 'É', 'eacute' => 'é', 'Ecirc' => 'Ê',
2092             'ecirc' => 'ê', 'Egrave' => 'È', 'egrave' => 'è',
2093             'empty' => '∅', 'emsp' => ' ', 'ensp' => ' ',
2094             'Epsilon' => 'Ε', 'epsilon' => 'ε', 'equiv' => '≡',
2095             'Eta' => 'Η', 'eta' => 'η', 'ETH' => 'Ð',
2096             'eth' => 'ð', 'Euml' => 'Ë', 'euml' => 'ë',
2097             'euro' => '€', 'exist' => '∃', 'fnof' => 'ƒ',
2098             'forall' => '∀', 'frac12' => '½', 'frac14' => '¼',
2099             'frac34' => '¾', 'frasl' => '⁄', 'Gamma' => 'Γ',
2100             'gamma' => 'γ', 'ge' => '≥', 'gt' => '>',
2101             'harr' => '↔', 'hArr' => '⇔', 'hearts' => '♥',
2102             'hellip' => '…', 'Iacute' => 'Í', 'iacute' => 'í',
2103             'Icirc' => 'Î', 'icirc' => 'î', 'iexcl' => '¡',
2104             'Igrave' => 'Ì', 'igrave' => 'ì', 'image' => 'ℑ',
2105             'infin' => '∞', 'int' => '∫', 'Iota' => 'Ι',
2106             'iota' => 'ι', 'iquest' => '¿', 'isin' => '∈',
2107             'Iuml' => 'Ï', 'iuml' => 'ï', 'Kappa' => 'Κ',
2108             'kappa' => 'κ', 'Lambda' => 'Λ', 'lambda' => 'λ',
2109             'lang' => '〈', 'laquo' => '«', 'larr' => '←',
2110             'lArr' => '⇐', 'lceil' => '⌈', 'ldquo' => '“',
2111             'le' => '≤', 'lfloor' => '⌊', 'lowast' => '∗',
2112             'loz' => '◊', 'lrm' => '‎', 'lsaquo' => '‹',
2113             'lsquo' => '‘', 'lt' => '<', 'macr' => '¯',
2114             'mdash' => '—', 'micro' => 'µ', 'middot' => '·',
2115             'minus' => '−', 'Mu' => 'Μ', 'mu' => 'μ',
2116             'nabla' => '∇', 'nbsp' => ' ', 'ndash' => '–',
2117             'ne' => '≠', 'ni' => '∋', 'not' => '¬',
2118             'notin' => '∉', 'nsub' => '⊄', 'Ntilde' => 'Ñ',
2119             'ntilde' => 'ñ', 'Nu' => 'Ν', 'nu' => 'ν',
2120             'Oacute' => 'Ó', 'oacute' => 'ó', 'Ocirc' => 'Ô',
2121             'ocirc' => 'ô', 'OElig' => 'Œ', 'oelig' => 'œ',
2122             'Ograve' => 'Ò', 'ograve' => 'ò', 'oline' => '‾',
2123             'Omega' => 'Ω', 'omega' => 'ω', 'Omicron' => 'Ο',
2124             'omicron' => 'ο', 'oplus' => '⊕', 'or' => '∨',
2125             'ordf' => 'ª', 'ordm' => 'º', 'Oslash' => 'Ø',
2126             'oslash' => 'ø', 'Otilde' => 'Õ', 'otilde' => 'õ',
2127             'otimes' => '⊗', 'Ouml' => 'Ö', 'ouml' => 'ö',
2128             'para' => '¶', 'part' => '∂', 'permil' => '‰',
2129             'perp' => '⊥', 'Phi' => 'Φ', 'phi' => 'φ',
2130             'Pi' => 'Π', 'pi' => 'π', 'piv' => 'ϖ',
2131             'plusmn' => '±', 'pound' => '£', 'prime' => '′',
2132             'Prime' => '″', 'prod' => '∏', 'prop' => '∝',
2133             'Psi' => 'Ψ', 'psi' => 'ψ', 'quot' => '"',
2134             'radic' => '√', 'rang' => '〉', 'raquo' => '»',
2135             'rarr' => '→', 'rArr' => '⇒', 'rceil' => '⌉',
2136             'rdquo' => '”', 'real' => 'ℜ', 'reg' => '®',
2137             'rfloor' => '⌋', 'Rho' => 'Ρ', 'rho' => 'ρ',
2138             'rlm' => '‏', 'rsaquo' => '›', 'rsquo' => '’',
2139             'sbquo' => '‚', 'Scaron' => 'Š', 'scaron' => 'š',
2140             'sdot' => '⋅', 'sect' => '§', 'shy' => '­',
2141             'Sigma' => 'Σ', 'sigma' => 'σ', 'sigmaf' => 'ς',
2142             'sim' => '∼', 'spades' => '♠', 'sub' => '⊂',
2143             'sube' => '⊆', 'sum' => '∑', 'sup' => '⊃',
2144             'sup1' => '¹', 'sup2' => '²', 'sup3' => '³',
2145             'supe' => '⊇', 'szlig' => 'ß', 'Tau' => 'Τ',
2146             'tau' => 'τ', 'there4' => '∴', 'Theta' => 'Θ',
2147             'theta' => 'θ', 'thetasym' => 'ϑ','thinsp' => ' ',
2148             'THORN' => 'Þ', 'thorn' => 'þ', 'tilde' => '˜',
2149             'times' => '×', 'trade' => '™', 'Uacute' => 'Ú',
2150             'uacute' => 'ú', 'uarr' => '↑', 'uArr' => '⇑',
2151             'Ucirc' => 'Û', 'ucirc' => 'û', 'Ugrave' => 'Ù',
2152             'ugrave' => 'ù', 'uml' => '¨', 'upsih' => 'ϒ',
2153             'Upsilon' => 'Υ', 'upsilon' => 'υ', 'Uuml' => 'Ü',
2154             'uuml' => 'ü', 'weierp' => '℘', 'Xi' => 'Ξ',
2155             'xi' => 'ξ', 'Yacute' => 'Ý', 'yacute' => 'ý',
2156             'yen' => '¥', 'yuml' => 'ÿ', 'Yuml' => 'Ÿ',
2157             'Zeta' => 'Ζ', 'zeta' => 'ζ', 'zwj' => '‍',
2158             'zwnj' => '‌',
2159             );
2160              
2161             foreach my $tagname ( qw{
2162             br
2163             address applet area base bgsound blockquote body button caption center col
2164             colgroup dd del dir div dl dt fieldset form frame frameset h1 h2 h3 h4 h5
2165             h6 head hr html iframe ilayer input ins isindex label layer legend li link
2166             listing map menu meta multicol noframes nolayer noscript object ol
2167             optgroup option p param plaintext pre script select style table tbody td
2168             textarea tfoot th thead title tr ul xmp
2169             }) { $IsBlockMarkup{$tagname} = 1 unless exists $IsBlockMarkup{$tagname} }
2170              
2171             #..........................................................................
2172             # Now the function generators:
2173             @EXPORT = ( 'get_url' );
2174              
2175             $EXPORT_TAGS{'functions'} = \@EXPORT; # just for my own backward compat
2176              
2177             foreach my $method (
2178             do {
2179 8     8   94 no strict 'refs';
  8         17  
  8         1350  
2180             my %to_hide;
2181             @to_hide{@Hidies} = ();
2182             sort
2183             grep !exists $to_hide{$_} && !m/^rss_/s
2184             && m/^[a-z][_A-Za-z0-9]+$/s && defined &{"XML::RSS::SimpleGen::$_"},
2185             keys %XML::RSS::SimpleGen::
2186             }
2187             ) {
2188             my $function = "rss_$method";
2189             DEBUG > 20 and print "$method => $function\n";
2190             push @EXPORT, $function;
2191 8     8   40 no strict 'refs';
  8         17  
  8         618  
2192             unless( defined &{$function} ) {
2193             DEBUG > 20 and print " Defining $function => $function\n";
2194             *{$function} = sub {
2195 8     8   41 use strict 'refs';
  8         16  
  8         2088  
2196 28   33 28   307 unshift @_, ($RSS_obj ||= XML::RSS::SimpleGen::->new);
2197             goto &{
2198 28 50       45 $_[0]->can($method)
  28         242  
2199             || die "Where's $method for $_[0]?!"
2200             # insane error, should never happen, unless somebody
2201             # goes undefining existing methods!
2202             };
2203             };
2204             }
2205             }
2206              
2207 5     5 1 4003154 sub rss_new { $RSS_obj = XML::RSS::SimpleGen->new(@_) }
2208             DEBUG > 20 and print "Done compiling ", __PACKAGE__, ".\n";
2209              
2210             #--------------------------------------------------------------------------
2211             1;
2212             __END__