File Coverage

lib/WWW/Scrape/Mailman/RSS.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package WWW::Scrape::Mailman::RSS;
2              
3 1     1   33141 use warnings;
  1         3  
  1         36  
4 1     1   6 use strict;
  1         2  
  1         28  
5 1     1   1224 use WWW::Mechanize;
  1         283942  
  1         41  
6 1     1   1442 use HTML::TableExtract;
  1         14654  
  1         8  
7 1     1   3672 use XML::Twig;
  0            
  0            
8             use XML::RSS;
9             use HTML::TokeParser::Simple;
10             use Data::Dumper;
11              
12             =head1 NAME
13              
14             WWW::Scrape::Mailman::RSS - Parse mailman listserve archives, format as an rss feed
15              
16             =head1 VERSION
17              
18             Version 0.12
19              
20             =cut
21              
22             our $VERSION = '0.12';
23              
24             =head1 SYNOPSIS
25              
26             On some convenient server to host your rss feeds, schedule
27             the following script as a cron job at some appropriate interval:
28              
29             #!/usr/bin/perl
30             use strict;
31             use warnings;
32             use WWW::Scrape::Mailman::RSS;
33             my $feed = WWW::Scrape::Mailman::RSS->new(
34             'rss_version' => '0.91',
35             'debug' => 0, # try values from 1 to 5 for noisier output
36             );
37              
38             my %args = (
39             'info_url' => 'http://ga.greens.org/mailman/listinfo/gpga-news',
40             'base_url' => 'http://ga.greens.org/pipermail/gpga-news',
41             'list_name' => 'gpga-news',
42             'audience' => 'Greens',
43             'description' => 'News by, about and for Greens',
44             'cycles' => 2,
45             'output_file' => '/home/hesco/sites/news.tns.campaignfoundations.com/gpga_news_feed.html',
46             'rss_output' => '/home/hesco/sites/news.tns.campaignfoundations.com/gpga_news_feed.rss',
47             );
48              
49             $feed->render_feed(\%args);
50              
51             # create additional feeds for other lists here
52              
53             1;
54              
55             Then on your site, set your feed aggregator to point to:
56             http://news.tns.campaignfoundations.com/gpga_news_feed.rss
57              
58             =head1 METHODS
59              
60             =head2 WWW::Scrape::Mailman::RSS->new( \%defaults )
61              
62             Given a hashref of defaults which includes the key
63             'rss_version', construct and returns a $feed object, including
64             embedded objects for WWW::Mechanize, HTML::TableExtract,
65             XML::Twig and XML::RSS. If $defaults->{'debug'} is set, you
66             can see debugging output; with the noise level increasing as
67             you increment it from 1 to 5.
68              
69             =cut
70              
71             sub new {
72             my $class = shift;
73             my $defaults = shift;
74             my $self = {};
75              
76             if(!defined($defaults->{'debug'})){
77             $defaults->{'debug'} = 0;
78             }
79             if(!defined($defaults->{'rss_version'})){
80             $defaults->{'rss_version'} = '0.91';
81             }
82             if(!defined($defaults->{'feed_format'})){
83             $defaults->{'feed_format'} = 'html';
84             }
85             if(!defined($defaults->{'audience'})){
86             $defaults->{'audience'} = 'readers';
87             }
88             if(!defined($defaults->{'feed_type'})){
89             $defaults->{'feed_type'} = 'updates';
90             }
91             if(!defined($defaults->{'server'})){
92             $defaults->{'server'} = 'default';
93             }
94              
95             foreach my $key (keys %{$defaults}){
96             $self->{$key} = $defaults->{$key};
97             }
98              
99             $self->{'agent'} = WWW::Mechanize->new();
100             $self->{'te'} = HTML::TableExtract->new( headers => [ 'Archive', 'View by:', 'Downloadable version'] );
101             $self->{'twig'} = XML::Twig->new( );
102             $self->{'rss'} = XML::RSS->new( version => $defaults->{'rss_version'} );
103              
104             bless $self, $class;
105             return $self;
106             }
107              
108             =head2 $self->render_feed ( \%args )
109              
110             Given a $feed object and a hashref of arguments, including
111             list_name, info_url, description, base_url, cycles and
112             rss_output, download, process and render as an rss feed the
113             most recent $args->{'cycles'} cycles of a mailman list's
114             public archives.
115              
116             =cut
117              
118             sub render_feed {
119             my $self = shift;
120             my $args = shift;
121             print STDERR Dumper($self) if($self->{'debug'} > 4);
122             print STDERR Dumper($args) if($self->{'debug'} > 3);
123              
124             $self->{'rss'}->channel(
125             'title' => $args->{'list_name'},
126             'link' => $args->{'info_url'},
127             'description' => $args->{'description'}
128             );
129              
130             my $url = $args->{'base_url'};
131             $self->{'agent'}->get( $url );
132             my $html = $self->{'agent'}->content();
133             print STDERR Dumper($html) if($self->{'debug'} > 4);
134              
135             my $feed;
136             $self->{'te'}->parse($html);
137             my($month);
138             foreach my $ts ($self->{'te'}->tables){
139             my $month_count = 0;
140             foreach my $row ($ts->rows){
141             print STDERR 'Next row: ' . Dumper($row) if($self->{'debug'} > 2);
142             push @{$self->{'cycles'}},$row->[0];
143             $feed .= $self->_parse_mm_archive_cycle($args,$row->[0]);
144             $month_count++;
145             if($month_count >= $args->{'cycles'}){ last; }
146             }
147             }
148              
149             $self->{'rss'}->save( $args->{'rss_output'} );
150              
151             print "rss: $args->{'rss_output'}\n" if($self->{'debug'} > 0);
152             return $feed;
153             }
154              
155             =head2 $self->_parse_mm_archive_cycle ( \%args, '2010-September' );
156              
157             Given the arguments passed to ->render_feed, plus the cycle
158             name (month has been tested, week and quarter have not yet
159             been tested), get the appropriate date.html page from a mailman
160             list serve's archives, parse it and use the data collected to
161             add items to an rss feed of the data.
162              
163             =cut
164              
165             sub _parse_mm_archive_cycle {
166             my $self = shift;
167             my $args = shift;
168             my $base_url = $args->{'base_url'};
169             my $cycle = shift;
170             $cycle =~ s/:$//;
171              
172             my $feed;
173             my $url = "$base_url/$cycle/date.html";
174             print STDERR $url, "\n" if($self->{'debug'} > 0);
175             $self->{'agent'}->get( $url );
176             my $html = $self->{'agent'}->content();
177              
178             my $p = HTML::TokeParser::Simple->new( \$html );
179              
180             my @feed;
181             my $list_name = $args->{'list_name'};
182             my $count = 0;
183             STORY: while (my $token = $p->get_tag("li")) {
184             $count++;
185             my $a_tag = $p->get_tag("a");
186             print STDERR Dumper( \$a_tag ) if($self->{'debug'} > 3);
187             my $link = $a_tag->[3];
188             my $link_url = "$base_url/$cycle/" . $a_tag->[1]->{'href'};
189             my $text = $p->get_trimmed_text("a");
190             my $desc = '';
191             if($count == 2 && $self->{'debug'} > 3){ print STDERR Dumper( \$link, \$text ); }
192             if($text =~ m/Messages sorted by:/){ next STORY; }
193             if($text =~ m/More info on this list/){ next STORY; }
194             if($text =~ m/Archived on:/){ next STORY; }
195             if($text eq '[ thread ]'){ next STORY; }
196             $text =~ s,\[$list_name\] ,,;
197             print STDERR "$count : $text \n" if($self->{'debug'} > 2);
198             $link =~ s,HREF=",HREF="$base_url/$cycle/,;
199             $feed .= $link . "$text\n";
200             push @feed, ({ 'title' => $text, 'link' => $link_url, 'description' => $desc });
201             }
202              
203             my @feed_reversed = reverse @feed;
204             print STDERR Dumper( \@feed, \@feed_reversed ) if($self->{'debug'} > 3);
205             foreach my $item (@feed_reversed){
206             print STDERR Dumper( $item ) if($self->{'debug'} > 3);;
207             $self->{'rss'}->add_item(
208             'title' => $item->{'title'},
209             'link' => $item->{'link'},
210             'description' => $item->{'description'}
211             );
212             }
213              
214             $self->{'rss'}->save($args->{'rss_output'});
215             print "rss: $args->{rss_output}\n" if($self->{'debug'} > 0);
216              
217             return $feed;
218             }
219              
220             =head1 AUTHOR
221              
222             Hugh Esco, C<< >>
223              
224             =head1 BUGS
225              
226             * First item from each cycle is missing from feed.
227              
228             Please report any bugs or feature
229             requests to C
230             rt.cpan.org>, or through the web interface at
231             L.
232             I will be notified, and then you'll automatically be notified
233             of progress on your bug as I make changes.
234              
235             =head1 SUPPORT
236              
237             You can find documentation for this module with the perldoc command.
238              
239             perldoc WWW::Scrape::Mailman::RSS
240              
241             You can also look for information at:
242              
243             =over 4
244              
245             =item * RT: CPAN's request tracker
246              
247             L
248              
249             =item * AnnoCPAN: Annotated CPAN documentation
250              
251             L
252              
253             =item * CPAN Ratings
254              
255             L
256              
257             =item * Search CPAN
258              
259             L
260              
261             =back
262              
263              
264             =head1 ACKNOWLEDGEMENTS
265              
266             With appreciation to Adam Shand , whose
267             mm2rss.pl script served as inspiration for refactoring a
268             private module CF::mmFeedParser which I wrote years ago.
269             His code also introduced me to XML::RSS with which I had not
270             previously been familiar.
271              
272             =head1 COPYRIGHT & LICENSE
273              
274             Copyright 2010-2011 Hugh Esco.
275              
276             This program is free software; you can redistribute it and/or
277             modify it under the terms of the GNU General Public License
278             as published by the Free Software Foundation; version 2 dated
279             June, 1991 or at your option any later version.
280              
281             This program is distributed in the hope that it will be useful,
282             but WITHOUT ANY WARRANTY; without even the implied warranty of
283             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
284             GNU General Public License for more details.
285              
286             A copy of the GNU General Public License is available in the
287             source tree; if not, write to the Free Software Foundation,
288             Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
289              
290              
291             =cut
292              
293             1; # End of WWW::Scrape::Mailman::RSS