File Coverage

blib/lib/WWW/Topica.pm
Criterion Covered Total %
statement 169 191 88.4
branch 38 76 50.0
condition 13 34 38.2
subroutine 24 24 100.0
pod 9 9 100.0
total 253 334 75.7


line stmt bran cond sub pod time code
1             package WWW::Topica;
2              
3 1     1   938 use strict;
  1         2  
  1         40  
4 1     1   5 use Cwd;
  1         1  
  1         79  
5 1     1   14 use Carp qw(carp croak);
  1         2  
  1         56  
6 1     1   4 use Date::Parse;
  1         2  
  1         131  
7 1     1   1130 use Email::Date;
  1         13865  
  1         73  
8 1     1   896 use Email::Simple;
  1         5823  
  1         34  
9 1     1   11 use Email::Simple::Creator;
  1         3  
  1         22  
10 1     1   1062 use HTML::Entities;
  1         6192  
  1         108  
11 1     1   1055 use HTML::Scrubber;
  1         2394  
  1         34  
12 1     1   1180 use LWP::UserAgent;
  1         63360  
  1         29  
13 1     1   8 use URI;
  1         2  
  1         22  
14              
15 1     1   5 use vars qw($VERSION);
  1         2  
  1         52  
16              
17 1     1   601 use WWW::Topica::Index;
  1         2  
  1         22  
18 1     1   497 use WWW::Topica::Mail;
  1         3  
  1         22  
19 1     1   500 use WWW::Topica::Reply;
  1         3  
  1         1869  
20              
21             $VERSION = '0.6';
22             my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
23              
24              
25             =pod
26              
27             =head1 NAME
28              
29             WWW::Topica - read emails from a Topica mailing list
30              
31             =head1 SYNOPSIS
32              
33              
34             my $topica = WWW::Topica->new( list => 'mylist', login => 'mylogin', password => 'mypass' );
35            
36             while (my $mail = $topica->mail) {
37             Email::LocalDelivery->deliver($mail, 'mylist.mbox');
38             }
39              
40             =head1 DESCRIPTION
41              
42             This module screen scrapes the Topica website and fetches back RFC822 text representations
43             of all the mails posted to a given list. Where possible it fills in the from, to and date
44             fields. It should be noted that in some cases it's impossible to get both the sender name
45             and their email address.
46              
47             =head1 METHODS
48              
49             =cut
50              
51             =head2 new
52              
53             Takes three options, the list name, your login account and your password;
54              
55             You can also pass in C and C. C will print out
56             various debugging messages whereas C will use local files for
57             testing. C automatically sets C to C<1> unless debug is
58             explicitly set to C<0>.
59              
60              
61             Furthermore if you pass in a C option the parsing will start from
62             that offset. A C lets you set an upper bound.
63              
64             =cut
65              
66             sub new {
67 1     1 1 743 my $class = shift;
68 1         3 my %opts = @_;
69            
70 1 50       5 die "You must pass a list\n" unless defined $opts{list};
71             #die "You must pass an email\n" unless defined $opts{email};
72             #die "You must pass a password\n" unless defined $opts{password};
73              
74              
75 1   50     7 $opts{_next} = $opts{first} || 0;
76 1 50 33     7 $opts{debug} = $opts{local} if exists $opts{local} and not exists $opts{local};
77            
78 1         8 $opts{scrubber} = HTML::Scrubber->new( allow => [] );
79              
80 1         145 return bless \%opts, $class;
81              
82             }
83              
84             =head2 mail
85              
86             Returns a mail at a time
87              
88             Logs in automatically.
89              
90             =cut
91              
92              
93             sub mail {
94 301     301 1 3765 my $self = shift;
95              
96             # first time ever
97 301 100       1609 unless ($self->{_index})
98             {
99 1         20 $self->login;
100 1 50       4 print STDERR "Beginning to collect mails\n" if $self->{debug};
101             }
102              
103             # relog in after an hour
104             # TODO: untested
105 301 50       2551 unless ($self->{local}) {
106 0         0 my $time_diff = time() - $self->{_logged_in};
107 0 0       0 $self->login() if ($time_diff>60*60);
108             }
109              
110              
111             INDEX:
112             # need to get new message ids
113 301 100 100     3260 unless (defined $self->{_message_ids} && @{$self->{_message_ids}})
  300         1632  
114             {
115             # all over
116 4 100       14 return undef unless defined $self->{_next};
117              
118             # the last one we want
119 3 50 33     15 return undef if defined $self->{last} and $self->{_next} >= $self->{last};
120              
121             # end of first page
122             # return undef if $self->{debug} && $self->{_index};
123              
124 3         22 $self->{_index} = WWW::Topica::Index->new($self->fetch_index($self->{_next}));
125            
126 3 50       98 return undef unless $self->{_index};
127              
128 3         13 $self->{_next} = $self->{_index}->next();
129 3         12 $self->{_prev} = $self->{_index}->prev();
130 3         16 @{$self->{_message_ids}} = $self->{_index}->message_ids;
  3         32  
131              
132             }
133              
134 300         567 GET: my $mess_id = shift @{$self->{_message_ids}};
  300         1299  
135 300 50       878 goto INDEX unless defined $mess_id;
136              
137             # the mail has some information and also provides a link to the reply if we're logged in...
138 300         1811 my $mail_html = $self->fetch_mail($mess_id);
139 300 50       24747 goto GET unless $mail_html;
140 300         6391 my $mail = WWW::Topica::Mail->new($mail_html, $mess_id);
141              
142 300         709 my $reply;
143              
144             # which has other information (like the un-htmled mail and the email address) ...
145 300 50       1450 if ($mail->eto) {
146 300 50       1010 my $reply_html = $self->fetch_reply($mess_id,$mail->eto) if defined $mail->eto;
147 300 50       14574 goto GET unless $reply_html;
148 300         3891 $reply = WWW::Topica::Reply->new($reply_html, $mess_id, $mail->eto);
149             }
150            
151             # now build the rfc822 mail string
152 300         2558 return $self->build_rfc822($mail, $reply);
153              
154             }
155              
156             =head2 login
157              
158             Logs in to Topica and stashes the cookie.
159              
160             Called automatically by the first call to C.
161              
162             Builds the loader automatically.
163              
164             =cut
165              
166             sub login {
167 1     1 1 2 my $self = shift;
168              
169 1         4 $self->build_loader;
170              
171 1   33     10 my $anon = !defined $self->{email} || !defined $self->{password};
172              
173              
174 1 50       4 if ($anon) {
175 0         0 $self->{email} = $self->{password} = 'anonymous';
176             }
177              
178              
179              
180 1 50       6 print STDERR "Logging in using ".$self->{email}."/".$self->{password}."\n" if $self->{debug};
181              
182 1 50       5 return if $self->{local};
183              
184 0 0       0 if (!$anon) {
185 0         0 (undef) = $self->fetch_page("http://lists.topica.com/");
186 0         0 (undef) = $self->fetch_page("http://lists.topica.com/list.html");
187 0         0 (undef) = $self->fetch_page("http://lists.topica.com/perl/login.pl?email=".$self->{email}."&password=".$self->{password});
188             }
189              
190            
191              
192              
193              
194             # store when we logged in so that we can relog in again after an hour
195 0         0 $self->{_logged_in} = time;
196              
197              
198             }
199              
200             =head2 fetch_index
201              
202             Retrieve the html of the index page with the given offset.
203              
204             =cut
205              
206             sub fetch_index {
207 3     3 1 4 my $self = shift;
208 3         6 my $offset = shift;
209 3         5 my $list = $self->{list};
210            
211 3 50       10 print STDERR "Fetching index $offset of list ${list}\n" if $self->{debug};
212              
213 3         15 my $url = "http://lists.topica.com/lists/${list}/read?sort=d&start=$offset";
214            
215 3 50       10 if ($self->{local}) {
216 3         20032 $url = "file://".cwd."/t/local_files/";
217 3 100       107 if (0 == $offset) {
    100          
    50          
218 1         7 $url .= "list_first.html";
219             } elsif (100 == $offset) {
220 1         17 $url .= "list_middle.html";
221             } elsif (200 == $offset) {
222 1         9 $url .= "list_last.html";
223             }
224            
225             }
226            
227 3         45 return $self->fetch_page($url);
228              
229             }
230              
231              
232             =head2 fetch_mail
233              
234             Retrieve the html of a the message page with the given id.
235              
236             =cut
237              
238             sub fetch_mail {
239 300     300 1 483 my $self = shift;
240 300         570 my $id = shift;
241 300         638 my $list = $self->{list};
242              
243              
244 300 50       1147 print STDERR "\tFetching mail $id\n" if $self->{debug};
245            
246 300         961 my $url = "http://lists.topica.com/lists/${list}/read/message.html?mid=$id";
247            
248 300 50       869 if ($self->{local}) {
249 300         2353631 $url = "file://".cwd."/t/local_files/mail.html";
250             }
251            
252 300         20480 return $self->fetch_page($url);
253            
254             }
255              
256              
257             =head2 fetch_reply
258              
259             Retrieve the html of a the reply page with the given id and eto.
260              
261             =cut
262              
263             sub fetch_reply {
264 300     300 1 861 my $self = shift;
265 300         596 my $id = shift;
266 300         418 my $eto = shift;
267 300         909 my $list = $self->{list};
268              
269 300 50       2225 print STDERR "\t\tFetching reply $id - $eto\n" if $self->{debug};
270              
271              
272 300         1404 my $url = "http://lists.topica.com/lists/${list}/read/post.html?mode=replytosender&mid=$id&eto=$eto";
273            
274 300 50       1049 if ($self->{local}) {
275 300         2357623 $url = "file://".cwd."/t/local_files/reply.html";
276             }
277            
278 300         12816 return $self->fetch_page($url);
279              
280              
281             }
282              
283              
284             =head2 build_rfc822
285              
286             Given a C object and a C object
287             build up the text of an RFC822 compliant email.
288              
289             =cut
290              
291             sub build_rfc822 {
292 300     300 1 704 my $self = shift;
293 300         630 my $mail = shift;
294 300         497 my $reply = shift;
295              
296 300         987 my $list = $self->{list};
297              
298 300         1519 my $mid = $mail->id;
299              
300 300         1919 my $name = decode_entities($mail->from);
301 300         1561 my $email = "";
302 300 50       1040 if (defined $reply) {
303 300         1275 $email = decode_entities($reply->email);
304             } else {
305 0         0 $email = "${list}\@topica.com";
306             }
307              
308             # we may have been confused and got name and email mixed up
309 300 50 33     2477 if ($name =~ /@/ && $email !~ /@/) {
310 0         0 my $tmp = $name;
311 0         0 $name = $email;
312 0         0 $email = $tmp;
313             }
314              
315             # try and build a sane From: line
316 300         529 my $from;
317 300 50 33     4561 if ($name ne $email && $email =~ /@/) {
    0          
318 300         1061 $from = "$name <$email>";
319             } elsif ($email =~ /@/) {
320 0         0 $from = "<$email>";
321             } else {
322 0         0 $from = "$name <${list}\@topica.com>";
323             }
324            
325             # get the subject from somewhere - mail preferably because then it
326             # doesn't have the Re: which we don't know whether to strip out or not
327 300         2550 my $subject = $mail->subject;
328 300 50 33     7000 $subject = $reply->subject if defined $reply && $subject =~ /^\s*$/;
329              
330             # remove newlines
331 300         827 $subject =~ s/[\n\r]//gs;
332              
333             # strip out html
334 300         727 $subject =~ s!
\s+!!sg; # hack
335 300         3795 $subject = $self->{scrubber}->scrub($subject);
336              
337 300         56534 $subject = decode_entities($subject);
338              
339             # time
340 300   33     1435 my $time = str2time(decode_entities($mail->date)) || gmtime;
341            
342              
343             # message-id
344 300         214823 my $message_id = "${mid}\@lists.topica.com";
345              
346              
347             # time to build the mail
348             # we should probably use Email::Simple::Creator for this
349 300         1114 my $string = "";
350              
351 300         610 my $body = "";
352 300 50 33     3280 if ($reply && defined $reply->body) {
353 300         1300 $body = $reply->body;
354             }else {
355 0   0     0 $body = $self->{scrubber}->scrub($mail->body) || "";
356             }
357              
358 300         5624 $string .= "Date: ".format_date($time)."\n";
359 300         43139 $string .= "To: ${list}\@topica.com\n";
360 300         1609 $string .= "From: $from\n";
361 300         872 $string .= "Message-ID: $message_id\n";
362 300         1304 $string .= "X-TopicaMailUrl: http://lists.topica.com/lists/${list}/read/message.html?mid=${mid}\n";
363 300 50       979 if ($reply) {
364 300         1240 my $rid = $reply->id;
365 300         1242 my $eto = $reply->eto;
366 300         2035 $string .= "X-TopicaReplyUrl: http://lists.topica.com/lists/${list}/read/post.html?mode=replytosender&mid=${rid}&eto=${eto}\n";
367             }
368 300         1188 $string .= "Subject: $subject\n";
369 300         1701 $string .= "\n$body\n\n";
370              
371 300         8973 return $string;
372             }
373              
374             =head2 build_loader
375              
376             Set up the LWP::UserAgent object used to fetch pages.
377              
378             =cut
379              
380             sub build_loader {
381 1     1 1 1 my $self = shift;
382            
383 1         7 my $ua = new LWP::UserAgent( keep_alive => 1, timeout => 30, agent => $USER_AGENT, );
384              
385              
386             # setting it in the 'new' seems not to work sometimes
387 1         5472 $ua->agent($USER_AGENT);
388             # for some reason this makes stuff work
389 1         62 $ua->max_redirect( 0 );
390             # cookies!
391 1         17 $ua->cookie_jar( {} );
392              
393 1         8287 $self->{_ua} = $ua;
394             }
395              
396             =head2 fetch_page
397              
398             Utility function for getting a page with various niceties.
399              
400             =cut
401              
402             sub fetch_page {
403 603     603 1 5194 my $self = shift;
404 603         1401 my $url = shift;
405              
406             # print STDERR "\tfetching $url\n" if $self->{debug};
407              
408             # make a full set of headers
409 603         30155 my $h = new HTTP::Headers(
410             'Host' => "lists.topica.com",
411             'User-Agent' => $USER_AGENT,
412             'Referer' => $url,
413             'Accept' => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1',
414             'Accept-Language' => 'en-us,en;q=0.5',
415             'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
416             #'Accept-Encoding' => 'gzip,deflate',
417             'Keep-Alive' => '300',
418             'Connection' => 'keep-alive',
419            
420             );
421            
422 603         346058 $h->referer("$url");
423              
424              
425              
426 603         48825 my $request = HTTP::Request->new ( 'GET', $url, $h );
427 603         319225 my $response;
428              
429 603         1228 my $times = 0;
430              
431             # LWP should be able to do this but seemingly fails sometimes
432 603         3838 while ($times++<3) {
433 603         17426 $response = $self->{_ua}->request($request);
434 603 50       2681250 last if $response->is_success;
435 0 0       0 if ($response->is_redirect) {
436 0         0 $url = URI->new($response->header("Location"));
437 0         0 $url = $url->abs("http://lists.topica.com");
438 0         0 $h->referer("$url");
439 0         0 $request = HTTP::Request->new ( 'GET', $url, $h );
440             }
441             }
442              
443 603 0 33     18623 if (!$response->is_success && !$response->is_redirect) {
444 0         0 carp "Failed to retrieve $url";
445 0         0 return undef;
446             }
447              
448 603         7062 return $response->content;
449              
450             }
451              
452             1;
453              
454             =head1 AUTHOR
455              
456             Simon Wistow
457              
458             =head1 COPYRIGHT
459              
460             Copyright (c) 2004, Simon Wistow
461              
462             =cut
463              
464              
465