File Coverage

blib/lib/WWW/Wikevent/Bot.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             package WWW::Wikevent::Bot;
2             #
3             # Copyright 2007 Mark Jaroski
4             #
5             # This program is free software; you can redistribute it and/or modify
6             # it under the terms of either:
7             #
8             # a) the GNU General Public License as published by the Free Software
9             # Foundation; either version 3, or (at your option) any later version,
10             # or
11             # b) the "Artistic License" which comes with this Kit.
12             #
13             # This program is distributed in the hope that it will be useful, but
14             # WITHOUT ANY WARRANTY; without even the implied warranty of
15             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
16             # GNU General Public License or the Artistic License for more details.
17             #
18             # You should have received a copy of the Artistic License with this Kit,
19             # in the file named "Artistic".
20             #
21             # You should also have received a copy of the GNU General Public License
22             # along with this program in the file named "Copying". If not, write to
23             # the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
24             # Boston, MA 02111-1307, USA or visit their web page on the internet at
25             # http://www.gnu.org/copyleft/gpl.html.
26              
27 1     1   80836 use warnings;
  1         2  
  1         35  
28 1     1   6 use strict;
  1         2  
  1         37  
29 1     1   462 use WWW::Mediawiki::Client;
  0            
  0            
30             use WWW::Wikevent::Event;
31             use Date::Format;
32             use Digest::MD5 qw(md5_hex);
33             use Encode;
34             use utf8;
35              
36             use base 'Exporter';
37             our %EXPORT_TAGS = (
38             options => [qw(OPT_YES OPT_NO OPT_DEFAULT OPT_KEEP)],
39             );
40             our @EXPORT_OK = map { @{$EXPORT_TAGS{$_}} } keys %EXPORT_TAGS;
41              
42             our $VERSION = 0.2.0;
43              
44             =head1 NAME
45              
46             WWW::Wikevent::Bot
47              
48             =cut
49              
50             =head1 SYNOPSIS
51              
52             use WWW::Wikevent::Bot;
53             use HTML::TreeBuilder;
54             use utf8;
55              
56             my $bot = WWW::Wikevent::Bot->new();
57             $bot->name( 'HideoutBot' );
58             $bot->url( 'http://www.hideoutchicago.com/schedule.html' );
59             $bot->sample( 'sample.html' );
60             $bot->encoding( 'utf8' );
61              
62             $bot->parser( sub {
63             my ( $bot, $html ) = @_;
64            
65             # Use HTML::TreeBuilder and HTML::Element, or if you prefer
66             # HTML::TokeParser to parse the HTML down to whatever elements
67             # contains events, then ...
68             foreach my $container ( @event_containers ) {
69             my $event = $bot->add_event();
70              
71             # build up the event using methods of L
72             }
73              
74             # Figure out the next page to scrape (not needed if you are parsing
75             # by month) and set
76              
77             $bot->url( $next_page_to_scrape );
78             });
79            
80             $bot->scrape();
81             $bot->upload();
82            
83             =cut
84              
85             =head1 DESCRIPTION
86              
87             WWW::Wikevent::Bot is a package which will help you write scraper scripts
88             for gathering events from venue and artist websites and for inclusion in
89             the Free content events compendium, Wikevent.
90              
91             The module takes care of the tedium of interaction with the website, and
92             leaves to you the fun work of writing the scraper subroutine for the venue
93             or artist you are interested in.
94              
95             =cut
96              
97             =head1 CONSTANTS
98              
99             item $SEEN_FILE
100              
101             =cut
102              
103             my $SEEN_FILE = '.processed_events';
104              
105             my $Mvs = WWW::Mediawiki::Client->new();
106             my $Ua = LWP::UserAgent->new();
107              
108             my @Events;
109              
110              
111             =head1 CONSTRUCTORS
112              
113             =cut
114              
115             =head2 new
116              
117             Creates a new bot object.
118              
119             =cut
120              
121             sub new {
122             my $pkg = shift;
123             my %init = @_;
124             my $self = bless {};
125             foreach my $key ( keys %init ) {
126             if ( $key eq 'name' ) {
127             $self->name( $init{$key} );
128             } elsif ( $key eq 'parser' ) {
129             $self->parser( $init{$key} );
130             } elsif ( $key eq 'url' ) {
131             $self->url( $init{$key} );
132             }
133             }
134             $self->{'events'} = [];
135             $self->{'months'} = 3;
136             $self->{'last_url'} = '';
137             $self->load_remembered_events();
138             return $self;
139             }
140              
141             =head1 ACCESSORS
142              
143             =cut
144              
145             =head2 name
146            
147             $bot->name( $bot_name );
148              
149             The name of your bot.
150              
151             This setting will be used to control where your bot will submit information
152             about itself and the list of events it scrapes on each run.
153              
154             =cut
155              
156             sub name {
157             my ( $self, $name ) = @_;
158             if ( $name ) {
159             $self->{'name'} = $name;
160             $self->user_dir( "User:$self->{'name'}" );
161             $self->user_page( "User:$self->{'name'}.wiki" );
162             $self->shows_page( "User:$self->{'name'}/Shows.wiki" );
163             }
164             return $self->{'name'};
165             }
166              
167             =head2 events
168              
169             my @events = $bot->events()
170              
171             or
172              
173             my $event_ref = $bot->events()
174              
175             The list of events which this bot has scraped (so far).
176              
177             =cut
178              
179             sub events {
180             my $self = shift;
181             return wantarray ? @{$self->{'events'}} : $self->{'events'};
182             }
183              
184             =head2 sample
185              
186             $bot->sample( 'somepage.html' );
187              
188             A local file containing a sample page to scrape while you are building and
189             debugging your parser subroutine.
190              
191             =cut
192              
193             sub sample {
194             my ( $self, $sample ) = @_;
195             $self->{'sample'} = $sample if $sample;
196             return $self->{'sample'};
197             }
198              
199             =head2 charset
200            
201             $bot->charset( 'utf8' );
202              
203             The charset of the target site/page.
204              
205             Sometimes the charset is detected incorrectly, or even set incorrectly in
206             venue and artist webpages. This lets you override.
207              
208             =cut
209              
210             sub charset {
211             my ( $self, $charset ) = @_;
212             $self->{'charset'} = $charset if $charset;
213             return $self->{'charset'};
214             }
215              
216             =head2 encoding
217              
218             An alias for charset, if you prefer.
219              
220             =cut
221              
222             sub encoding {
223             my ( $self, $charset ) = @_;
224             $self->{'charset'} = $charset if $charset;
225             return $self->{'charset'};
226             }
227              
228             =head2 url
229              
230             $bot->url( 'http://venue.com/schedule.html' );
231              
232             The next URL to scrape.
233              
234             Initially you should set this to the first page which your scraper bot
235             should look at. Afterwords if there are more pages to scrape you'll set it
236             again in your parser subroutine.
237              
238             If the site you're scraping has calendar pages with elements of the date in
239             the URL you can put Date::Format placeholders in the your URL string, as
240             in:
241              
242             $bot->url( 'http://venue.com/calendar.html?year=%Ymonth=%L' );
243              
244             .. and your bot will scrape C months ahead from the current month,
245             whatever that is. You can of course override this behaviour by specifying
246             a new URL to parse in the parser subroutine, but then you'll have to do all
247             of the date calculation yourself.
248              
249             =cut
250              
251             sub url {
252             my ( $self, $url ) = @_;
253             $self->{'url'} = $url if $url;
254             return $self->{'url'};
255             }
256              
257             =head2 months
258              
259             $bot->months( $int );
260             my $int = $bot->months();
261              
262             The number of months to scrape if C is a Date::Format specification.
263              
264             Defaults to 3.
265              
266             =cut
267              
268             sub months {
269             my ( $self, $months ) = @_;
270             $self->{'months'} = $months if $months;
271             return $self->{'months'};
272             }
273              
274             sub parser {
275             my ( $self, $parser ) = @_;
276             $self->{'parser'} = $parser if $parser;
277             return $self->{'parser'};
278             }
279              
280             =head2 user_dir
281              
282             my $dir = $bot->user_dir( $dir );
283              
284             The directory to which your events will be dumped.
285              
286             Normally this is set as a side-effect of setting the C accessor,
287             however it can be optionally set to something else I setting
288             C.
289              
290             =cut
291              
292             sub user_dir {
293             my ( $self, $user_dir ) = @_;
294             $self->{'user_dir'} = $user_dir if $user_dir;
295             return $self->{'user_dir'};
296             }
297              
298             =head2 user_page
299              
300             my $page = $bot->user_page( $page );
301              
302             The page on which information about your bot is to be found.
303              
304             Normally this is set as a side-effect of setting the C accessor,
305             however it can be optionally set to something else I setting
306             C.
307              
308             =cut
309              
310             sub user_page {
311             my ( $self, $user_page ) = @_;
312             $self->{'user_page'} = $user_page if $user_page;
313             return $self->{'user_page'};
314             }
315              
316             =head2 shows_page
317              
318             my $page = $bot->shows_page( $page );
319              
320             The page to which events scraped by your bot will be uploaded.
321              
322             Normally this is set as a side-effect of setting the C accessor,
323             however it can be optionally set to something else I setting
324             C.
325              
326             =cut
327              
328             sub shows_page {
329             my ( $self, $shows_page ) = @_;
330             $self->{'shows_page'} = $shows_page if $shows_page;
331             return $self->{'shows_page'};
332             }
333              
334             =head1 METHODS
335              
336             =cut
337              
338             =head2 add_event
339              
340             my $e = $bot->add_event();
341              
342             Create a new event and return it.
343              
344             This is a convenience method which both creates a new event, adds it to
345             C list (see above) and returns a refernce to which you may
346             manipulate as necessary.
347              
348             =cut
349              
350             sub add_event {
351             my $self = shift;
352             my $event = WWW::Wikevent::Event->new();
353             push @{$self->{'events'}}, $event;
354             return $event;
355             }
356              
357             =head2 parse
358              
359             my @events = $bot->parse( $html );
360              
361             or
362              
363             my $events_ref = $bot->parse( $html );
364              
365             Run the user supplised C subroutine against the argument HTML and
366             return any events found. This is used internally by C.
367              
368             =cut
369              
370             sub parse {
371             my ( $self, $html ) = @_;
372             $self->parser->( $self, $html );
373             return wantarray ? @{$self->events()} : $self->events();
374             }
375              
376             =head2 check_allowed
377              
378             $bot->check_allowed();
379              
380             Check the user page of this bot to see if it is currently allowed to run.
381             This will be indicated by the text:
382              
383             run = true
384              
385             at the top of the page. If that text is present return true, other wise
386             die with an error. This method is called internally by C so you
387             don't have to call it, but you do have to make sure that the above text
388             appears on the bot's user page.
389              
390             =cut
391              
392             sub check_allowed {
393             my $self = shift();
394             $Mvs->do_login();
395             # check to see if run = true
396             $Mvs->do_update( $self->{'user_page'} );
397             open UP, $self->{'user_page'};
398             my $go = undef;
399             while ( ) {
400             next if m{^\s*$};
401             last if m{^\s*==[^=]*==\s*$};
402             if ( m{^\s*run\s*=\s*(\w+)\s*$} ) {
403             $go = $1;
404             }
405             }
406             die "Not allowed to run, according to user page."
407             unless $go;
408             close UP;
409             }
410              
411             sub create_account {
412             }
413              
414             =head2 scrape_sample
415              
416             $bot->scrape_sample();
417              
418             Runs the C against the supplied C HTML page.
419              
420             =cut
421              
422             sub scrape_sample {
423             my $self = shift;
424             my $file = $self->sample();
425             my $in;
426             if ( $self->charset() ) {
427             my $c = $self->charset();
428             open( $in, "<:encoding($c)", $file )
429             or die "Could not open sample file '$file'. $!\n";
430             } else {
431             open( $in, "<", $file )
432             or die "Could not open sample file '$file'. $!\n";
433             }
434             local $/ = undef;
435             my $html = <$in>;
436             close $in;
437             $self->events( $self->parse( $html ) );
438             }
439              
440             sub find_month_urls {
441             my $self = shift;
442             my @calendar_urls;
443             # from perlfunc:
444             # my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
445             # thus: 0 1 2 3 4 5 6 7 8
446             my @lt = localtime(time);
447             for ( my $i = 0; $i < $self->months(); $i++ ) {
448             my $url = strftime( $self->url(), @lt );
449             $lt[4]++;
450             if ( $lt[4] > 11 ) {
451             $lt[4] = 0;
452             $lt[5]++;
453             }
454             push @calendar_urls, $url;
455             }
456             return wantarray ? @calendar_urls : \@calendar_urls;
457             }
458              
459             =head2 scrape
460              
461             $bot->scrape();
462              
463             Starts scraping at the supplied C and continues as long as C
464             changes.
465            
466             =cut
467              
468             sub scrape {
469             my $self = shift;
470             if ( $self->url() =~ m{%[A-z]} ) {
471             foreach my $url ( $self->find_month_urls() ) {
472             $self->scrape_page( $url );
473             }
474             } else {
475             while ( $self->url() and ( $self->url() ne $self->{last_url} ) ) {
476             $self->{last_url} = $self->url();
477             $self->scrape_page( $self->url() );
478             }
479             }
480             }
481              
482             =head2 scrape_page
483              
484             $bot->scrape_page( $url );
485              
486             Scrapes a single page of HTML found at the given URL. This method is
487             called internally by C.
488              
489             =cut
490              
491             sub scrape_page {
492             my ( $self, $url ) = @_;
493             my $html;
494             print "fetching $url\n"; #FIXME
495             my $res = $Ua->get( $url );
496             die "couldn't fetch $url." unless $res->is_success();
497             if ( $self->charset() ) {
498             $html = $res->decoded_content( charset => $self->charset() );
499             } else {
500             $html = $res->decoded_content();
501             }
502             $self->parse( $html );
503             }
504              
505             =head2 dump
506              
507             $bot->dump();
508              
509             Dumps the contents of C as text to standard out.
510              
511             =cut
512              
513             sub dump {
514             my $self = shift;
515             foreach my $e ( $self->events() ) {
516             print $e;
517             }
518             return 1;
519             }
520              
521             =head2 remember
522            
523             $bot->remember( $event );
524              
525             Records an md5sum of the given event, so as to not repeat it again when
526             running C.
527              
528             =cut
529              
530             sub remember {
531             my ( $self, $event ) = @_;
532             my $token = md5_hex( encode( "iso-8859-1", $event->to_string() ) );
533             $self->{known_events}->{$token} = 1;
534             open OUT, ">>$SEEN_FILE"
535             or die 'could not open file for seen events';
536             print OUT "$token\n";
537             close OUT;
538             }
539              
540             =head2 load_remembered_events
541              
542             $bot->load_remembered_events
543              
544             Loads in the md5sums of previously Ced events. This is called
545             internally by C so it's unlikely that you will need to call it.
546              
547             =cut
548              
549             sub load_remembered_events {
550             my ( $self, $event ) = @_;
551             unless ( open SEEN, $SEEN_FILE ) {
552             open SEEN, ">$SEEN_FILE"
553             or die 'could not open file for seen events';
554             print SEEN "\n";
555             close SEEN;
556             open SEEN, $SEEN_FILE
557             or die 'could not open file for seen events';
558             }
559             while ( my $token = ) {
560             chomp( $token );
561             $self->{known_events}->{$token} = 1;
562             }
563             close SEEN;
564             }
565              
566             =head2 is_new
567              
568             my $bool = $bot->is_new( $event );
569              
570             Checks to see if the md5sum of an event is in our list of Ced
571             events.
572              
573             =cut
574              
575             sub is_new {
576             my ( $self, $event ) = @_;
577             my $token = md5_hex( encode( "iso-8859-1", $event->to_string() ) );
578             if ( $self->{known_events}->{$token} ) {
579             return 0;
580             } else {
581             return 1;
582             }
583             }
584              
585             =head2 dump_to_file
586            
587             $bot->dump_to_file
588              
589             Prints out the events in their final form to the appropriate .wiki file for
590             upload to the bot's event page. This is called internally by C but
591             is also useful for the last stages of writing and debugging your bot.
592              
593             =cut
594              
595             sub dump_to_file {
596             my $self = shift;
597             if ( ! -e $self->user_dir() ) {
598             mkdir $self->user_dir()
599             or die "could not make directory: " . $self->user_dir() . "\n";
600             }
601             open my $out, ">:encoding(utf-8)", $self->shows_page()
602             or die "could not open wiki file: " . $self->shows_page() . "\n";
603             foreach my $e ( $self->events() ) {
604             if ( $self->is_new( $e ) ) {
605             print $out $e;
606             $self->remember( $e );
607             }
608             }
609             close $out;
610             return 1;
611             }
612              
613             =head2 upload
614              
615             $bot->upload();
616              
617             This is the method which interacts with the Wikevent server, first checking
618             to see if the bot is allowed to proceed, then doing an update, printing out
619             the bot's C and then proceeding to do the upload.
620              
621             =cut
622              
623             sub upload {
624             my $self = shift;
625             $self->check_allowed();
626             $Mvs->do_update( $self->shows_page() );
627             $self->dump_to_file();
628             $Mvs->watch(0);
629             $Mvs->minor_edit(0);
630             $Mvs->commit_message( "scraping results" );
631             $Mvs->do_commit( $self->shows_page() );
632             }
633              
634             1;
635              
636             __END__