File Coverage

blib/lib/Sport/Analytics/NHL/Scraper.pm
Criterion Covered Total %
statement 109 120 90.8
branch 20 28 71.4
condition 19 33 57.5
subroutine 19 19 100.0
pod 4 4 100.0
total 171 204 83.8


line stmt bran cond sub pod time code
1             package Sport::Analytics::NHL::Scraper;
2              
3 25     25   106274 use v5.10.1;
  25         90  
4 25     25   112 use warnings FATAL => 'all';
  25         74  
  25         800  
5 25     25   116 use strict;
  25         70  
  25         537  
6 25     25   105 use experimental qw(smartmatch);
  25         41  
  25         134  
7 25     25   1916 use parent 'Exporter';
  25         457  
  25         124  
8              
9 25     25   11880 use Time::HiRes qw(time usleep);
  25         27866  
  25         96  
10 25     25   5270 use POSIX qw(strftime);
  25         14103  
  25         180  
11              
12 25     25   6264 use JSON;
  25         26234  
  25         178  
13 25     25   11478 use LWP::Simple;
  25         1167750  
  25         171  
14              
15 25     25   8171 use Sport::Analytics::NHL::LocalConfig;
  25         52  
  25         2643  
16 25     25   147 use Sport::Analytics::NHL::Config;
  25         44  
  25         3907  
17 25     25   1213 use Sport::Analytics::NHL::Util;
  25         60  
  25         1788  
18 25     25   1291 use Sport::Analytics::NHL::Tools;
  25         55  
  25         3948  
19 25     25   10605 use Sport::Analytics::NHL::Report::BS;
  25         64  
  25         22801  
20              
21             =head1 NAME
22              
23             Sport::Analytics::NHL::Scraper - Scrape and crawl the NHL website for data
24              
25             =head1 SYNOPSIS
26              
27             Scrape and crawl the NHL website for data
28              
29             use Sport::Analytics::NHL::Scraper;
30             my $schedules = crawl_schedule({
31             start_season => 2016,
32             stop_season => 2017
33             });
34             ...
35             my $contents = crawl_game(
36             { season => 2011, stage => 2, season_id => 0001 }, # game 2011020001 in NHL accounting
37             { game_files => [qw(BS PL)], retries => 2 },
38             );
39              
40             =head1 IMPORTANT VARIABLE
41              
42             Variable @GAME_FILES contains specific definitions for the report types. Right now only the boxscore javascript has any meaningful non-default definitions; the PB feed seems to have become unavailable.
43              
44             =head1 FUNCTIONS
45              
46             =over 2
47              
48             =item C<scrape>
49              
50             A wrapper around the LWP::Simple::get() call for retrying and control.
51             Arguments: hash reference containing
52             * url => URL to access
53             * retries => Number of retries
54             * validate => sub reference to validate the download
55             Returns: the content if both download and validation are successful
56             undef otherwise.
57              
58             =item C<crawl_schedule>
59              
60             Crawls the NHL schedule. The schedule is accessed through a minimalistic live API first (only works for post-2010 seasons), then, if that download fails, through the general statsapi /api/v1/schedule endpoint.
61              
62             Arguments: hash reference containing
63             * start_season => the first season to crawl
64             * stop_season => the last season to crawl
65             Returns: hash reference of seasonal schedules where seasons are the keys, and decoded JSONs are the values.
66              
67             =item C<get_game_url_args>
68              
69             Sets the arguments to populate the game URL for a given report type and game
70             Arguments: document name, currently one of qw(BS PB RO ES GS PL)
71             game hashref containing
72             * season => YYYY
73             * stage => 2|3
74             * season ID => NNNN
75             Returns: a configured list of arguments for the URL.
76              
77             =item C<crawl_game>
78              
79             Crawls the data for the given game
80             Arguments: game data as hashref:
81             * season => YYYY
82             * stage => 2|3
83             * season ID => NNNN
84             options hashref:
85             * game_files => hashref of types of reports that are requested
86             * force => 0|1 force overwrite of files already present in the system
87             * retries => N number of the retries for every get call
88              
89             =back
90              
91             =cut
92              
# URL templates (sprintf patterns) for the resources this module downloads.
# Seasonal schedule from the live feed; filled with (season, season + 1).
our $SCHEDULE_JSON     = 'http://live.nhle.com/GameData/SeasonSchedule-%s%s.json';
# Fallback schedule API; filled with (start date, stop date).
our $SCHEDULE_JSON_API = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=%s&endDate=%s';
# Default pattern for the HTML game reports; filled with
# (season, season + 1, report name, stage, season ID).
our $HTML_REPORT_URL   = 'http://www.nhl.com/scores/htmlreports/%d%d/%s%02d%04d.HTM';
#our $PLAYER_URL = 'https://statsapi.web.nhl.com/api/v1/people/%d?expand=person.stats&stats=yearByYear,yearByYearPlayoffs&expand=stats.team&site=en_nhl';
#our $SUPP_PLAYER_URL = "https://www.nhl.com/player/%d";
#our $ROTOWORLD_URL = 'http://www.rotoworld.com/teams/injuries/nhl/all/';
#our %ROTO_CENSUS = (
# 'CHRIS TANEV' => 'CHRISTOPHER TANEV',
#);

# Per-report definitions used by crawl_game(). Recognized keys:
#   name      - report type identifier
#   pattern   - sprintf URL template (crawl_game falls back to $HTML_REPORT_URL)
#   extension - extension of the stored file (crawl_game falls back to 'html')
#   validate  - optional sub receiving the downloaded content; a false return
#               makes scrape() discard the download and retry
#   disabled  - true to make crawl_game() skip the report entirely
our @GAME_FILES = (
    {
        name      => 'BS',
        pattern   => 'https://statsapi.web.nhl.com/api/v1/game/%s/feed/live',
        extension => 'json',
        validate  => sub {
            my $json = shift;
            # Nothing to parse: report failure instead of feeding undef or
            # an empty string to the boxscore parser.
            return 0 unless defined $json && length $json;
            my $bs    = Sport::Analytics::NHL::Report::BS->new($json);
            my $plays = $bs->{json}{liveData}{plays}{allPlays};
            # Valid only when the feed carries at least one play; a feed
            # without the allPlays list counts as invalid rather than dying
            # on an undefined array dereference.
            return ref $plays eq 'ARRAY' ? scalar @{$plays} : 0;
        },
    },
    {
        name      => 'PB',
        pattern   => 'http://live.nhle.com/GameData/%s%s/%s/PlayByPlay.json',
        extension => 'json',
        disabled  => 1,
    },
    { name => 'ES' },
    { name => 'GS' },
    { name => 'PL' },
    { name => 'RO' },
);

our @EXPORT = qw(
    crawl_schedule crawl_game
);

# Number of download attempts scrape() makes when the caller gives none.
our $DEFAULT_RETRIES = 3;
131              
# scrape(\%opts)
#
# Download a URL with LWP::Simple::get(), retrying until a download passes
# validation or the retry budget is exhausted.
#
# Options (hash reference, modified in place with the defaults):
#   url      - URL to fetch (required; dies when missing)
#   retries  - maximum number of attempts (default: $DEFAULT_RETRIES)
#   validate - sub reference called with the downloaded content; a false
#              return discards the download (default: accept everything)
#
# Returns the content on success, undef otherwise.
sub scrape ($) {

    my $opts = shift;
    die "Can't scrape without a URL" unless defined $opts->{url};

    $opts->{retries}  ||= $DEFAULT_RETRIES;
    $opts->{validate} ||= sub { 1 };

    my $now = time;
    my $r   = 0;
    my $content;
    while (! $content && $r++ < $opts->{retries}) {
        debug("Trying ($r/$opts->{retries}) $opts->{url}...");
        $content = get($opts->{url});
        # A failed download yields undef. Do not hand that to the validator:
        # it has nothing to validate, and a validator that parses its input
        # could die on it. Just move on to the next attempt.
        if (! defined $content) {
            verbose("$opts->{url} could not be downloaded");
            next;
        }
        unless ($opts->{validate}->($content)) {
            verbose("$opts->{url} failed validation, retrying");
            $content = undef;
        }
    }
    debug(sprintf("Retrieved in %.3f seconds", time() - $now)) if $content;
    $content;

}
155              
# crawl_schedule(\%opts)
#
# Collect the seasonal schedules for a range of seasons.
#
# Options (hash reference):
#   start_season - first season to crawl (default: $FIRST_SEASON)
#   stop_season  - last season to crawl  (default: $CURRENT_SEASON)
#
# For the current season, or when no schedule file is stored yet, the
# schedule is downloaded (live feed first, schedule API as a fallback) and
# written to the schedule file; otherwise the stored file is read. Lockout
# seasons and seasons that cannot be downloaded are skipped.
#
# Returns a hash reference keyed by season whose values are the decoded JSON
# schedules.
sub crawl_schedule ($) {

    my $opts = shift;

    my $start_season = $opts->{start_season} || $FIRST_SEASON;
    my $stop_season  = $opts->{stop_season}  || $CURRENT_SEASON;

    my $schedules = {};
    for my $season ($start_season .. $stop_season) {
        next if grep { $_ == $season } @LOCKOUT_SEASONS;
        my $schedule_json;
        my $schedule_json_file = get_schedule_json_file($season);
        if ($season == $CURRENT_SEASON || ! -f $schedule_json_file) {
            my $schedule_json_url = sprintf($SCHEDULE_JSON, $season, $season+1);
            $schedule_json = scrape({ url => $schedule_json_url });
            if (! $schedule_json) {
                # The live feed has nothing for this season: fall back to
                # the date-ranged schedule API.
                my ($start_date, $stop_date) = get_start_stop_date($season);
                $schedule_json_url = sprintf($SCHEDULE_JSON_API, $start_date, $stop_date);
                $schedule_json = scrape({ url => $schedule_json_url });
                if (! $schedule_json) {
                    verbose("Couldn't download from $schedule_json_url, skipping...");
                    next;
                }
            }
            write_file($schedule_json, $schedule_json_file);
            if (! -f $schedule_json_file) {
                # Diagnostics belong on STDERR, newline-terminated, like the
                # messages emitted by crawl_game().
                print STDERR "[ERROR] Could not find the JSON schedule file $schedule_json_file, skipping...\n";
                next;
            }
        }
        $schedule_json ||= read_file($schedule_json_file);
        $schedules->{$season} = decode_json($schedule_json);
    }
    $schedules;
}
191              
# get_game_url_args($doc_name, \%game)
#
# Build the list of sprintf arguments that fills the URL pattern of the given
# report type for the given game.
#
# Arguments:
#   $doc_name - report type; 'BS' and 'PB' have dedicated layouts, any other
#               name gets the HTML report layout
#   $game     - hash reference with the keys season, stage and season_id
#
# Returns:
#   BS     - (game ID), the game ID being season/stage/season_id formatted
#            as "%04d%02d%04d"
#   PB     - (season, season + 1, game ID)
#   others - (season, season + 1, $doc_name, stage, season_id)
sub get_game_url_args ($$) {

    my $doc_name = shift;
    my $game     = shift;

    my $game_id = sprintf(
        "%04d%02d%04d",
        $game->{season}, $game->{stage}, $game->{season_id}
    );

    # Plain string comparison instead of the experimental given/when
    # smartmatch; an undefined name falls through to the default layout,
    # exactly as it did with smartmatch.
    my $report = defined $doc_name ? $doc_name : '';

    my @args;
    if ($report eq 'BS') {
        @args = ($game_id);
    }
    elsif ($report eq 'PB') {
        @args = ($game->{season}, $game->{season} + 1, $game_id);
    }
    else {
        @args = (
            $game->{season}, $game->{season} + 1, $doc_name,
            $game->{stage}, $game->{season_id}
        );
    }
    @args;
}
218              
# crawl_game(\%game, \%opts)
#
# Download every enabled report listed in @GAME_FILES for one game and store
# each of them under the game's directory.
#
# Arguments:
#   $game - hash reference with the keys season, stage and season_id
#   $opts - optional hash reference:
#     game_files - restricts the crawl to the named reports; either an array
#                  reference of report names ([qw(BS PL)]) or a hash reference
#                  keyed by report name with true values
#     force      - re-download even if the report file already exists
#     retries    - number of attempts for every download
#
# Returns a hash reference keyed by report name. A freshly downloaded report
# maps to { content => ..., file => ... }.
# NOTE(review): a report that already exists on disk (and is not forced) maps
# to the raw file content instead of that hash reference; confirm against
# the callers before unifying the two shapes.
sub crawl_game ($;$) {

    my $game = shift;
    my $opts = shift || {};

    # Normalize the report filter into a lookup keyed by report name. The
    # filter used to be looked up with the definition hash reference itself,
    # which can never match a report name.
    my $wanted = $opts->{game_files};
    if (ref $wanted eq 'ARRAY') {
        $wanted = { map { $_ => 1 } @{$wanted} };
    }

    my $path = make_game_path($game->{season}, $game->{stage}, $game->{season_id});
    my $contents = {};
    for my $doc (@GAME_FILES) {
        next if $doc->{disabled};
        next if $wanted && ! $wanted->{$doc->{name}};
        # Reports are only available from a certain season on.
        next if $game->{season} < $FIRST_REPORT_SEASONS{$doc->{name}};
        my @args = get_game_url_args($doc->{name}, $game);
        # Reports without their own definitions are HTML reports.
        $doc->{pattern}   ||= $HTML_REPORT_URL;
        $doc->{extension} ||= 'html';
        my $file = "$path/$doc->{name}.$doc->{extension}";
        if (-f $file && ! $opts->{force}) {
            print STDERR "[NOTICE] File $file already exists, not crawling\n";
            $contents->{$doc->{name}} = read_file($file);
            next;
        }
        my $url = sprintf($doc->{pattern}, @args);
        my $content = scrape({
            url => $url, validate => $doc->{validate}, retries => $opts->{retries}
        });
        if (! $content) {
            print STDERR "[WARNING] Got no content for $game->{season}, $game->{stage}, $game->{season_id}, $doc->{name}\n";
            next;
        }
        # Replace the UTF-8 encoded non-breaking spaces of the HTML reports
        # with plain spaces; JSON documents are stored untouched.
        $content =~ s/\xC2\xA0/ /g unless $doc->{extension} eq 'json';
        write_file($content, $file);
        $contents->{$doc->{name}} = {content => $content, file => $file};
    }
    $contents;
}
253              
254             1;
255              
256             =head1 AUTHOR
257              
258             More Hockey Stats, C<< >>
259              
260             =head1 BUGS
261              
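262             Please report any bugs or feature requests to C<bug-sport-analytics-nhl at rt.cpan.org>, or through
263             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Sport-Analytics-NHL>. I will be notified, and then you'll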
264             automatically be notified of progress on your bug as I make changes.
265              
266              
267             =head1 SUPPORT
268              
269             You can find documentation for this module with the perldoc command.
270              
271             perldoc Sport::Analytics::NHL::Scraper
272              
273             You can also look for information at:
274              
275             =over 4
276              
277             =item * RT: CPAN's request tracker (report bugs here)
278              
279             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Sport-Analytics-NHL>
280              
281             =item * AnnoCPAN: Annotated CPAN documentation
282              
283             L<http://annocpan.org/dist/Sport-Analytics-NHL>
284              
285             =item * CPAN Ratings
286              
287             L<http://cpanratings.perl.org/d/Sport-Analytics-NHL>
288              
289             =item * Search CPAN
290              
291             L<http://search.cpan.org/dist/Sport-Analytics-NHL/>
292              
293             =back