File Coverage

blib/lib/Sport/Analytics/NHL/Scraper.pm
Criterion Covered Total %
statement 140 155 90.3
branch 30 42 71.4
condition 26 48 54.1
subroutine 20 20 100.0
pod 5 5 100.0
total 221 270 81.8


line stmt bran cond sub pod time code
1             package Sport::Analytics::NHL::Scraper;
2              
3 33     33   120659 use v5.10.1;
  33         124  
4 33     33   169 use warnings FATAL => 'all';
  33         63  
  33         1343  
5 33     33   176 use strict;
  33         66  
  33         783  
6 33     33   157 use experimental qw(smartmatch);
  33         55  
  33         233  
7 33     33   2704 use parent 'Exporter';
  33         508  
  33         202  
8              
9 33     33   16193 use Time::HiRes qw(time usleep);
  33         38753  
  33         132  
10 33     33   7158 use POSIX qw(strftime);
  33         15783  
  33         324  
11              
12 33     33   7392 use JSON;
  33         32044  
  33         251  
13 33     33   15809 use LWP::Simple;
  33         1678830  
  33         271  
14              
15 33     33   11808 use Sport::Analytics::NHL::LocalConfig;
  33         76  
  33         4059  
16 33     33   246 use Sport::Analytics::NHL::Config;
  33         67  
  33         5822  
17 33     33   1617 use Sport::Analytics::NHL::Util;
  33         93  
  33         2645  
18 33     33   1454 use Sport::Analytics::NHL::Tools;
  33         66  
  33         5495  
19 33     33   14731 use Sport::Analytics::NHL::Report::BS;
  33         90  
  33         45058  
20              
21             =head1 NAME
22              
23             Sport::Analytics::NHL::Scraper - Scrape and crawl the NHL website for data
24              
25             =head1 SYNOPSIS
26              
27             Scrape and crawl the NHL website for data
28              
29             use Sport::Analytics::NHL::Scraper;
30             my $schedules = crawl_schedule({
31                 start_season => 2016,
32                 stop_season  => 2017,
33             });
34             ...
35             my $contents = crawl_game(
36                 { season => 2011, stage => 2, season_id => 0001 }, # game 2011020001 in NHL accounting
37                 { game_files => [qw(BS PL)], retries => 2 },
38             );
39              
40             =head1 IMPORTANT VARIABLE
41              
42             The variable @GAME_FILES contains specific definitions for the report types. Right now only the boxscore (BS) JSON feed has any meaningful non-default definitions; the PB feed seems to have become unavailable.
43              
44             =head1 FUNCTIONS
45              
46             =over 2
47              
48             =item C<scrape>
49              
50             A wrapper around the LWP::Simple::get() call for retrying and control.
51              
52             Arguments: hash reference containing
53              
54             * url => URL to access
55             * retries => Number of retries
56             * validate => sub reference to validate the download
57              
58             Returns: the content if both download and validation are successful, undef otherwise.
59              
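For illustration only (this call is not in the original POD): C<scrape()> is not exported, so the fully qualified name is used, and the URL and the validator below are just placeholders.

  use Sport::Analytics::NHL::Scraper;

  my $content = Sport::Analytics::NHL::Scraper::scrape({
      url      => 'https://statsapi.web.nhl.com/api/v1/teams', # placeholder URL
      retries  => 2,
      validate => sub { defined $_[0] && length $_[0] },       # any non-empty body passes
  });
  warn "download failed or was rejected by the validator\n" unless defined $content;
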
60             =item C<crawl_schedule>
61              
62             Crawls the NHL schedule. The schedule is accessed through a minimalistic live API first (which only works for post-2010 seasons), then through the general /api/v1/schedule interface (C<$SCHEDULE_JSON_API>).
63              
64             Arguments: hash reference containing
65              
66             * start_season => the first season to crawl
67             * stop_season => the last season to crawl
68              
69             Returns: hash reference of seasonal schedules where seasons are the keys, and decoded JSONs are the values.
70              
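A hedged sketch of consuming the returned structure (the layout of the decoded JSON differs between the two sources, so the branching below is an assumption about typical payloads):

  use Sport::Analytics::NHL::Scraper;

  my $schedules = crawl_schedule({ start_season => 2016, stop_season => 2017 });
  for my $season (sort keys %{$schedules}) {
      my $json = $schedules->{$season};
      # the live feed decodes to an arrayref of games; the /api/v1/schedule
      # feed to a hashref with a 'dates' key (assumed layouts)
      my $count = ref $json eq 'ARRAY'
          ? scalar @{$json}
          : scalar @{$json->{dates} || []};
      print "$season: $count schedule entries\n";
  }
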
71             =item C<get_game_url_args>
72              
73             Sets the arguments to populate the game URL for a given report type and game
74              
75             Arguments:
76              
77             * document name, currently one of qw(BS PB RO ES GS PL)
78             * game hashref containing
79             - season => YYYY
80             - stage => 2|3
81             - season_id => NNNN
82              
83             Returns: a configured list of arguments for the URL.
84              
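An illustrative call, not taken from the original documentation. C<get_game_url_args()> is not exported, hence the fully qualified name; for HTML reports the returned list is meant to be interpolated into C<$HTML_REPORT_URL>:

  use Sport::Analytics::NHL::Scraper;

  my @args = Sport::Analytics::NHL::Scraper::get_game_url_args(
      'GS', { season => 2016, stage => 2, season_id => 1 },
  );
  # @args is now (2016, 2017, 'GS', 2, 1), which yields
  # http://www.nhl.com/scores/htmlreports/20162017/GS020001.HTM
  my $url = sprintf($Sport::Analytics::NHL::Scraper::HTML_REPORT_URL, @args);
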
85             =item C<crawl_game>
86              
87             Crawls the data for the given game
88              
89             Arguments:
90              
91             game data as hashref:
92             * season => YYYY
93             * stage => 2|3
94             * season_id => NNNN
95             options hashref:
96             * game_files => hashref of types of reports that are requested
97             * force => 0|1 force overwrite of files already present in the system
98             * retries => N number of the retries for every get call
99              
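For illustration (the season, stage and id below are arbitrary): a sketch of walking the returned hashref. Freshly downloaded reports come back as C<< { content => ..., file => ... } >>, while reports that were already on disk are returned as their plain content:

  use Sport::Analytics::NHL::Scraper;

  my $contents = crawl_game(
      { season => 2016, stage => 2, season_id => 12 },
      { retries => 2 },
  );
  for my $name (sort keys %{$contents}) {
      my $report = $contents->{$name};
      if (ref $report eq 'HASH') {
          print "$name downloaded to $report->{file}\n";
      }
      else {
          print "$name was already cached on disk\n";
      }
  }
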
100             =item C<crawl_player>
101              
102             Crawls the data for an NHL player given his NHL id. First, the API call is made, and the JSON is retrieved. Unfortunately, the JSON does not contain the draft information, so another call to the HTML page is made to complete the information. The merged information is stored in a JSON file at the ROOT_DATA_DIR/players/$ID.json path.
103              
104             Arguments:
105             * player's NHL id
106             * options hashref:
107             - data_dir - root data dir location
108             - playerfile_expiration - how long (in days) the saved playerfile should be trusted
109             - force - crawl the player regardless
110              
111             Returns: the path to the saved file
112              
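An illustrative call (the player id and the data directory are placeholders, not values from the original documentation); judging by the C<-M> file-age test in the code, the expiration is measured in days:

  use Sport::Analytics::NHL::Scraper;

  my $playerfile = crawl_player(8471234, {        # hypothetical NHL player id
      data_dir              => '/tmp/hockeydb',   # hypothetical root data dir
      playerfile_expiration => 7,                 # trust saved files for a week
  });
  print "player JSON saved to $playerfile\n" if defined $playerfile;
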
113             =back
114              
115             =cut
116              
117             our $SCHEDULE_JSON = 'http://live.nhle.com/GameData/SeasonSchedule-%s%s.json';
118             our $SCHEDULE_JSON_API = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=%s&endDate=%s';
119             our $HTML_REPORT_URL = 'http://www.nhl.com/scores/htmlreports/%d%d/%s%02d%04d.HTM';
120             our $PLAYER_URL = 'https://statsapi.web.nhl.com/api/v1/people/%d?expand=person.stats&stats=yearByYear,yearByYearPlayoffs&expand=stats.team&site=en_nhl';
121             our $SUPP_PLAYER_URL = "https://www.nhl.com/player/%d";
122             #our $ROTOWORLD_URL = 'http://www.rotoworld.com/teams/injuries/nhl/all/';
123             #our %ROTO_CENSUS = (
124             # 'CHRIS TANEV' => 'CHRISTOPHER TANEV',
125             #);
126             our @GAME_FILES = (
127             {
128             name => 'BS',
129             pattern => 'https://statsapi.web.nhl.com/api/v1/game/%s/feed/live',
130             extension => 'json',
131             validate => sub {
132             my $json = shift;
133             my $bs = Sport::Analytics::NHL::Report::BS->new($json);
134             return scalar @{$bs->{json}{liveData}{plays}{allPlays}};
135             1;
136             },
137             },
138             {
139             name => 'PB',
140             pattern => 'http://live.nhle.com/GameData/%s%s/%s/PlayByPlay.json',
141             extension => 'json',
142             disabled => 1,
143             },
144             { name => 'ES' },
145             { name => 'GS' },
146             { name => 'PL' },
147             { name => 'RO' },
148             );
149              
150             our @EXPORT = qw(
151             crawl_schedule crawl_game crawl_player
152             );
153              
154             our $DEFAULT_RETRIES = 3;
155              
156             sub scrape ($) {
157              
158 35     35 1 109 my $opts = shift;
159 35 50       174 die "Can't scrape without a URL" unless defined $opts->{url};
160              
161 35 50       173 return undef if $ENV{HOCKEYDB_NONET};
162 35   33     261 $opts->{retries} ||= $DEFAULT_RETRIES;
163 35   100 34   336 $opts->{validate} ||= sub { 1 };
  34         350  
164              
165 35         170 my $now = time;
166 35         74 my $r = 0;
167 35         76 my $content;
168 35   100     245 while (! $content && $r++ < $opts->{retries}) {
169 41         392 debug "Trying ($r/$opts->{retries}) $opts->{url}...";
170 41         229 $content = get($opts->{url});
171 41 50       18799116 unless ($opts->{validate}->($content)) {
172 0         0 verbose "$opts->{url} failed validation, retrying";
173 0         0 $content = undef;
174             }
175             }
176 35 100       728 debug sprintf("Retrieved in %.3f seconds", time - $now) if $content;
177 35         305 $content;
178              
179             }
180              
181             sub crawl_schedule ($) {
182              
183 5     5 1 2564 my $opts = shift;
184              
185 5   33     22 my $start_season = $opts->{start_season} || $FIRST_SEASON;
186 5   33     17 my $stop_season = $opts->{stop_season} || $CURRENT_SEASON;
187              
188 5         9 my $schedules = {};
189 5         23 for my $season ($start_season .. $stop_season) {
190 6 50       21 next if grep { $_ == $season } @LOCKOUT_SEASONS;
  6         30  
191 6         14 my $schedule_json;
192 6         25 my $schedule_json_file = get_schedule_json_file($season);
193 6 100 66     175 if ($season == $CURRENT_SEASON || ! -f $schedule_json_file) {
194 4         26 my $schedule_json_url = sprintf($SCHEDULE_JSON, $season, $season+1);
195 4         24 $schedule_json = scrape({ url => $schedule_json_url });
196 4 100       30 if (! $schedule_json) {
197 2         13 my ($start_date, $stop_date) = get_start_stop_date($season);
198 2         12 $schedule_json_url = sprintf($SCHEDULE_JSON_API, $start_date, $stop_date);
199 2         10 $schedule_json = scrape({ url => $schedule_json_url });
200 2 50       23 if (! $schedule_json) {
201 0         0 verbose "Couldn't download from $schedule_json_url, skipping...";
202 0         0 next;
203             }
204             }
205 4         25 write_file($schedule_json, $schedule_json_file);
206 4 50       73 if (! -f $schedule_json_file) {
207 0         0 print "ERROR: could not find a JSON schedule file, skipping...";
208 0         0 next;
209             }
210             }
211 6   66     42 $schedule_json ||= read_file($schedule_json_file);
212 6         28171 $schedules->{$season} = decode_json($schedule_json);
213             }
214 5         5220 $schedules;
215             }
216              
217             sub get_game_url_args ($$) {
218              
219 24     24 1 1684 my $doc_name = shift;
220 24         60 my $game = shift;
221              
222             my $game_id = sprintf(
223             "%04d%02d%04d",
224             $game->{season}, $game->{stage}, $game->{season_id}
225 24         182 );
226 24         56 my @args;
227 24         74 for ($doc_name) {
228 24         105 when ('BS') {
229 8         35 @args = ($game_id);
230             }
231 16         49 when ('PB') {
232 1         3 @args = ($game->{season}, $game->{season} + 1, $game_id);
233             }
234 15         37 default {
235             @args = (
236             $game->{season}, $game->{season} + 1, $doc_name,
237             $game->{stage}, $game->{season_id}
238 15         100 );
239             }
240             }
241 24         98 @args;
242             }
243              
244             sub crawl_game ($;$) {
245              
246 7     7 1 6356 my $game = shift;
247 7   50     57 my $opts = shift || {};
248              
249 7         62 my $path = make_game_path($game->{season}, $game->{stage}, $game->{season_id});
250 7         28 my $contents = {};
251 7         69 for my $doc (@GAME_FILES) {
252 42 100       180 next if $doc->{disabled};
253 35 50 33     140 next if $opts->{game_files} && ! $opts->{game_files}{$doc};
254 35 100       190 next if $game->{season} < $FIRST_REPORT_SEASONS{$doc->{name}};
255 21         136 my @args = get_game_url_args($doc->{name}, $game);
256 21   66     154 $doc->{pattern} ||= $HTML_REPORT_URL;
257 21   100     101 $doc->{extension} ||= 'html';
258 21         128 my $file = "$path/$doc->{name}.$doc->{extension}";
259 21 50 33     551 if (-f $file && ! $opts->{force}) {
260 0         0 print STDERR "[NOTICE] File $file already exists, not crawling\n";
261 0         0 $contents->{$doc->{name}} = read_file($file);
262 0         0 next;
263             }
264 21         153 my $url = sprintf($doc->{pattern}, @args);
265             my $content = scrape({
266             url => $url, validate => $doc->{validate}, retries => $opts->{retries}
267 21         189 });
268 21 50       170 if (! $content) {
269 0         0 print STDERR "[WARNING] Got no content for $game->{season}, $game->{stage}, $game->{season_id}, $doc->{name}\n";
270 0         0 next;
271             }
272 21 100       16153 $content =~ s/\xC2\xA0/ /g unless $doc->{extension} eq 'json';
273 21         145 write_file($content, $file);
274 21         262 $contents->{$doc->{name}} = {content => $content, file => $file};
275             }
276 7         44 $contents;
277             }
278              
279             sub crawl_player ($;$$$) {
280              
281 5     5 1 14979 my $id = shift;
282 5         14 my $opts = shift;
283              
284 5   33     72 $opts->{data_dir} ||= $ENV{HOCKEYDB_DATA_DIR} || $DATA_DIR;
      33        
285 5   33     40 $opts->{playerfile_expiration} ||= $DEFAULT_PLAYERFILE_EXPIRATION;
286 5         13 my $sfx = 'json';
287 5         30 my $file = sprintf("%s/players/%d.%s", $opts->{data_dir}, $id, $sfx);
288              
289 5 50 66     160 if (-f $file && -M $file < $opts->{playerfile_expiration} && ! $opts->{force}) {
      66        
290 0         0 debug "File exists and is recent, skipping";
291 0         0 return $file;
292             }
293              
294 5         56 my $content = scrape({ url => sprintf($PLAYER_URL, $id) });
295 5 100       42 if (! $content) {
296 1         406 print STDERR "ID $id missing or network unavailable\n";
297 1 50       24 if (-f $file) {
298 0         0 print STDERR "Using old available file\n";
299 0         0 return $file;
300             }
301 1         10 return;
302             }
303 4         2489 my $json = decode_json($content);
304 4         18 $json = $json->{people}[0];
305 4 100       85 if (-f $file) {
306 1         7 my $existing_json = decode_json(read_file($file));
307 1         7 for my $key (qw(draftyear draftteam round undrafted pick)) {
308 5 50       163 $json->{$key} = $existing_json->{$key} if exists $existing_json->{$key};
309             }
310             }
311             else {
312 3         18 my $supp_url = sprintf($SUPP_PLAYER_URL, $id);
313 3         18 $content = scrape({url => $supp_url});
314 3 100       430 if ($content =~ /Draft:.*(\d{4}) (\S{3}), (\d+)\S\S rd, .* pk \((\d+)\D+ overall\)/) {
315 2         16 $json->{draftyear} = $1+0;
316 2         10 $json->{draftteam} = $2;
317 2         9 $json->{round} = $3+0;
318 2         7 $json->{undrafted} = 0;
319 2         10 $json->{pick} = $4+0;
320             }
321             else {
322 1         11 $json->{undrafted} = 1;
323 1         6 $json->{pick} = $UNDRAFTED_PICK;
324             }
325             }
326 4         2645 write_file(encode_json($json), $file);
327 4         749 $file;
328             }
329              
330             1;
331              
332             =head1 AUTHOR
333              
334             More Hockey Stats, C<< >>
335              
336             =head1 BUGS
337              
338             Please report any bugs or feature requests to C, or through
339             the web interface at L. I will be notified, and then you'll
340             automatically be notified of progress on your bug as I make changes.
341              
342              
343             =head1 SUPPORT
344              
345             You can find documentation for this module with the perldoc command.
346              
347             perldoc Sport::Analytics::NHL::Scraper
348              
349             You can also look for information at:
350              
351             =over 4
352              
353             =item * RT: CPAN's request tracker (report bugs here)
354              
355             L
356              
357             =item * AnnoCPAN: Annotated CPAN documentation
358              
359             L
360              
361             =item * CPAN Ratings
362              
363             L
364              
365             =item * Search CPAN
366              
367             L
368              
369             =back