line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Sport::Analytics::NHL::Scraper; |
2
|
|
|
|
|
|
|
|
3
|
25
|
|
|
25
|
|
106274
|
use v5.10.1; |
|
25
|
|
|
|
|
90
|
|
4
|
25
|
|
|
25
|
|
112
|
use warnings FATAL => 'all'; |
|
25
|
|
|
|
|
74
|
|
|
25
|
|
|
|
|
800
|
|
5
|
25
|
|
|
25
|
|
116
|
use strict; |
|
25
|
|
|
|
|
70
|
|
|
25
|
|
|
|
|
537
|
|
6
|
25
|
|
|
25
|
|
105
|
use experimental qw(smartmatch); |
|
25
|
|
|
|
|
41
|
|
|
25
|
|
|
|
|
134
|
|
7
|
25
|
|
|
25
|
|
1916
|
use parent 'Exporter'; |
|
25
|
|
|
|
|
457
|
|
|
25
|
|
|
|
|
124
|
|
8
|
|
|
|
|
|
|
|
9
|
25
|
|
|
25
|
|
11880
|
use Time::HiRes qw(time usleep); |
|
25
|
|
|
|
|
27866
|
|
|
25
|
|
|
|
|
96
|
|
10
|
25
|
|
|
25
|
|
5270
|
use POSIX qw(strftime); |
|
25
|
|
|
|
|
14103
|
|
|
25
|
|
|
|
|
180
|
|
11
|
|
|
|
|
|
|
|
12
|
25
|
|
|
25
|
|
6264
|
use JSON; |
|
25
|
|
|
|
|
26234
|
|
|
25
|
|
|
|
|
178
|
|
13
|
25
|
|
|
25
|
|
11478
|
use LWP::Simple; |
|
25
|
|
|
|
|
1167750
|
|
|
25
|
|
|
|
|
171
|
|
14
|
|
|
|
|
|
|
|
15
|
25
|
|
|
25
|
|
8171
|
use Sport::Analytics::NHL::LocalConfig; |
|
25
|
|
|
|
|
52
|
|
|
25
|
|
|
|
|
2643
|
|
16
|
25
|
|
|
25
|
|
147
|
use Sport::Analytics::NHL::Config; |
|
25
|
|
|
|
|
44
|
|
|
25
|
|
|
|
|
3907
|
|
17
|
25
|
|
|
25
|
|
1213
|
use Sport::Analytics::NHL::Util; |
|
25
|
|
|
|
|
60
|
|
|
25
|
|
|
|
|
1788
|
|
18
|
25
|
|
|
25
|
|
1291
|
use Sport::Analytics::NHL::Tools; |
|
25
|
|
|
|
|
55
|
|
|
25
|
|
|
|
|
3948
|
|
19
|
25
|
|
|
25
|
|
10605
|
use Sport::Analytics::NHL::Report::BS; |
|
25
|
|
|
|
|
64
|
|
|
25
|
|
|
|
|
22801
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 NAME |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
Sport::Analytics::NHL::Scraper - Scrape and crawl the NHL website for data |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 SYNOPSIS |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
Scrape and crawl the NHL website for data |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
use Sport::Analytics::NHL::Scraper; |
30
|
|
|
|
|
|
|
my $schedules = crawl_schedule({ |
31
|
|
|
|
|
|
|
start_season => 2016, |
32
|
|
|
|
|
|
|
stop_season => 2017 |
33
|
|
|
|
|
|
|
}); |
34
|
|
|
|
|
|
|
... |
35
|
|
|
|
|
|
|
my $contents = crawl_game( |
36
|
|
|
|
|
|
|
{ season => 2011, stage => 2, season_id => 0001 }, # game 2011020001 in NHL accounting |
37
|
|
|
|
|
|
|
{ game_files => { BS => 1, PL => 1 }, retries => 2 }, |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 IMPORTANT VARIABLE |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
Variable @GAME_FILES contains specific definitions for the report types. Right now only the boxscore javascript has any meaningful non-default definitions; the PB feed seems to have become unavailable. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head1 FUNCTIONS |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over 2 |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item C<scrape> |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
A wrapper around the LWP::Simple::get() call for retrying and control. |
51
|
|
|
|
|
|
|
Arguments: hash reference containing |
52
|
|
|
|
|
|
|
* url => URL to access |
53
|
|
|
|
|
|
|
* retries => Number of retries |
54
|
|
|
|
|
|
|
* validate => sub reference to validate the download |
55
|
|
|
|
|
|
|
Returns: the content if both download and validation are successful |
56
|
|
|
|
|
|
|
undef otherwise. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=item C<crawl_schedule> |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Crawls the NHL schedule. The schedule is accessed through a minimalistic live API first (only works for post-2010 seasons), then, if that fails, through the general /api/v1/schedule endpoint of the NHL stats API. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Arguments: hash reference containing |
63
|
|
|
|
|
|
|
* start_season => the first season to crawl |
64
|
|
|
|
|
|
|
* stop_season => the last season to crawl |
65
|
|
|
|
|
|
|
Returns: hash reference of seasonal schedules where seasons are the keys, and decoded JSONs are the values. |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item C<get_game_url_args> |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Sets the arguments to populate the game URL for a given report type and game |
70
|
|
|
|
|
|
|
Arguments: document name, currently one of qw(BS PB RO ES GS PL) |
71
|
|
|
|
|
|
|
game hashref containing |
72
|
|
|
|
|
|
|
* season => YYYY |
73
|
|
|
|
|
|
|
* stage => 2|3 |
74
|
|
|
|
|
|
|
* season_id => NNNN |
75
|
|
|
|
|
|
|
Returns: a configured list of arguments for the URL. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=item C<crawl_game> |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Crawls the data for the given game |
80
|
|
|
|
|
|
|
Arguments: game data as hashref: |
81
|
|
|
|
|
|
|
* season => YYYY |
82
|
|
|
|
|
|
|
* stage => 2|3 |
83
|
|
|
|
|
|
|
* season_id => NNNN |
84
|
|
|
|
|
|
|
options hashref: |
85
|
|
|
|
|
|
|
* game_files => hashref of the requested report types, keyed by report name (e.g. { BS => 1, PL => 1 }) |
86
|
|
|
|
|
|
|
* force => 0|1 force overwrite of files already present in the system |
87
|
|
|
|
|
|
|
* retries => N number of the retries for every get call |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=back |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Legacy per-season schedule feed; filled with (season, season + 1).
our $SCHEDULE_JSON = 'http://live.nhle.com/GameData/SeasonSchedule-%s%s.json';
# Stats API schedule endpoint used as a fallback; filled with (start date, stop date).
our $SCHEDULE_JSON_API = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=%s&endDate=%s';
# Default pattern for the HTML reports; filled with
# (season, season + 1, report name, stage, season_id).
our $HTML_REPORT_URL = 'http://www.nhl.com/scores/htmlreports/%d%d/%s%02d%04d.HTM';
#our $PLAYER_URL = 'https://statsapi.web.nhl.com/api/v1/people/%d?expand=person.stats&stats=yearByYear,yearByYearPlayoffs&expand=stats.team&site=en_nhl';
#our $SUPP_PLAYER_URL = "https://www.nhl.com/player/%d";
#our $ROTOWORLD_URL = 'http://www.rotoworld.com/teams/injuries/nhl/all/';
#our %ROTO_CENSUS = (
#    'CHRIS TANEV' => 'CHRISTOPHER TANEV',
#);

# Per-report crawl definitions. Recognised keys:
#   name      - report type
#   pattern   - sprintf pattern of the URL (crawl_game() falls back to $HTML_REPORT_URL)
#   extension - extension of the stored file (crawl_game() falls back to 'html')
#   validate  - optional sub receiving the downloaded content, true when it is usable
#   disabled  - true when the report must not be crawled
our @GAME_FILES = (
    {
        name      => 'BS',
        pattern   => 'https://statsapi.web.nhl.com/api/v1/game/%s/feed/live',
        extension => 'json',
        validate  => sub {
            my $json = shift;
            # A failed download has nothing to parse: report it as invalid
            # instead of handing undef to the report constructor.
            return 0 unless defined $json && length $json;
            my $bs = Sport::Analytics::NHL::Report::BS->new($json);
            # The boxscore is only usable when it lists at least one play.
            my $plays = $bs->{json}{liveData}{plays}{allPlays};
            return ref $plays eq 'ARRAY' ? scalar @{$plays} : 0;
        },
    },
    {
        name      => 'PB',
        pattern   => 'http://live.nhle.com/GameData/%s%s/%s/PlayByPlay.json',
        extension => 'json',
        disabled  => 1,
    },
    { name => 'ES' },
    { name => 'GS' },
    { name => 'PL' },
    { name => 'RO' },
);

our @EXPORT = qw(
    crawl_schedule crawl_game
);

# Number of download attempts scrape() makes when the caller does not set one.
our $DEFAULT_RETRIES = 3;
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Download a URL with LWP::Simple::get(), retrying until the content is
# retrieved and accepted by the validator.
#
# Arguments: hash reference containing
#  * url      => URL to access (mandatory, dies when missing)
#  * retries  => number of attempts, $DEFAULT_RETRIES when false
#  * validate => sub reference receiving the content, accepts everything when false
# Returns: the content when a download passed validation, undef otherwise.
# Note: the retries and validate defaults are written back into the hash passed in.
sub scrape ($) {

    my $opts = shift;
    die "Can't scrape without a URL" unless defined $opts->{url};

    $opts->{retries}  ||= $DEFAULT_RETRIES;
    $opts->{validate} ||= sub { 1 };

    my $now = time;
    my $r   = 0;
    my $content;
    while (! $content && $r++ < $opts->{retries}) {
        debug("Trying ($r/$opts->{retries}) $opts->{url}...");
        $content = get($opts->{url});
        # get() yields undef on a failed request. There is nothing to validate
        # then, and a validator that parses the content must not see undef.
        next unless defined $content;
        unless ($opts->{validate}->($content)) {
            verbose("$opts->{url} failed validation, retrying");
            $content = undef;
        }
    }
    debug(sprintf("Retrieved in %.3f seconds", time - $now)) if $content;
    return $content;
}
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# Crawl the season schedules, storing every downloaded schedule in its JSON file.
#
# Arguments: hash reference containing
#  * start_season => first season to crawl, $FIRST_SEASON when false
#  * stop_season  => last season to crawl, $CURRENT_SEASON when false
# Returns: hash reference mapping each processed season to its decoded schedule JSON.
# Lockout seasons, and seasons whose schedule could not be obtained, are absent.
sub crawl_schedule ($) {

    my $opts = shift;

    my $start_season = $opts->{start_season} || $FIRST_SEASON;
    my $stop_season  = $opts->{stop_season}  || $CURRENT_SEASON;

    my $schedules = {};
    for my $season ($start_season .. $stop_season) {
        next if grep { $_ == $season } @LOCKOUT_SEASONS;
        my $schedule_json;
        my $schedule_json_file = get_schedule_json_file($season);
        # The current season is always downloaded again; other seasons only
        # when no local schedule file exists yet.
        if ($season == $CURRENT_SEASON || ! -f $schedule_json_file) {
            my $schedule_json_url = sprintf($SCHEDULE_JSON, $season, $season+1);
            $schedule_json = scrape({ url => $schedule_json_url });
            if (! $schedule_json) {
                # Fall back to the date-range schedule API.
                my ($start_date, $stop_date) = get_start_stop_date($season);
                $schedule_json_url = sprintf($SCHEDULE_JSON_API, $start_date, $stop_date);
                $schedule_json = scrape({ url => $schedule_json_url });
                if (! $schedule_json) {
                    verbose("Couldn't download from $schedule_json_url, skipping...");
                    next;
                }
            }
            write_file($schedule_json, $schedule_json_file);
            if (! -f $schedule_json_file) {
                # Diagnostics go to STDERR, newline-terminated, like the ones in crawl_game().
                print STDERR "ERROR: could not find a JSON schedule file, skipping...\n";
                next;
            }
        }
        $schedule_json ||= read_file($schedule_json_file);
        $schedules->{$season} = decode_json($schedule_json);
    }
    return $schedules;
}
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Build the list of sprintf arguments that fills the URL pattern of a report.
#
# Arguments: report name (the code special-cases 'BS' and 'PB', any other name
#            gets the HTML report arguments),
#            game hashref containing season, stage and season_id
# Returns: list of arguments:
#  * BS       => (game id), the id being season, stage and season_id
#                formatted as %04d%02d%04d
#  * PB       => (season, season + 1, game id)
#  * otherwise => (season, season + 1, report name, stage, season_id)
sub get_game_url_args ($$) {

    my $doc_name = shift;
    my $game     = shift;

    my $game_id = sprintf(
        "%04d%02d%04d",
        $game->{season}, $game->{stage}, $game->{season_id}
    );
    # Plain string comparison: the report names are literal strings, so the
    # experimental when/smartmatch dispatch is not needed to tell them apart.
    if ($doc_name eq 'BS') {
        return ($game_id);
    }
    if ($doc_name eq 'PB') {
        return ($game->{season}, $game->{season} + 1, $game_id);
    }
    return (
        $game->{season}, $game->{season} + 1, $doc_name,
        $game->{stage}, $game->{season_id}
    );
}
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Crawl the reports of one game and store them under the game's path.
#
# Arguments: game hashref containing season, stage and season_id,
#            optional options hashref:
#  * game_files => requested report types, either a hashref keyed by report
#                  name or an arrayref of report names; all enabled reports
#                  when absent
#  * force      => 0|1 download even if the file is already present
#  * retries    => number of attempts for every download
# Returns: hash reference mapping each obtained report name to
#          { content => ..., file => ... }. Disabled, unrequested, too-early
#          and failed reports are absent.
sub crawl_game ($;$) {

    my $game = shift;
    my $opts = shift || {};

    # Normalise the requested reports into a lookup keyed by report name.
    my $wanted;
    if (my $requested = $opts->{game_files}) {
        $wanted = ref $requested eq 'ARRAY'
            ? { map { $_ => 1 } @{$requested} }
            : $requested;
    }

    my $path = make_game_path($game->{season}, $game->{stage}, $game->{season_id});
    my $contents = {};
    for my $doc (@GAME_FILES) {
        next if $doc->{disabled};
        # The filter is keyed by report name. Using the definition hashref
        # itself as the key never matches, so every report would be skipped.
        next if $wanted && ! $wanted->{$doc->{name}};
        next if $game->{season} < $FIRST_REPORT_SEASONS{$doc->{name}};
        my @args = get_game_url_args($doc->{name}, $game);
        $doc->{pattern}   ||= $HTML_REPORT_URL;
        $doc->{extension} ||= 'html';
        my $file = "$path/$doc->{name}.$doc->{extension}";
        if (-f $file && ! $opts->{force}) {
            print STDERR "[NOTICE] File $file already exists, not crawling\n";
            # Same shape as a fresh download, so callers need not tell them apart.
            $contents->{$doc->{name}} = {content => read_file($file), file => $file};
            next;
        }
        my $url = sprintf($doc->{pattern}, @args);
        my $content = scrape({
            url => $url, validate => $doc->{validate}, retries => $opts->{retries}
        });
        if (! $content) {
            print STDERR "[WARNING] Got no content for $game->{season}, $game->{stage}, $game->{season_id}, $doc->{name}\n";
            next;
        }
        # Replace the byte pair C2 A0 with a plain space in non-JSON reports.
        $content =~ s/\xC2\xA0/ /g unless $doc->{extension} eq 'json';
        write_file($content, $file);
        $contents->{$doc->{name}} = {content => $content, file => $file};
    }
    return $contents;
}
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
1; |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=head1 AUTHOR |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
More Hockey Stats, C<< >> |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=head1 BUGS |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-sport-analytics-nhl at rt.cpan.org>, or through |
263
|
|
|
|
|
|
|
the web interface at L<https://rt.cpan.org/NoAuth/ReportBug.html?Queue=Sport-Analytics-NHL>. I will be notified, and then you'll |
264
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head1 SUPPORT |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
perldoc Sport::Analytics::NHL::Scraper |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
You can also look for information at: |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=over 4 |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here) |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
L |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
L |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=item * CPAN Ratings |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
L |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=item * Search CPAN |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
L |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=back |