| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HTML::Scrape; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
214236
|
use 5.10.1; |
|
|
3
|
|
|
|
|
23
|
|
|
4
|
3
|
|
|
3
|
|
16
|
use strict; |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
58
|
|
|
5
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
153
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTML::Scrape - The great new HTML::Scrape! |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.2.0 |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.2.0'; |
|
18
|
|
|
|
|
|
|
|
|
19
|
3
|
|
|
3
|
|
1830
|
use HTML::Parser; |
|
|
3
|
|
|
|
|
17500
|
|
|
|
3
|
|
|
|
|
112
|
|
|
20
|
3
|
|
|
3
|
|
3070
|
use HTML::TokeParser; |
|
|
3
|
|
|
|
|
13515
|
|
|
|
3
|
|
|
|
|
91
|
|
|
21
|
3
|
|
|
3
|
|
19
|
use HTML::Tagset; |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
2641
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Handy helpers for common HTML scraping tasks. |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use HTML::Scrape; |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my $ids = HTML::Scrape::scrape_all_ids( $html ); |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head2 scrape_id( $id, $html ) |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
Scrapes the text of the single ID C<$id> from C<$html>. |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=cut |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub scrape_id { |
|
41
|
15
|
|
|
15
|
1
|
14072
|
my $id = shift; |
|
42
|
15
|
|
|
|
|
21
|
my $html = shift; |
|
43
|
|
|
|
|
|
|
|
|
44
|
15
|
|
|
|
|
41
|
my $all_ids = scrape_all_ids( $html, $id ); |
|
45
|
|
|
|
|
|
|
|
|
46
|
15
|
|
|
|
|
67
|
return $all_ids->{$id}; |
|
47
|
|
|
|
|
|
|
} |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 scrape_all_ids( $html [, $specific_id ] ) |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Parses the entire web page and returns all the text in a hashref keyed on ID. |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
If you pass in C<$specific_id>, then only that ID will be scraped, |
|
55
|
|
|
|
|
|
|
and parsing will stop once it is found. The better way to do this is by |
|
56
|
|
|
|
|
|
|
calling C. |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=cut |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub scrape_all_ids { |
|
61
|
30
|
|
|
30
|
1
|
6447
|
my $html = shift; |
|
62
|
30
|
|
|
|
|
41
|
my $wanted_id = shift; |
|
63
|
|
|
|
|
|
|
|
|
64
|
30
|
|
|
|
|
154
|
my $p = HTML::Parser->new( |
|
65
|
|
|
|
|
|
|
start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ], |
|
66
|
|
|
|
|
|
|
end_h => [ \&_parser_handle_end, 'self, tagname, line, column' ], |
|
67
|
|
|
|
|
|
|
text_h => [ \&_parser_handle_text, 'self, dtext' ], |
|
68
|
|
|
|
|
|
|
); |
|
69
|
30
|
|
|
|
|
1682
|
$p->{stack} = []; |
|
70
|
30
|
|
|
|
|
67
|
$p->{ids} = {}; |
|
71
|
30
|
100
|
|
|
|
73
|
if ( defined $wanted_id ) { |
|
72
|
27
|
|
|
|
|
46
|
$p->{wanted_id} = $wanted_id; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
30
|
|
|
|
|
82
|
$p->empty_element_tags(1); |
|
76
|
30
|
|
|
|
|
195
|
$p->parse($html); |
|
77
|
30
|
|
|
|
|
96
|
$p->eof; |
|
78
|
|
|
|
|
|
|
|
|
79
|
30
|
100
|
|
|
|
61
|
if ( !defined $wanted_id ) { |
|
80
|
|
|
|
|
|
|
# With a wanted_id, we would have stopped parsing early and left tags on the stack, so don't check. |
|
81
|
3
|
100
|
|
|
|
6
|
if ( my $n = scalar @{$p->{stack}} ) { |
|
|
3
|
|
|
|
|
23
|
|
|
82
|
1
|
|
|
|
|
12
|
warn "$n tag(s) unclosed at end of document.\n"; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
|
|
86
|
30
|
|
|
|
|
166
|
return $p->{ids}; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub _parser_handle_start { |
|
91
|
529
|
|
|
529
|
|
823
|
my $parser = shift; |
|
92
|
529
|
|
|
|
|
678
|
my $tagname = shift; |
|
93
|
529
|
|
|
|
|
618
|
my $attr = shift; |
|
94
|
529
|
|
|
|
|
633
|
my $line = shift; |
|
95
|
529
|
|
|
|
|
601
|
my $column = shift; |
|
96
|
|
|
|
|
|
|
|
|
97
|
529
|
100
|
|
|
|
1071
|
return if $HTML::Tagset::emptyElement{$tagname}; |
|
98
|
|
|
|
|
|
|
|
|
99
|
505
|
|
|
|
|
667
|
my $id = $attr->{id}; |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# If it's a dupe ID, warn and ignore the ID. |
|
102
|
505
|
50
|
66
|
|
|
1020
|
if ( defined($id) && exists $parser->{ids}{$id} ) { |
|
103
|
0
|
|
|
|
|
0
|
warn "Duplicate ID $id found in <$tagname> at $line:$column\n"; |
|
104
|
0
|
|
|
|
|
0
|
$id = undef; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
|
|
|
|
|
|
|
|
107
|
505
|
|
|
|
|
652
|
my $stack = $parser->{stack}; |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Tags like and that don't have to close themselves get closed another of them comes along. |
|
110
|
|
|
|
|
|
|
# For example: |
|
111
|
|
|
|
|
|
|
# |
|
112
|
|
|
|
|
|
|
# whatever |
|
113
|
|
|
|
|
|
|
# thingy |
|
114
|
|
|
|
|
|
|
# |
|
115
|
505
|
100
|
66
|
|
|
920
|
if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) { |
|
|
125
|
|
100
|
|
|
442
|
|
|
116
|
36
|
|
|
|
|
45
|
my $item = pop @{$stack}; |
|
|
36
|
|
|
|
|
50
|
|
|
117
|
36
|
|
|
|
|
62
|
_close_tag( $parser, $item ); |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
505
|
|
|
|
|
611
|
push @{$stack}, [ $tagname, $id, '' ]; |
|
|
505
|
|
|
|
|
1119
|
|
|
121
|
|
|
|
|
|
|
|
|
122
|
505
|
|
|
|
|
1780
|
return; |
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub _parser_handle_end { |
|
127
|
402
|
|
|
402
|
|
589
|
my $parser = shift; |
|
128
|
402
|
|
|
|
|
504
|
my $tagname = shift; |
|
129
|
402
|
|
|
|
|
481
|
my $line = shift; |
|
130
|
402
|
|
|
|
|
459
|
my $column = shift; |
|
131
|
|
|
|
|
|
|
|
|
132
|
402
|
100
|
|
|
|
747
|
return if $HTML::Tagset::emptyElement{$tagname}; |
|
133
|
|
|
|
|
|
|
|
|
134
|
389
|
|
|
|
|
497
|
my $stack = $parser->{stack}; |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Deal with tags that close others. |
|
137
|
389
|
50
|
|
|
|
442
|
if ( @{$stack} ) { |
|
|
389
|
|
|
|
|
677
|
|
|
138
|
389
|
|
|
|
|
517
|
my $previous_item = $stack->[-1]; |
|
139
|
389
|
|
|
|
|
474
|
my $previous_tagname = $previous_item->[0]; |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
#warn "tagname $tagname hprase markup = " , $HTML::Tagset::isPhraseMarkup{$tagname} // 'undef', ' previous = ' . $previous_tagname; |
|
142
|
|
|
|
|
|
|
my $this_tag_closes_previous_one = |
|
143
|
|
|
|
|
|
|
( $tagname ne $previous_tagname ) |
|
144
|
|
|
|
|
|
|
&& |
|
145
|
|
|
|
|
|
|
( |
|
146
|
|
|
|
|
|
|
( ($tagname eq 'ul' || $tagname eq 'ol') && $previous_tagname eq 'li' ) |
|
147
|
|
|
|
|
|
|
|| |
|
148
|
|
|
|
|
|
|
( ($tagname eq 'dl') && ($previous_tagname eq 'dt' || $previous_tagname eq 'dd') ) |
|
149
|
|
|
|
|
|
|
|| |
|
150
|
389
|
|
100
|
|
|
817
|
( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' ) |
|
151
|
|
|
|
|
|
|
) |
|
152
|
|
|
|
|
|
|
; |
|
153
|
389
|
100
|
|
|
|
644
|
if ( $this_tag_closes_previous_one ) { |
|
154
|
14
|
|
|
|
|
18
|
_close_tag( $parser, pop @{$stack} ); |
|
|
14
|
|
|
|
|
25
|
|
|
155
|
|
|
|
|
|
|
} |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
389
|
50
|
|
|
|
495
|
if ( !@{$stack} ) { |
|
|
389
|
|
|
|
|
666
|
|
|
159
|
0
|
|
|
|
|
0
|
warn "Unexpected closing $tagname> at $line:$column\n"; |
|
160
|
0
|
|
|
|
|
0
|
return; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
389
|
100
|
|
|
|
693
|
if ( $tagname ne $stack->[-1][0] ) { |
|
163
|
4
|
|
|
|
|
40
|
warn "Unexpected closing $tagname> at $line:$column: Expecting $stack->[-1][0]>\n"; |
|
164
|
4
|
|
|
|
|
30
|
return; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
385
|
|
|
|
|
461
|
_close_tag( $parser, pop @{$stack} ); |
|
|
385
|
|
|
|
|
791
|
|
|
168
|
|
|
|
|
|
|
|
|
169
|
385
|
|
|
|
|
1175
|
return; |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub _parser_handle_text { |
|
174
|
833
|
|
|
833
|
|
1207
|
my $parser = shift; |
|
175
|
833
|
|
|
|
|
1073
|
my $text = shift; |
|
176
|
|
|
|
|
|
|
|
|
177
|
833
|
|
|
|
|
947
|
for my $item ( @{$parser->{stack}} ) { |
|
|
833
|
|
|
|
|
1443
|
|
|
178
|
4300
|
100
|
|
|
|
6984
|
if ( $item->[1] ) { # Only accumulate text for tags with IDs. |
|
179
|
792
|
|
|
|
|
1200
|
$item->[2] .= $text; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
|
|
183
|
833
|
|
|
|
|
2940
|
return; |
|
184
|
|
|
|
|
|
|
} |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub _close_tag { |
|
188
|
435
|
|
|
435
|
|
541
|
my $parser = shift; |
|
189
|
435
|
|
|
|
|
516
|
my $item = shift; |
|
190
|
|
|
|
|
|
|
|
|
191
|
435
|
|
|
|
|
499
|
my (undef, $id, $text) = @{$item}; |
|
|
435
|
|
|
|
|
714
|
|
|
192
|
435
|
100
|
|
|
|
752
|
if ( defined $id ) { |
|
193
|
109
|
|
|
|
|
126
|
my $keepit; |
|
194
|
|
|
|
|
|
|
|
|
195
|
109
|
100
|
|
|
|
194
|
if ( defined $parser->{wanted_id} ) { |
|
196
|
|
|
|
|
|
|
# We're searching for a specific ID. |
|
197
|
86
|
100
|
|
|
|
144
|
if ( $id eq $parser->{wanted_id} ) { |
|
198
|
24
|
|
|
|
|
33
|
$keepit = 1; |
|
199
|
24
|
|
|
|
|
59
|
$parser->eof; |
|
200
|
|
|
|
|
|
|
} |
|
201
|
|
|
|
|
|
|
else { |
|
202
|
|
|
|
|
|
|
# No need to keep the text of an ID we don't care about. |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
else { |
|
206
|
23
|
|
|
|
|
32
|
$keepit = 1; |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
|
|
209
|
109
|
100
|
|
|
|
177
|
if ( $keepit ) { |
|
210
|
47
|
|
|
|
|
160
|
$text =~ s/^\s+//; |
|
211
|
47
|
|
|
|
|
261
|
$text =~ s/\s+$//; |
|
212
|
47
|
|
|
|
|
417
|
$text =~ s/\s+/ /g; |
|
213
|
47
|
|
|
|
|
127
|
$parser->{ids}{$id} = $text; |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
435
|
|
|
|
|
629
|
return; |
|
218
|
|
|
|
|
|
|
} |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 AUTHOR |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Andy Lester, C<< >> |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head1 BUGS |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Please report any bugs or feature requests at L.. |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head1 SUPPORT |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
perldoc HTML::Scrape |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
You can also look for information at: |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=over 4 |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=item * Search CPAN |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
L |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=back |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Andy Lester. |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
This is free software, licensed under: The Artistic License 2.0 (GPL Compatible) |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
1; # End of HTML::Scrape |