| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HTML::Scrape; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
140399
|
use 5.10.1; |
|
|
2
|
|
|
|
|
12
|
|
|
4
|
2
|
|
|
2
|
|
17
|
use strict; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
38
|
|
|
5
|
2
|
|
|
2
|
|
9
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
134
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTML::Scrape - Handy helpers for common HTML scraping tasks |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.1.0 |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.1.0'; |
|
18
|
|
|
|
|
|
|
|
|
19
|
2
|
|
|
2
|
|
1214
|
use HTML::Parser; |
|
|
2
|
|
|
|
|
11663
|
|
|
|
2
|
|
|
|
|
66
|
|
|
20
|
2
|
|
|
2
|
|
994
|
use HTML::Tagset; |
|
|
2
|
|
|
|
|
2670
|
|
|
|
2
|
|
|
|
|
1636
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
Handy helpers for common HTML scraping tasks. |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
use HTML::Scrape; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
my $ids = HTML::Scrape::scrape_all_ids( $html ); |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 scrape_all_ids( $html ) |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
Parses the entire web page and returns all the text in a hashref keyed on ID. |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=cut |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub scrape_all_ids { |
|
40
|
2
|
|
|
2
|
1
|
1197
|
my $html = shift; |
|
41
|
|
|
|
|
|
|
|
|
42
|
2
|
|
|
|
|
17
|
my $p = HTML::Parser->new( |
|
43
|
|
|
|
|
|
|
start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ], |
|
44
|
|
|
|
|
|
|
end_h => [ \&_parser_handle_end, 'self, tagname, line, column' ], |
|
45
|
|
|
|
|
|
|
text_h => [ \&_parser_handle_text, 'self, dtext' ], |
|
46
|
|
|
|
|
|
|
ignore_elements => [qw(script style)], |
|
47
|
|
|
|
|
|
|
); |
|
48
|
2
|
|
|
|
|
167
|
$p->{stack} = []; |
|
49
|
2
|
|
|
|
|
5
|
$p->{ids} = {}; |
|
50
|
|
|
|
|
|
|
|
|
51
|
2
|
|
|
|
|
15
|
$p->empty_element_tags(1); |
|
52
|
2
|
|
|
|
|
21
|
$p->parse($html); |
|
53
|
2
|
|
|
|
|
16
|
$p->eof; |
|
54
|
|
|
|
|
|
|
|
|
55
|
2
|
50
|
|
|
|
2
|
if ( my $n = scalar @{$p->{stack}} ) { |
|
|
2
|
|
|
|
|
7
|
|
|
56
|
0
|
|
|
|
|
0
|
warn "$n tag(s) unclosed at end of document.\n"; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
|
|
59
|
2
|
|
|
|
|
15
|
return $p->{ids}; |
|
60
|
|
|
|
|
|
|
} |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub _parser_handle_start { |
|
64
|
22
|
|
|
22
|
|
30
|
my $parser = shift; |
|
65
|
22
|
|
|
|
|
33
|
my $tagname = shift; |
|
66
|
22
|
|
|
|
|
26
|
my $attr = shift; |
|
67
|
22
|
|
|
|
|
37
|
my $line = shift; |
|
68
|
22
|
|
|
|
|
22
|
my $column = shift; |
|
69
|
|
|
|
|
|
|
|
|
70
|
22
|
100
|
|
|
|
56
|
return if $HTML::Tagset::emptyElement{$tagname}; |
|
71
|
|
|
|
|
|
|
|
|
72
|
19
|
|
|
|
|
22
|
my $id = $attr->{id}; |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# If it's a dupe ID, warn and ignore the ID. |
|
75
|
19
|
50
|
66
|
|
|
53
|
if ( defined($id) && exists $parser->{ids}{$id} ) { |
|
76
|
0
|
|
|
|
|
0
|
warn "Duplicate ID $id found in <$tagname> at $line:$column\n"; |
|
77
|
0
|
|
|
|
|
0
|
$id = undef; |
|
78
|
|
|
|
|
|
|
} |
|
79
|
|
|
|
|
|
|
|
|
80
|
19
|
|
|
|
|
29
|
my $stack = $parser->{stack}; |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Tags like <li> and <p> that don't have to close themselves get closed when another of them comes along. |
|
83
|
|
|
|
|
|
|
# For example: |
|
84
|
|
|
|
|
|
|
# <ul> |
|
85
|
|
|
|
|
|
|
# <li>whatever |
|
86
|
|
|
|
|
|
|
# <li>thingy |
|
87
|
|
|
|
|
|
|
# </ul> |
|
88
|
19
|
100
|
66
|
|
|
43
|
if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) { |
|
|
8
|
|
100
|
|
|
31
|
|
|
89
|
3
|
|
|
|
|
5
|
my $item = pop @{$stack}; |
|
|
3
|
|
|
|
|
4
|
|
|
90
|
3
|
|
|
|
|
20
|
_close_tag( $parser, $item ); |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
|
|
93
|
19
|
|
|
|
|
23
|
push @{$stack}, [ $tagname, $id, '' ]; |
|
|
19
|
|
|
|
|
64
|
|
|
94
|
|
|
|
|
|
|
|
|
95
|
19
|
|
|
|
|
73
|
return; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub _parser_handle_end { |
|
100
|
16
|
|
|
16
|
|
28
|
my $parser = shift; |
|
101
|
16
|
|
|
|
|
21
|
my $tagname = shift; |
|
102
|
16
|
|
|
|
|
19
|
my $line = shift; |
|
103
|
16
|
|
|
|
|
19
|
my $column = shift; |
|
104
|
|
|
|
|
|
|
|
|
105
|
16
|
100
|
|
|
|
34
|
return if $HTML::Tagset::emptyElement{$tagname}; |
|
106
|
|
|
|
|
|
|
|
|
107
|
14
|
|
|
|
|
18
|
my $stack = $parser->{stack}; |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Deal with tags that close others. |
|
110
|
14
|
50
|
|
|
|
16
|
if ( @{$stack} ) { |
|
|
14
|
|
|
|
|
29
|
|
|
111
|
14
|
|
|
|
|
18
|
my $previous_item = $stack->[-1]; |
|
112
|
14
|
|
|
|
|
20
|
my $previous_tagname = $previous_item->[0]; |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
#warn "tagname $tagname hprase markup = " , $HTML::Tagset::isPhraseMarkup{$tagname} // 'undef', ' previous = ' . $previous_tagname; |
|
115
|
|
|
|
|
|
|
my $this_tag_closes_previous_one = |
|
116
|
|
|
|
|
|
|
( $tagname ne $previous_tagname ) |
|
117
|
|
|
|
|
|
|
&& |
|
118
|
|
|
|
|
|
|
( |
|
119
|
|
|
|
|
|
|
( ($tagname eq 'ul' || $tagname eq 'ol') && $previous_tagname eq 'li' ) |
|
120
|
|
|
|
|
|
|
|| |
|
121
|
|
|
|
|
|
|
( ($tagname eq 'dl') && ($previous_tagname eq 'dt' || $previous_tagname eq 'dd') ) |
|
122
|
|
|
|
|
|
|
|| |
|
123
|
14
|
|
66
|
|
|
45
|
( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' ) |
|
124
|
|
|
|
|
|
|
) |
|
125
|
|
|
|
|
|
|
; |
|
126
|
14
|
100
|
|
|
|
26
|
if ( $this_tag_closes_previous_one ) { |
|
127
|
2
|
|
|
|
|
2
|
_close_tag( $parser, pop @{$stack} ); |
|
|
2
|
|
|
|
|
5
|
|
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
} |
|
130
|
|
|
|
|
|
|
|
|
131
|
14
|
50
|
|
|
|
18
|
if ( !@{$stack} ) { |
|
|
14
|
|
|
|
|
29
|
|
|
132
|
0
|
|
|
|
|
0
|
warn "Unexpected closing </$tagname> at $line:$column\n"; |
|
133
|
0
|
|
|
|
|
0
|
return; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
14
|
50
|
|
|
|
26
|
if ( $tagname ne $stack->[-1][0] ) { |
|
136
|
0
|
|
|
|
|
0
|
warn "Unexpected closing </$tagname> at $line:$column: Expecting </$stack->[-1][0]>\n"; |
|
137
|
0
|
|
|
|
|
0
|
return; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
|
|
140
|
14
|
|
|
|
|
17
|
_close_tag( $parser, pop @{$stack} ); |
|
|
14
|
|
|
|
|
30
|
|
|
141
|
|
|
|
|
|
|
|
|
142
|
14
|
|
|
|
|
46
|
return; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub _parser_handle_text { |
|
147
|
38
|
|
|
38
|
|
56
|
my $parser = shift; |
|
148
|
38
|
|
|
|
|
53
|
my $text = shift; |
|
149
|
|
|
|
|
|
|
|
|
150
|
38
|
|
|
|
|
42
|
for my $item ( @{$parser->{stack}} ) { |
|
|
38
|
|
|
|
|
68
|
|
|
151
|
87
|
100
|
|
|
|
149
|
if ( $item->[1] ) { # Only accumulate text for tags with IDs. |
|
152
|
17
|
|
|
|
|
41
|
$item->[2] .= $text; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
38
|
|
|
|
|
156
|
return; |
|
157
|
|
|
|
|
|
|
} |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
sub _close_tag { |
|
161
|
19
|
|
|
19
|
|
26
|
my $parser = shift; |
|
162
|
19
|
|
|
|
|
21
|
my $item = shift; |
|
163
|
|
|
|
|
|
|
|
|
164
|
19
|
|
|
|
|
20
|
my (undef, $id, $text) = @{$item}; |
|
|
19
|
|
|
|
|
37
|
|
|
165
|
19
|
100
|
|
|
|
34
|
if ( defined $id ) { |
|
166
|
10
|
|
|
|
|
34
|
$text =~ s/^\s+//; |
|
167
|
10
|
|
|
|
|
35
|
$text =~ s/\s+$//; |
|
168
|
10
|
|
|
|
|
28
|
$text =~ s/\s+/ /g; |
|
169
|
10
|
|
|
|
|
22
|
$parser->{ids}{$id} = $text; |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
19
|
|
|
|
|
32
|
return; |
|
173
|
|
|
|
|
|
|
} |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 AUTHOR |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Andy Lester |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head1 BUGS |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
Please report any bugs or feature requests through the project's issue tracker. |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head1 SUPPORT |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
perldoc HTML::Scrape |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
You can also look for information at: |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=over 4 |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=item * Search CPAN |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
L<https://metacpan.org/release/HTML-Scrape> |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=back |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Andy Lester. |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
This is free software, licensed under: The Artistic License 2.0 (GPL Compatible) |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
1; # End of HTML::Scrape |