File Coverage

blib/lib/HTML/Scrape.pm

Criterion	Covered	Total	%
statement	99	103	96.1
branch	29	32	90.6
condition	10	12	83.3
subroutine	12	12	100.0
pod	2	2	100.0
total	152	161	94.4

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Scrape;
2
3	3			3		214236	use 5.10.1;
	3					23
4	3			3		16	use strict;
	3					5
	3					58
5	3			3		16	use warnings;
	3					6
	3					153
6
7							=head1 NAME
8
9							HTML::Scrape - The great new HTML::Scrape!
10
11							=head1 VERSION
12
13							Version 0.2.0
14
15							=cut
16
17							our $VERSION = '0.2.0';
18
19	3			3		1830	use HTML::Parser;
	3					17500
	3					112
20	3			3		3070	use HTML::TokeParser;
	3					13515
	3					91
21	3			3		19	use HTML::Tagset;
	3					5
	3					2641
22
23
24							=head1 SYNOPSIS
25
26							Handy helpers for common HTML scraping tasks.
27
28							use HTML::Scrape;
29
30							my $ids = HTML::Scrape::scrape_all_ids( $html );
31
32							=head1 FUNCTIONS
33
34							=head2 scrape_id( $id, $html )
35
36							Scrapes the text of the single ID C<$id> from C<$html>.
37
38							=cut
39
40							sub scrape_id {
41	15			15	1	14072	my $id = shift;
42	15					21	my $html = shift;
43
44	15					41	my $all_ids = scrape_all_ids( $html, $id );
45
46	15					67	return $all_ids->{$id};
47							}
48
49
50							=head2 scrape_all_ids( $html [, $specific_id ] )
51
52							Parses the entire web page and returns all the text in a hashref keyed on ID.
53
54							If you pass in C<$specific_id>, then only that ID will be scraped,
55							and parsing will stop once it is found. The better way to do this is by
56							calling C.
57
58							=cut
59
60							sub scrape_all_ids {
61	30			30	1	6447	my $html = shift;
62	30					41	my $wanted_id = shift;
63
64	30					154	my $p = HTML::Parser->new(
65							start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ],
66							end_h => [ \&_parser_handle_end, 'self, tagname, line, column' ],
67							text_h => [ \&_parser_handle_text, 'self, dtext' ],
68							);
69	30					1682	$p->{stack} = [];
70	30					67	$p->{ids} = {};
71	30	100				73	if ( defined $wanted_id ) {
72	27					46	$p->{wanted_id} = $wanted_id;
73							}
74
75	30					82	$p->empty_element_tags(1);
76	30					195	$p->parse($html);
77	30					96	$p->eof;
78
79	30	100				61	if ( !defined $wanted_id ) {
80							# With a wanted_id, we would have stopped parsing early and left tags on the stack, so don't check.
81	3	100				6	if ( my $n = scalar @{$p->{stack}} ) {
	3					23
82	1					12	warn "$n tag(s) unclosed at end of document.\n";
83							}
84							}
85
86	30					166	return $p->{ids};
87							}
88
89
90							sub _parser_handle_start {
91	529			529		823	my $parser = shift;
92	529					678	my $tagname = shift;
93	529					618	my $attr = shift;
94	529					633	my $line = shift;
95	529					601	my $column = shift;
96
97	529	100				1071	return if $HTML::Tagset::emptyElement{$tagname};
98
99	505					667	my $id = $attr->{id};
100
101							# If it's a dupe ID, warn and ignore the ID.
102	505	50	66			1020	if ( defined($id) && exists $parser->{ids}{$id} ) {
103	0					0	warn "Duplicate ID $id found in <$tagname> at $line:$column\n";
104	0					0	$id = undef;
105							}
106
107	505					652	my $stack = $parser->{stack};
108
109							# Tags like and that don't have to close themselves get closed another of them comes along.
110							# For example:
111							#
112							# whatever
113							# thingy
114							#
115	505	100	66			920	if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) {
	125		100			442
116	36					45	my $item = pop @{$stack};
	36					50
117	36					62	_close_tag( $parser, $item );
118							}
119
120	505					611	push @{$stack}, [ $tagname, $id, '' ];
	505					1119
121
122	505					1780	return;
123							}
124
125
126							sub _parser_handle_end {
127	402			402		589	my $parser = shift;
128	402					504	my $tagname = shift;
129	402					481	my $line = shift;
130	402					459	my $column = shift;
131
132	402	100				747	return if $HTML::Tagset::emptyElement{$tagname};
133
134	389					497	my $stack = $parser->{stack};
135
136							# Deal with tags that close others.
137	389	50				442	if ( @{$stack} ) {
	389					677
138	389					517	my $previous_item = $stack->[-1];
139	389					474	my $previous_tagname = $previous_item->[0];
140
141							#warn "tagname $tagname hprase markup = " , $HTML::Tagset::isPhraseMarkup{$tagname} // 'undef', ' previous = ' . $previous_tagname;
142							my $this_tag_closes_previous_one =
143							( $tagname ne $previous_tagname )
144							&&
145							(
146							( ($tagname eq 'ul' \|\| $tagname eq 'ol') && $previous_tagname eq 'li' )
147							\|\|
148							( ($tagname eq 'dl') && ($previous_tagname eq 'dt' \|\| $previous_tagname eq 'dd') )
149							\|\|
150	389		100			817	( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' )
151							)
152							;
153	389	100				644	if ( $this_tag_closes_previous_one ) {
154	14					18	_close_tag( $parser, pop @{$stack} );
	14					25
155							}
156							}
157
158	389	50				495	if ( !@{$stack} ) {
	389					666
159	0					0	warn "Unexpected closing at $line:$column\n";
160	0					0	return;
161							}
162	389	100				693	if ( $tagname ne $stack->[-1][0] ) {
163	4					40	warn "Unexpected closing at $line:$column: Expecting [-1][0]>\n";
164	4					30	return;
165							}
166
167	385					461	_close_tag( $parser, pop @{$stack} );
	385					791
168
169	385					1175	return;
170							}
171
172
173							sub _parser_handle_text {
174	833			833		1207	my $parser = shift;
175	833					1073	my $text = shift;
176
177	833					947	for my $item ( @{$parser->{stack}} ) {
	833					1443
178	4300	100				6984	if ( $item->[1] ) { # Only accumulate text for tags with IDs.
179	792					1200	$item->[2] .= $text;
180							}
181							}
182
183	833					2940	return;
184							}
185
186
187							sub _close_tag {
188	435			435		541	my $parser = shift;
189	435					516	my $item = shift;
190
191	435					499	my (undef, $id, $text) = @{$item};
	435					714
192	435	100				752	if ( defined $id ) {
193	109					126	my $keepit;
194
195	109	100				194	if ( defined $parser->{wanted_id} ) {
196							# We're searching for a specific ID.
197	86	100				144	if ( $id eq $parser->{wanted_id} ) {
198	24					33	$keepit = 1;
199	24					59	$parser->eof;
200							}
201							else {
202							# No need to keep the text of an ID we don't care about.
203							}
204							}
205							else {
206	23					32	$keepit = 1;
207							}
208
209	109	100				177	if ( $keepit ) {
210	47					160	$text =~ s/^\s+//;
211	47					261	$text =~ s/\s+$//;
212	47					417	$text =~ s/\s+/ /g;
213	47					127	$parser->{ids}{$id} = $text;
214							}
215							}
216
217	435					629	return;
218							}
219
220
221							=head1 AUTHOR
222
223							Andy Lester, C<< >>
224
225							=head1 BUGS
226
227							Please report any bugs or feature requests at L..
228
229							=head1 SUPPORT
230
231							You can find documentation for this module with the perldoc command.
232
233							perldoc HTML::Scrape
234
235							You can also look for information at:
236
237							=over 4
238
239							=item * Search CPAN
240
241							L
242
243							=back
244
245							=head1 LICENSE AND COPYRIGHT
246
247							This software is Copyright (c) 2023 by Andy Lester.
248
249							This is free software, licensed under: The Artistic License 2.0 (GPL Compatible)
250
251							=cut
252
253							1; # End of HTML::Scrape