File Coverage

blib/lib/HTML/Scrape.pm

Criterion	Covered	Total	%
statement	78	85	91.7
branch	17	22	77.2
condition	9	12	75.0
subroutine	10	10	100.0
pod	1	1	100.0
total	115	130	88.4

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Scrape;
2
3	2			2		140399	use 5.10.1;
	2					12
4	2			2		17	use strict;
	2					6
	2					38
5	2			2		9	use warnings;
	2					4
	2					134
6
7							=head1 NAME
8
9							HTML::Scrape - The great new HTML::Scrape!
10
11							=head1 VERSION
12
13							Version 0.1.0
14
15							=cut
16
17							our $VERSION = '0.1.0';
18
19	2			2		1214	use HTML::Parser;
	2					11663
	2					66
20	2			2		994	use HTML::Tagset;
	2					2670
	2					1636
21
22
23							=head1 SYNOPSIS
24
25							Handy helpers for common HTML scraping tasks.
26
27							use HTML::Scrape;
28
29							my $ids = HTML::Scrape::scrape_all_ids( $html );
30
31							=head1 FUNCTIONS
32
33							=head2 scrape_all_ids( $html )
34
35							Parses the entire web page and returns all the text in a hashref keyed on ID.
36
37							=cut
38
39							sub scrape_all_ids {
40	2			2	1	1197	my $html = shift;
41
42	2					17	my $p = HTML::Parser->new(
43							start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ],
44							end_h => [ \&_parser_handle_end, 'self, tagname, line, column' ],
45							text_h => [ \&_parser_handle_text, 'self, dtext' ],
46							ignore_elements => [qw(script style)],
47							);
48	2					167	$p->{stack} = [];
49	2					5	$p->{ids} = {};
50
51	2					15	$p->empty_element_tags(1);
52	2					21	$p->parse($html);
53	2					16	$p->eof;
54
55	2	50				2	if ( my $n = scalar @{$p->{stack}} ) {
	2					7
56	0					0	warn "$n tag(s) unclosed at end of document.\n";
57							}
58
59	2					15	return $p->{ids};
60							}
61
62
63							sub _parser_handle_start {
64	22			22		30	my $parser = shift;
65	22					33	my $tagname = shift;
66	22					26	my $attr = shift;
67	22					37	my $line = shift;
68	22					22	my $column = shift;
69
70	22	100				56	return if $HTML::Tagset::emptyElement{$tagname};
71
72	19					22	my $id = $attr->{id};
73
74							# If it's a dupe ID, warn and ignore the ID.
75	19	50	66			53	if ( defined($id) && exists $parser->{ids}{$id} ) {
76	0					0	warn "Duplicate ID $id found in <$tagname> at $line:$column\n";
77	0					0	$id = undef;
78							}
79
80	19					29	my $stack = $parser->{stack};
81
82							# Tags like and that don't have to close themselves get closed another of them comes along.
83							# For example:
84							#
85							# whatever
86							# thingy
87							#
88	19	100	66			43	if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) {
	8		100			31
89	3					5	my $item = pop @{$stack};
	3					4
90	3					20	_close_tag( $parser, $item );
91							}
92
93	19					23	push @{$stack}, [ $tagname, $id, '' ];
	19					64
94
95	19					73	return;
96							}
97
98
99							sub _parser_handle_end {
100	16			16		28	my $parser = shift;
101	16					21	my $tagname = shift;
102	16					19	my $line = shift;
103	16					19	my $column = shift;
104
105	16	100				34	return if $HTML::Tagset::emptyElement{$tagname};
106
107	14					18	my $stack = $parser->{stack};
108
109							# Deal with tags that close others.
110	14	50				16	if ( @{$stack} ) {
	14					29
111	14					18	my $previous_item = $stack->[-1];
112	14					20	my $previous_tagname = $previous_item->[0];
113
114							#warn "tagname $tagname hprase markup = " , $HTML::Tagset::isPhraseMarkup{$tagname} // 'undef', ' previous = ' . $previous_tagname;
115							my $this_tag_closes_previous_one =
116							( $tagname ne $previous_tagname )
117							&&
118							(
119							( ($tagname eq 'ul' \|\| $tagname eq 'ol') && $previous_tagname eq 'li' )
120							\|\|
121							( ($tagname eq 'dl') && ($previous_tagname eq 'dt' \|\| $previous_tagname eq 'dd') )
122							\|\|
123	14		66			45	( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' )
124							)
125							;
126	14	100				26	if ( $this_tag_closes_previous_one ) {
127	2					2	_close_tag( $parser, pop @{$stack} );
	2					5
128							}
129							}
130
131	14	50				18	if ( !@{$stack} ) {
	14					29
132	0					0	warn "Unexpected closing at $line:$column\n";
133	0					0	return;
134							}
135	14	50				26	if ( $tagname ne $stack->[-1][0] ) {
136	0					0	warn "Unexpected closing at $line:$column: Expecting [-1][0]>\n";
137	0					0	return;
138							}
139
140	14					17	_close_tag( $parser, pop @{$stack} );
	14					30
141
142	14					46	return;
143							}
144
145
146							sub _parser_handle_text {
147	38			38		56	my $parser = shift;
148	38					53	my $text = shift;
149
150	38					42	for my $item ( @{$parser->{stack}} ) {
	38					68
151	87	100				149	if ( $item->[1] ) { # Only accumulate text for tags with IDs.
152	17					41	$item->[2] .= $text;
153							}
154							}
155
156	38					156	return;
157							}
158
159
160							sub _close_tag {
161	19			19		26	my $parser = shift;
162	19					21	my $item = shift;
163
164	19					20	my (undef, $id, $text) = @{$item};
	19					37
165	19	100				34	if ( defined $id ) {
166	10					34	$text =~ s/^\s+//;
167	10					35	$text =~ s/\s+$//;
168	10					28	$text =~ s/\s+/ /g;
169	10					22	$parser->{ids}{$id} = $text;
170							}
171
172	19					32	return;
173							}
174
175
176							=head1 AUTHOR
177
178							Andy Lester, C<< >>
179
180							=head1 BUGS
181
182							Please report any bugs or feature requests at L..
183
184							=head1 SUPPORT
185
186							You can find documentation for this module with the perldoc command.
187
188							perldoc HTML::Scrape
189
190							You can also look for information at:
191
192							=over 4
193
194							=item * Search CPAN
195
196							L
197
198							=back
199
200							=head1 LICENSE AND COPYRIGHT
201
202							This software is Copyright (c) 2023 by Andy Lester.
203
204							This is free software, licensed under: The Artistic License 2.0 (GPL Compatible)
205
206							=cut
207
208							1; # End of HTML::Scrape