File Coverage

blib/lib/HTML/Index/Document.pm

Criterion	Covered	Total	%
statement	48	55	87.2
branch	13	24	54.1
condition			n/a
subroutine	9	9	100.0
pod	0	2	0.0
total	70	90	77.7

line	stmt	bran	sub	pod	time	code
1						package HTML::Index::Document;
2
3	8		8		220084	use strict;
	8				21
	8				281
4	8		8		44	use warnings;
	8				16
	8				259
5
6	8		8		13614	use Class::Struct;
	8				18652
	8				47
7	8		8		8577	use HTML::Entities qw( decode_entities );
	8				65284
	8				1398
8						require HTML::TreeBuilder;
9
10						struct 'HTML::Index::Document::Struct' => {
11						name => '$',
12						path => '$',
13						contents => '$',
14						parser => '$',
15						};
16
17						my @NON_VISIBLE_HTML_TAGS = qw(
18						style
19						script
20						head
21						);
22
23						my $NON_VISIBLE_HTML_TAGS = '(' . join( '\|', @NON_VISIBLE_HTML_TAGS ) . ')';
24
25	8		8		81	use vars qw( @ISA );
	8				21
	8				6126
26
27						@ISA = qw( HTML::Index::Document::Struct );
28
29						#------------------------------------------------------------------------------
30						#
31						# Constructor
32						#
33						#------------------------------------------------------------------------------
34
35						sub new
36						{
37	8		8	0	45	my $class = shift;
38	8				226	my $self = $class->SUPER::new( @_ );
39	8				295	$self->_init();
40	8				23	return $self;
41						}
42
43						sub parse
44						{
45	8		8	0	13	my $self = shift;
46
47	8				193	my $contents = $self->contents();
48
49	8	50			228	if ( lc( $self->parser ) eq 'html' )
		0
50						{
51	8				102	my $tree = HTML::TreeBuilder->new();
52	8				2111	$tree->parse( $contents );
53	8				13138	my $text = join( ' ', _get_text_array( $tree ) );
54	8				32	$tree->delete();
55	8				963	return $text;
56						}
57						elsif ( lc( $self->parser eq 'regex' ) )
58						{
59	0				0	my $text = $contents;
60						# get rid of non-visible (script / style / head) text
61	0				0	$text =~ s{
62						<$NON_VISIBLE_HTML_TAGS.*?> # a head, script, or style start tag
63						.*? # non-greedy match of anything
64						# matching end tag
65						}
66						{}gxis;
67						# get rid of tags
68	0				0	$text =~ s/<.*?>//gs;
69	0				0	$text = decode_entities( $text );
70	0				0	$text =~ s/[\s\240]+/ /g;
71	0				0	return $text;
72						}
73						else
74						{
75	0				0	die "Unrecognized value for parser - should be one of (html\|regex)\n";
76						}
77						}
78
79						#------------------------------------------------------------------------------
80						#
81						# Private functions
82						#
83						#------------------------------------------------------------------------------
84
85						sub _init
86						{
87	8		8		11	my $self = shift;
88	8	50			172	if ( my $path = $self->path() )
89						{
90	8	50			187	die "Can't read $path\n" unless -r $path;
91	8	50			170	unless ( $self->contents() )
92						{
93	8				285	open( FH, $path );
94	8				445	$self->contents( join( '', ) );
95	8				128	close( FH );
96						}
97	8	50			169	$self->name( $self->path() ) unless $self->name();
98						}
99	8	50			406	die "No name attribute\n" unless defined $self->name();
100	8	50			200	die "No contents attribute\n" unless defined $self->contents();
101	8	50			268	$self->parser( 'html' ) unless $self->parser();
102	8	50			389	die "parser attribute should be one of (html\|regex)\n"
103						unless $self->parser() =~ /^(html\|regex)$/i
104						;
105
106	8				79	return $self;
107						}
108
109						sub _get_text_array
110						{
111	22		22		32	my $element = shift;
112	22				29	my @text;
113
114	22				68	for my $child ( $element->content_list )
115						{
116	52	100			448	if ( ref( $child ) )
117						{
118	30	100			77	next if $child->tag =~ /^$NON_VISIBLE_HTML_TAGS$/;
119	14				125	push( @text, _get_text_array( $child ) );
120						}
121						else
122						{
123	22				51	push( @text, $child );
124						}
125						}
126
127	22				79	return @text;
128						}
129
130						#------------------------------------------------------------------------------
131						#
132						# Start of POD
133						#
134						#------------------------------------------------------------------------------
135
136						=head1 NAME
137
138						HTML::Index::Document - Perl object used by HTML::Index::Store to create an
139						index of HTML documents for searching
140
141						=head1 SYNOPSIS
142
143						$doc = HTML::Index::Document->new( path => $path );
144
145						$doc = HTML::Index::Document->new(
146						name => $name,
147						contents => $contents,
148						mod_time => $mod_time,
149						);
150
151						=head1 DESCRIPTION
152
153						This module allows you to create objects to represent HTML documents to be
154						indexed for searching using the HTML::Index modules. These might be HTML files
155						in a webserver document root, or HTML pages stored in a database, etc.
156
157						HTML::Index::Document is a subclass of Class::Struct, with 4 attributes:
158
159						=over 4
160
161						=item path
162
163						The path to the document. This is an optional attribute, but if used should
164						correspond to an existing, readable file.
165
166						=item name
167
168						The name of the document. This attribute is what is returned as a result of a
169						search, and is the primary identifier for the document. It should be unique. If
170						the path attribute is set, then the name attribute defaults to path. Otherwise,
171						it must be provided to the constructor.
172
173						=item contents
174
175						The (HTML) contents of the document. This attribute provides the text which is
176						indexed by HTML::Index::Store. If the path attribute is set, the contents
177						attribute defaults to the contents of path. Otherwise, it must be provided to
178						the constructor.
179
180						=item parser
181
182						Should be one of html or regex. If html, documents are parsed using
183						HTML::TreeBuilder to extract visible text. If regex, the
184						same job is done by a "quick and dirty" regex.
185
186						=back
187
188						=head1 SEE ALSO
189
190						=over 4
191
192						=item L
193
194						=item L
195
196						=back
197
198						=head1 AUTHOR
199
200						Ave Wrigley
201
202						=head1 COPYRIGHT
203
204						Copyright (c) 2001 Ave Wrigley. All rights reserved. This program is free
205						software; you can redistribute it and/or modify it under the same terms as Perl
206						itself.
207
208						=cut
209
210						#------------------------------------------------------------------------------
211						#
212						# End of POD
213						#
214						#------------------------------------------------------------------------------
215
216
217						#------------------------------------------------------------------------------
218						#
219						# True ...
220						#
221						#------------------------------------------------------------------------------
222
223						1;