File Coverage

blib/lib/Plucene/SearchEngine/Index/HTML.pm
Criterion Covered Total %
statement 6 18 33.3
branch 0 4 0.0
condition n/a
subroutine 2 3 66.6
pod 0 1 0.0
total 8 26 30.7


line stmt bran cond sub pod time code
1             package Plucene::SearchEngine::Index::HTML;
2 1     1   1936 use base 'Plucene::SearchEngine::Index::Base';
  1         2  
  1         697  
3 1     1   1301 use HTML::TreeBuilder;
  1         43648  
  1         15  
4             __PACKAGE__->register_handler("text/html", ".html");
5              
6             =head1 NAME
7              
8             Plucene::SearchEngine::Index::HTML - Backend for simply parsing HTML
9              
10             =head1 DESCRIPTION
11              
12             This backend analyses an HTML file for the following Plucene fields:
13              
14             =over 3
15              
16             =item text
17              
18             The text part of the HTML
19              
20             =item link
21              
22             A list of links in the HTML
23              
24             =back
25              
26             Additionally, any C<meta> tags are turned into Plucene fields.
27              
28             =cut
29              
30             sub gather_data_from_file {
31 0     0 0   my ($self, $filename) = @_;
32 0           my $tree = HTML::TreeBuilder->new;
33 0           $tree->parse_file($filename);
34 0           for($tree->look_down(_tag => "meta")) {
35 0 0         next if $_->attr("http-equiv");
36 0 0         next unless $_->attr("value");
37 0           $self->add_data($_->attr("name"), "Text", $_->attr("value"));
38             }
39 0           for (@{$tree->extract_links("a")}) {
  0            
40 0           $self->add_data("link", "Text", $_->[0]);
41             }
42 0           $self->add_data("text", "UnStored", $tree->as_trimmed_text);
43 0           return $self;
44             }
45              
46             1;