line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::SearchEngine::Index::HTML; |
2
|
1
|
|
|
1
|
|
1936
|
use base 'Plucene::SearchEngine::Index::Base'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
697
|
|
3
|
1
|
|
|
1
|
|
1301
|
use HTML::TreeBuilder; |
|
1
|
|
|
|
|
43648
|
|
|
1
|
|
|
|
|
15
|
|
4
|
|
|
|
|
|
|
# Register this class with the indexing framework, passing "text/html" and
# ".html" (presumably the MIME type and file extension this backend handles).
# NOTE(review): register_handler is not defined in this file; presumably it is
# inherited from Plucene::SearchEngine::Index::Base -- confirm.
__PACKAGE__->register_handler("text/html", ".html"); |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
Plucene::SearchEngine::Index::HTML - Backend for simply parsing HTML |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 DESCRIPTION |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
This backend analyses an HTML file for the following Plucene fields: |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=over 3 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=item text |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
The text part of the HTML |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=item link |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
A list of links in the HTML |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=back |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Additionally, any C<E<lt>metaE<gt>> tags are turned into Plucene fields. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=cut |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Parse the HTML file at $filename and record what it contains on $self by
# calling $self->add_data(FIELD, TYPE, VALUE):
#
#   * one "Text" field per named <meta> tag, keyed by the tag's name
#     (tags carrying an http-equiv attribute are skipped);
#   * one "Text" field called "link" for every <a> link in the document;
#   * one "UnStored" field called "text" holding the document's trimmed text.
#
# Parameters:
#   $self     - the object that receives the add_data() calls
#   $filename - path of the HTML file handed to HTML::TreeBuilder->parse_file
#
# Returns $self.
sub gather_data_from_file {
    my ($self, $filename) = @_;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($filename);

    for my $meta ($tree->look_down(_tag => "meta")) {
        next if $meta->attr("http-equiv");

        # A meta tag without a name has no field it could be stored under.
        my $name = $meta->attr("name");
        next unless defined $name && length $name;

        # "value" is what this backend has always read, so it keeps priority;
        # "content" is the attribute HTML actually defines for <meta>, so fall
        # back to it instead of silently dropping standard meta tags.
        my $payload = $meta->attr("value") || $meta->attr("content");
        next unless $payload;

        $self->add_data($name, "Text", $payload);
    }

    # Each extract_links entry is an array ref whose first element is the URL.
    for my $link (@{ $tree->extract_links("a") }) {
        $self->add_data("link", "Text", $link->[0]);
    }

    $self->add_data("text", "UnStored", $tree->as_trimmed_text);

    # Explicitly tear the tree down once everything has been read from it, so
    # indexing many files does not accumulate parsed documents in memory.
    $tree->delete;

    return $self;
}
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
1; |