line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::SearchEngine::Index::PDF; |
2
|
1
|
|
|
1
|
|
1081
|
use base 'Plucene::SearchEngine::Index::Base'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
81
|
|
3
|
|
|
|
|
|
|
# Associate this class with the "application/pdf" MIME type and the ".pdf"
# file extension.
# NOTE(review): register_handler is not defined in this file; presumably it
# is inherited from Plucene::SearchEngine::Index::Base -- confirm.
__PACKAGE__->register_handler("application/pdf", ".pdf"); |
4
|
1
|
|
|
1
|
|
6
|
use File::Temp qw/tmpnam/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
215
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
Plucene::SearchEngine::Index::PDF - Backend for parsing PDF |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 DESCRIPTION |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
This backend analyzes a PDF file for its textual content (using C<pdftotext>) |
13
|
|
|
|
|
|
|
and turns any metadata found in the PDF into Plucene fields. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# gather_data_from_file($filename)
#
# Converts the PDF at $filename to HTML with the external "pdftotext"
# program (run with -htmlmeta so the document metadata is emitted as HTML
# meta tags), then hands the generated HTML file to the HTML backend's
# gather_data_from_file so that it can populate this object.
#
# Returns $self on success.  Returns nothing (undef in scalar context, the
# empty list in list context) when the conversion fails or produces no
# output.
#
# NOTE(review): Plucene::SearchEngine::Index::HTML is called by its fully
# qualified name but is not loaded in this file; it is assumed to have been
# loaded elsewhere -- confirm.
sub gather_data_from_file {
    my ($self, $filename) = @_;

    # File::Temp->new creates the file itself (exclusively, with safe
    # permissions) instead of merely suggesting an unused name the way
    # tmpnam() does, which closes the window in which another process could
    # plant a file or symlink at that path.  The object also removes the
    # file when it goes out of scope, so nothing is left behind even if the
    # HTML backend dies part-way through.
    my $tmp = File::Temp->new;
    my $html = $tmp->filename;

    # Release our handle before another process writes to the file; the
    # object still owns the path and cleans it up on destruction.
    close $tmp;

    # List-form system() bypasses the shell, so $filename needs no quoting.
    # A non-zero status covers both "could not run pdftotext" (-1) and
    # pdftotext's own error exit codes.
    my $status = system("pdftotext", "-htmlmeta", $filename, $html);
    return if $status != 0;

    # The temporary file always exists at this point, so test for content
    # rather than mere existence.
    return unless -s $html;

    $self->Plucene::SearchEngine::Index::HTML::gather_data_from_file($html);
    return $self;
}
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
1; |