line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package SWISH::Filters::Pdf2HTML; |
2
|
1
|
|
|
1
|
|
530
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
42
|
|
3
|
1
|
|
|
1
|
|
4
|
use vars qw( $VERSION @ISA ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
450
|
|
4
|
|
|
|
|
|
|
$VERSION = '0.191'; |
5
|
|
|
|
|
|
|
@ISA = ('SWISH::Filters::Base'); |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
sub new { |
8
|
1
|
|
|
1
|
0
|
19
|
my ($class) = @_; |
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
|
|
6
|
my $self = bless { mimetypes => [qr!application/pdf!], }, $class; |
11
|
|
|
|
|
|
|
|
12
|
1
|
|
|
|
|
8
|
return $self->set_programs(qw/ pdftotext pdfinfo /); |
13
|
|
|
|
|
|
|
} |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
sub filter { |
16
|
0
|
|
|
0
|
1
|
|
my ( $self, $doc ) = @_; |
17
|
|
|
|
|
|
|
|
18
|
0
|
|
|
|
|
|
my $user_data = $doc->user_data; |
19
|
0
|
0
|
|
|
|
|
my $title_tag |
20
|
|
|
|
|
|
|
= ref $user_data eq 'HASH' |
21
|
|
|
|
|
|
|
? $user_data->{pdf}{title_tag} |
22
|
|
|
|
|
|
|
: 'title'; |
23
|
|
|
|
|
|
|
|
24
|
0
|
|
0
|
|
|
|
my $user_meta = $doc->meta_data || {}; |
25
|
0
|
|
|
|
|
|
my $file = $doc->fetch_filename; |
26
|
|
|
|
|
|
|
|
27
|
0
|
|
|
|
|
|
$self->mywarn("Pdf2HTML handling $file"); |
28
|
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
|
my $metadata = $self->get_pdf_headers($file); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# merge pdf meta with meta we inherited, preferring user meta |
32
|
0
|
|
|
|
|
|
$metadata->{$_} = $user_meta->{$_} for keys %$user_meta; |
33
|
|
|
|
|
|
|
|
34
|
0
|
|
|
|
|
|
my $headers = $self->format_meta_headers($metadata); |
35
|
|
|
|
|
|
|
|
36
|
0
|
0
|
0
|
|
|
|
if ( $title_tag && exists $metadata->{$title_tag} ) { |
37
|
0
|
|
|
|
|
|
my $title = $self->escapeXML( $metadata->{$title_tag} ); |
38
|
|
|
|
|
|
|
|
39
|
0
|
|
|
|
|
|
$headers = "$title\n" . $headers; |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Check for encrypted content |
43
|
|
|
|
|
|
|
|
44
|
0
|
|
|
|
|
|
my $content_ref; |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# patch provided by Martial Chartoire |
47
|
0
|
0
|
0
|
|
|
|
if ( $metadata->{encrypted} |
48
|
|
|
|
|
|
|
&& $metadata->{encrypted} =~ /yes\.*\scopy:no\s\.*/i ) |
49
|
|
|
|
|
|
|
{ |
50
|
0
|
|
|
|
|
|
$content_ref = \''; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
else { |
54
|
0
|
|
|
|
|
|
$content_ref = $self->get_pdf_content_ref($file); |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# update the document's content type |
58
|
0
|
|
|
|
|
|
$doc->set_content_type('text/html'); |
59
|
|
|
|
|
|
|
|
60
|
0
|
|
|
|
|
|
my $txt = <
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
$headers |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
$$content_ref |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
EOF |
72
|
|
|
|
|
|
|
|
73
|
0
|
|
|
|
|
|
return ( \$txt, $metadata ); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub get_pdf_headers { |
78
|
|
|
|
|
|
|
|
79
|
0
|
|
|
0
|
0
|
|
my ( $self, $file ) = @_; |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# We need a file name to pass to the pdf conversion programs |
82
|
|
|
|
|
|
|
|
83
|
0
|
|
|
|
|
|
my %metadata; |
84
|
0
|
|
|
|
|
|
my $headers = $self->run_pdfinfo($file); |
85
|
0
|
0
|
|
|
|
|
return \%metadata unless $headers; |
86
|
|
|
|
|
|
|
|
87
|
0
|
|
|
|
|
|
for ( split /\n/, $headers ) { |
88
|
0
|
0
|
|
|
|
|
if (/^\s*([^:]+):\s+(.+)$/) { |
89
|
0
|
|
|
|
|
|
my ( $metaname, $value ) = ( lc($1), $2 ); |
90
|
0
|
|
|
|
|
|
$metaname =~ tr/ /_/; |
91
|
0
|
|
|
|
|
|
$metadata{$metaname} = $value; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
0
|
|
|
|
|
|
return \%metadata; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub get_pdf_content_ref { |
99
|
0
|
|
|
0
|
0
|
|
my ( $self, $file ) = @_; |
100
|
|
|
|
|
|
|
|
101
|
0
|
|
|
|
|
|
my $content = $self->escapeXML( $self->run_pdftotext( $file, '-' ) ); |
102
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
|
return \$content; |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
1; |
107
|
|
|
|
|
|
|
__END__ |