| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package SWISH::Filters::PDF2XML; |
|
2
|
1
|
|
|
1
|
|
802
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
30
|
|
|
3
|
1
|
|
|
1
|
|
5
|
use vars qw( $VERSION @ISA ); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
1008
|
|
|
4
|
|
|
|
|
|
|
$VERSION = '0.190'; |
|
5
|
|
|
|
|
|
|
@ISA = ('SWISH::Filters::Base'); |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
sub new { |
|
8
|
1
|
|
|
1
|
0
|
21
|
my ($class) = @_; |
|
9
|
|
|
|
|
|
|
|
|
10
|
1
|
|
|
|
|
8
|
my $self = bless { mimetypes => [qr!application/pdf!], }, $class; |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# optional module for local timestamps |
|
13
|
1
|
50
|
|
|
|
7
|
if ( $self->use_modules(qw/ Time::Local /) ) { |
|
14
|
1
|
|
|
|
|
5
|
$self->{_has_time_local} = 1; |
|
15
|
1
|
|
|
|
|
11
|
$self->{_re}->{date} = qr/(\d{4})(\d{2})(\d{2})/xms; |
|
16
|
1
|
|
|
|
|
7
|
$self->{_re}->{time} = qr/(\d{2})(\d{2})(\d{2})/xms; |
|
17
|
1
|
|
|
|
|
8
|
$self->{_re}->{tz} = qr/([+-Z])(\d{2})\'(\d{2})\'/xms; |
|
18
|
|
|
|
|
|
|
} |
|
19
|
|
|
|
|
|
|
|
|
20
|
1
|
50
|
|
|
|
8
|
if ( $self->use_modules(qw/ CAM::PDF /) ) { |
|
21
|
0
|
|
|
|
|
0
|
return $self; |
|
22
|
|
|
|
|
|
|
} |
|
23
|
|
|
|
|
|
|
|
|
24
|
1
|
|
|
|
|
25
|
return undef; # CAM::PDF not installed |
|
25
|
|
|
|
|
|
|
} |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub filter { |
|
28
|
0
|
|
|
0
|
1
|
|
my ( $self, $doc ) = @_; |
|
29
|
|
|
|
|
|
|
|
|
30
|
0
|
|
|
|
|
|
my $user_data = $doc->user_data; |
|
31
|
0
|
0
|
|
|
|
|
my $title_tag |
|
32
|
|
|
|
|
|
|
= ref $user_data eq 'HASH' |
|
33
|
|
|
|
|
|
|
? $user_data->{pdf}{title_tag} |
|
34
|
|
|
|
|
|
|
: 'title'; |
|
35
|
|
|
|
|
|
|
|
|
36
|
0
|
|
0
|
|
|
|
my $user_meta = $doc->meta_data || {}; |
|
37
|
0
|
|
|
|
|
|
my $file = $doc->fetch_filename; |
|
38
|
|
|
|
|
|
|
|
|
39
|
0
|
|
|
|
|
|
$self->mywarn("PDF2XML handling $file"); |
|
40
|
|
|
|
|
|
|
|
|
41
|
0
|
|
|
|
|
|
my $metadata = $self->get_pdf_headers($file); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# merge pdf meta with meta we inherited, preferring user meta |
|
44
|
0
|
|
|
|
|
|
$metadata->{$_} = $user_meta->{$_} for keys %$user_meta; |
|
45
|
|
|
|
|
|
|
|
|
46
|
0
|
|
|
|
|
|
my $headers = $self->format_meta_headers($metadata); |
|
47
|
|
|
|
|
|
|
|
|
48
|
0
|
0
|
0
|
|
|
|
if ( $title_tag && exists $metadata->{$title_tag} ) { |
|
49
|
0
|
|
|
|
|
|
my $title = $self->escapeXML( $metadata->{$title_tag} ); |
|
50
|
|
|
|
|
|
|
|
|
51
|
0
|
|
|
|
|
|
$headers = "$title\n" . $headers; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Check for encrypted content |
|
55
|
|
|
|
|
|
|
|
|
56
|
0
|
|
|
|
|
|
my $content_ref; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# patch provided by Martial Chartoire |
|
59
|
0
|
0
|
0
|
|
|
|
if ( $metadata->{encrypted} |
|
60
|
|
|
|
|
|
|
&& $metadata->{encrypted} =~ /yes\.*\scopy:no\s\.*/i ) |
|
61
|
|
|
|
|
|
|
{ |
|
62
|
0
|
|
|
|
|
|
$content_ref = \''; |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
else { |
|
66
|
0
|
|
|
|
|
|
$content_ref = $self->get_pdf_content_ref($file); |
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# update the document's content type |
|
70
|
0
|
|
|
|
|
|
$doc->set_content_type('text/xml'); |
|
71
|
|
|
|
|
|
|
|
|
72
|
0
|
|
|
|
|
|
my $txt = <
|
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
$headers |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
$$content_ref |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
EOF |
|
84
|
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
|
return ( \$txt, $metadata ); |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub get_pdf_headers { |
|
90
|
0
|
|
|
0
|
0
|
|
my ( $self, $file ) = @_; |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# cribbed mostly from |
|
93
|
|
|
|
|
|
|
# http://api.metacpan.org/source/CDOLAN/CAM-PDF-1.60/bin/pdfinfo.pl |
|
94
|
0
|
|
|
|
|
|
my %metadata; |
|
95
|
0
|
0
|
|
|
|
|
my $pdfdoc = CAM::PDF->new( $file, q{}, q{}, 1 ) |
|
96
|
|
|
|
|
|
|
or die "$CAM::PDF::errstr\n"; |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# basic meta |
|
99
|
0
|
|
|
|
|
|
$metadata{pdf_version} = $pdfdoc->{pdfversion}; |
|
100
|
0
|
|
|
|
|
|
$metadata{size} = length $pdfdoc->{content}; |
|
101
|
0
|
|
|
|
|
|
$metadata{pages} = $pdfdoc->numPages(); |
|
102
|
0
|
|
|
|
|
|
$metadata{file} = $file; |
|
103
|
0
|
|
|
|
|
|
my $pdfinfo = $pdfdoc->{trailer}->{Info}; |
|
104
|
0
|
|
0
|
|
|
|
$pdfinfo &&= $pdfdoc->getValue($pdfinfo); |
|
105
|
0
|
0
|
|
|
|
|
if ( !$pdfinfo ) { |
|
106
|
0
|
|
|
|
|
|
return \%metadata; |
|
107
|
|
|
|
|
|
|
} |
|
108
|
0
|
|
|
|
|
|
for my $key ( sort keys %$pdfinfo ) { |
|
109
|
0
|
|
|
|
|
|
my $metaname = lc $key; |
|
110
|
0
|
|
|
|
|
|
$metaname =~ s/ /_/g; |
|
111
|
0
|
|
|
|
|
|
my $val = $pdfinfo->{$key}->{value}; |
|
112
|
0
|
0
|
0
|
|
|
|
if ( $pdfinfo->{$key}->{type} eq 'string' |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
113
|
|
|
|
|
|
|
&& $self->{_has_time_local} |
|
114
|
|
|
|
|
|
|
&& $val |
|
115
|
|
|
|
|
|
|
&& $val =~ m{ \A |
|
116
|
|
|
|
|
|
|
D: $self->{_re}->{date} $self->{_re}->{time} $self->{_re}->{tz} |
|
117
|
|
|
|
|
|
|
\z |
|
118
|
|
|
|
|
|
|
}xms |
|
119
|
|
|
|
|
|
|
) |
|
120
|
|
|
|
|
|
|
{ |
|
121
|
0
|
|
|
|
|
|
my ( $Y, $M, $D, $h, $m, $s, $sign, $tzh, $tzm ) |
|
122
|
|
|
|
|
|
|
= ( $1, $2, $3, $4, $5, $6, $7, $8, $9 ); |
|
123
|
0
|
0
|
|
|
|
|
if ( $sign eq 'Z' ) { |
|
124
|
0
|
|
|
|
|
|
$sign = q{+}; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
0
|
|
|
|
|
|
my $timegm |
|
127
|
|
|
|
|
|
|
= Time::Local::timegm( $s, $m, $h, $D, $M - 1, $Y - 1900 ); |
|
128
|
0
|
|
|
|
|
|
my $tzshift = $sign . ( $tzh * 3600 + $tzm * 60 ); |
|
129
|
0
|
|
|
|
|
|
$timegm += $tzshift; |
|
130
|
0
|
|
|
|
|
|
$val = localtime $timegm; |
|
131
|
|
|
|
|
|
|
} |
|
132
|
0
|
|
|
|
|
|
$metadata{$metaname} = $val; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
|
|
135
|
0
|
|
|
|
|
|
return \%metadata; |
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub get_pdf_content_ref { |
|
139
|
0
|
|
|
0
|
0
|
|
my ( $self, $file ) = @_; |
|
140
|
|
|
|
|
|
|
|
|
141
|
0
|
0
|
|
|
|
|
my $pdfdoc = CAM::PDF->new($file) or die "$CAM::PDF::errstr\n"; |
|
142
|
0
|
|
|
|
|
|
my $content = ''; |
|
143
|
0
|
|
|
|
|
|
for my $page ( $pdfdoc->rangeToArray( 1, $pdfdoc->numPages() ) ) { |
|
144
|
0
|
|
|
|
|
|
my $str = $pdfdoc->getPageText($page); |
|
145
|
0
|
0
|
|
|
|
|
if ( defined $str ) { |
|
146
|
0
|
|
|
|
|
|
CAM::PDF->asciify( \$str ); # TODO encodings? |
|
147
|
0
|
|
|
|
|
|
$content .= $str; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
return \$content; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
1; |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
__END__ |