line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package SWISH::Filters::PDF2XML; |
2
|
1
|
|
|
1
|
|
671
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
3
|
1
|
|
|
1
|
|
4
|
use vars qw( $VERSION @ISA ); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
744
|
|
4
|
|
|
|
|
|
|
$VERSION = '0.191'; |
5
|
|
|
|
|
|
|
@ISA = ('SWISH::Filters::Base'); |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
sub new { |
8
|
1
|
|
|
1
|
0
|
17
|
my ($class) = @_; |
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
|
|
5
|
my $self = bless { mimetypes => [qr!application/pdf!], }, $class; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# optional module for local timestamps |
13
|
1
|
50
|
|
|
|
7
|
if ( $self->use_modules(qw/ Time::Local /) ) { |
14
|
1
|
|
|
|
|
2
|
$self->{_has_time_local} = 1; |
15
|
1
|
|
|
|
|
5
|
$self->{_re}->{date} = qr/(\d{4})(\d{2})(\d{2})/xms; |
16
|
1
|
|
|
|
|
4
|
$self->{_re}->{time} = qr/(\d{2})(\d{2})(\d{2})/xms; |
17
|
1
|
|
|
|
|
3
|
$self->{_re}->{tz} = qr/([+-Z])(\d{2})\'(\d{2})\'/xms; |
18
|
|
|
|
|
|
|
} |
19
|
|
|
|
|
|
|
|
20
|
1
|
50
|
|
|
|
3
|
if ( $self->use_modules(qw/ CAM::PDF /) ) { |
21
|
0
|
|
|
|
|
0
|
return $self; |
22
|
|
|
|
|
|
|
} |
23
|
|
|
|
|
|
|
|
24
|
1
|
|
|
|
|
23
|
return undef; # CAM::PDF not installed |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub filter { |
28
|
0
|
|
|
0
|
1
|
|
my ( $self, $doc ) = @_; |
29
|
|
|
|
|
|
|
|
30
|
0
|
|
|
|
|
|
my $user_data = $doc->user_data; |
31
|
0
|
0
|
|
|
|
|
my $title_tag |
32
|
|
|
|
|
|
|
= ref $user_data eq 'HASH' |
33
|
|
|
|
|
|
|
? $user_data->{pdf}{title_tag} |
34
|
|
|
|
|
|
|
: 'title'; |
35
|
|
|
|
|
|
|
|
36
|
0
|
|
0
|
|
|
|
my $user_meta = $doc->meta_data || {}; |
37
|
0
|
|
|
|
|
|
my $file = $doc->fetch_filename; |
38
|
|
|
|
|
|
|
|
39
|
0
|
|
|
|
|
|
$self->mywarn("PDF2XML handling $file"); |
40
|
|
|
|
|
|
|
|
41
|
0
|
|
|
|
|
|
my $metadata = $self->get_pdf_headers($file); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# merge pdf meta with meta we inherited, preferring user meta |
44
|
0
|
|
|
|
|
|
$metadata->{$_} = $user_meta->{$_} for keys %$user_meta; |
45
|
|
|
|
|
|
|
|
46
|
0
|
|
|
|
|
|
my $headers = $self->format_meta_headers($metadata); |
47
|
|
|
|
|
|
|
|
48
|
0
|
0
|
0
|
|
|
|
if ( $title_tag && exists $metadata->{$title_tag} ) { |
49
|
0
|
|
|
|
|
|
my $title = $self->escapeXML( $metadata->{$title_tag} ); |
50
|
|
|
|
|
|
|
|
51
|
0
|
|
|
|
|
|
$headers = "$title\n" . $headers; |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Check for encrypted content |
55
|
|
|
|
|
|
|
|
56
|
0
|
|
|
|
|
|
my $content_ref; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# patch provided by Martial Chartoire |
59
|
0
|
0
|
0
|
|
|
|
if ( $metadata->{encrypted} |
60
|
|
|
|
|
|
|
&& $metadata->{encrypted} =~ /yes\.*\scopy:no\s\.*/i ) |
61
|
|
|
|
|
|
|
{ |
62
|
0
|
|
|
|
|
|
$content_ref = \''; |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
else { |
66
|
0
|
|
|
|
|
|
$content_ref = $self->get_pdf_content_ref($file); |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# update the document's content type |
70
|
0
|
|
|
|
|
|
$doc->set_content_type('text/xml'); |
71
|
|
|
|
|
|
|
|
72
|
0
|
|
|
|
|
|
my $txt = <
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
$headers |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
$$content_ref |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
EOF |
84
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
|
return ( \$txt, $metadata ); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub get_pdf_headers { |
90
|
0
|
|
|
0
|
0
|
|
my ( $self, $file ) = @_; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# cribbed mostly from |
93
|
|
|
|
|
|
|
# http://api.metacpan.org/source/CDOLAN/CAM-PDF-1.60/bin/pdfinfo.pl |
94
|
0
|
|
|
|
|
|
my %metadata; |
95
|
0
|
0
|
|
|
|
|
my $pdfdoc = CAM::PDF->new( $file, q{}, q{}, 1 ) |
96
|
|
|
|
|
|
|
or die "$CAM::PDF::errstr\n"; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# basic meta |
99
|
0
|
|
|
|
|
|
$metadata{pdf_version} = $pdfdoc->{pdfversion}; |
100
|
0
|
|
|
|
|
|
$metadata{size} = length $pdfdoc->{content}; |
101
|
0
|
|
|
|
|
|
$metadata{pages} = $pdfdoc->numPages(); |
102
|
0
|
|
|
|
|
|
$metadata{file} = $file; |
103
|
0
|
|
|
|
|
|
my $pdfinfo = $pdfdoc->{trailer}->{Info}; |
104
|
0
|
|
0
|
|
|
|
$pdfinfo &&= $pdfdoc->getValue($pdfinfo); |
105
|
0
|
0
|
|
|
|
|
if ( !$pdfinfo ) { |
106
|
0
|
|
|
|
|
|
return \%metadata; |
107
|
|
|
|
|
|
|
} |
108
|
0
|
|
|
|
|
|
for my $key ( sort keys %$pdfinfo ) { |
109
|
0
|
|
|
|
|
|
my $metaname = lc $key; |
110
|
0
|
|
|
|
|
|
$metaname =~ s/ /_/g; |
111
|
0
|
|
|
|
|
|
my $val = $pdfinfo->{$key}->{value}; |
112
|
0
|
0
|
0
|
|
|
|
if ( $pdfinfo->{$key}->{type} eq 'string' |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
113
|
|
|
|
|
|
|
&& $self->{_has_time_local} |
114
|
|
|
|
|
|
|
&& $val |
115
|
|
|
|
|
|
|
&& $val =~ m{ \A |
116
|
|
|
|
|
|
|
D: $self->{_re}->{date} $self->{_re}->{time} $self->{_re}->{tz} |
117
|
|
|
|
|
|
|
\z |
118
|
|
|
|
|
|
|
}xms |
119
|
|
|
|
|
|
|
) |
120
|
|
|
|
|
|
|
{ |
121
|
0
|
|
|
|
|
|
my ( $Y, $M, $D, $h, $m, $s, $sign, $tzh, $tzm ) |
122
|
|
|
|
|
|
|
= ( $1, $2, $3, $4, $5, $6, $7, $8, $9 ); |
123
|
0
|
0
|
|
|
|
|
if ( $sign eq 'Z' ) { |
124
|
0
|
|
|
|
|
|
$sign = q{+}; |
125
|
|
|
|
|
|
|
} |
126
|
0
|
|
|
|
|
|
my $timegm |
127
|
|
|
|
|
|
|
= Time::Local::timegm( $s, $m, $h, $D, $M - 1, $Y - 1900 ); |
128
|
0
|
|
|
|
|
|
my $tzshift = $sign . ( $tzh * 3600 + $tzm * 60 ); |
129
|
0
|
|
|
|
|
|
$timegm += $tzshift; |
130
|
0
|
|
|
|
|
|
$val = localtime $timegm; |
131
|
|
|
|
|
|
|
} |
132
|
0
|
|
|
|
|
|
$metadata{$metaname} = $val; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
0
|
|
|
|
|
|
return \%metadata; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub get_pdf_content_ref { |
139
|
0
|
|
|
0
|
0
|
|
my ( $self, $file ) = @_; |
140
|
|
|
|
|
|
|
|
141
|
0
|
0
|
|
|
|
|
my $pdfdoc = CAM::PDF->new($file) or die "$CAM::PDF::errstr\n"; |
142
|
0
|
|
|
|
|
|
my $content = ''; |
143
|
0
|
|
|
|
|
|
for my $page ( $pdfdoc->rangeToArray( 1, $pdfdoc->numPages() ) ) { |
144
|
0
|
|
|
|
|
|
my $str = $pdfdoc->getPageText($page); |
145
|
0
|
0
|
|
|
|
|
if ( defined $str ) { |
146
|
0
|
|
|
|
|
|
CAM::PDF->asciify( \$str ); # TODO encodings? |
147
|
0
|
|
|
|
|
|
$content .= $str; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
return \$content; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
1; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
__END__ |