| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package SWISH::Filters::PDF2XML; | 
| 2 | 1 |  |  | 1 |  | 671 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 33 |  | 
| 3 | 1 |  |  | 1 |  | 4 | use vars qw( $VERSION @ISA ); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 744 |  | 
| 4 |  |  |  |  |  |  | $VERSION = '0.191'; | 
| 5 |  |  |  |  |  |  | @ISA     = ('SWISH::Filters::Base'); | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | sub new { | 
| 8 | 1 |  |  | 1 | 0 | 17 | my ($class) = @_; | 
| 9 |  |  |  |  |  |  |  | 
| 10 | 1 |  |  |  |  | 5 | my $self = bless { mimetypes => [qr!application/pdf!], }, $class; | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | # optional module for local timestamps | 
| 13 | 1 | 50 |  |  |  | 7 | if ( $self->use_modules(qw/ Time::Local /) ) { | 
| 14 | 1 |  |  |  |  | 2 | $self->{_has_time_local} = 1; | 
| 15 | 1 |  |  |  |  | 5 | $self->{_re}->{date}     = qr/(\d{4})(\d{2})(\d{2})/xms; | 
| 16 | 1 |  |  |  |  | 4 | $self->{_re}->{time}     = qr/(\d{2})(\d{2})(\d{2})/xms; | 
| 17 | 1 |  |  |  |  | 3 | $self->{_re}->{tz}       = qr/([+-Z])(\d{2})\'(\d{2})\'/xms; | 
| 18 |  |  |  |  |  |  | } | 
| 19 |  |  |  |  |  |  |  | 
| 20 | 1 | 50 |  |  |  | 3 | if ( $self->use_modules(qw/ CAM::PDF /) ) { | 
| 21 | 0 |  |  |  |  | 0 | return $self; | 
| 22 |  |  |  |  |  |  | } | 
| 23 |  |  |  |  |  |  |  | 
| 24 | 1 |  |  |  |  | 23 | return undef;    # CAM::PDF not installed | 
| 25 |  |  |  |  |  |  | } | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | sub filter { | 
| 28 | 0 |  |  | 0 | 1 |  | my ( $self, $doc ) = @_; | 
| 29 |  |  |  |  |  |  |  | 
| 30 | 0 |  |  |  |  |  | my $user_data = $doc->user_data; | 
| 31 | 0 | 0 |  |  |  |  | my $title_tag | 
| 32 |  |  |  |  |  |  | = ref $user_data eq 'HASH' | 
| 33 |  |  |  |  |  |  | ? $user_data->{pdf}{title_tag} | 
| 34 |  |  |  |  |  |  | : 'title'; | 
| 35 |  |  |  |  |  |  |  | 
| 36 | 0 |  | 0 |  |  |  | my $user_meta = $doc->meta_data || {}; | 
| 37 | 0 |  |  |  |  |  | my $file = $doc->fetch_filename; | 
| 38 |  |  |  |  |  |  |  | 
| 39 | 0 |  |  |  |  |  | $self->mywarn("PDF2XML handling $file"); | 
| 40 |  |  |  |  |  |  |  | 
| 41 | 0 |  |  |  |  |  | my $metadata = $self->get_pdf_headers($file); | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | # merge pdf meta with meta we inherited, preferring user meta | 
| 44 | 0 |  |  |  |  |  | $metadata->{$_} = $user_meta->{$_} for keys %$user_meta; | 
| 45 |  |  |  |  |  |  |  | 
| 46 | 0 |  |  |  |  |  | my $headers = $self->format_meta_headers($metadata); | 
| 47 |  |  |  |  |  |  |  | 
| 48 | 0 | 0 | 0 |  |  |  | if ( $title_tag && exists $metadata->{$title_tag} ) { | 
| 49 | 0 |  |  |  |  |  | my $title = $self->escapeXML( $metadata->{$title_tag} ); | 
| 50 |  |  |  |  |  |  |  | 
| 51 | 0 |  |  |  |  |  | $headers = "$title\n" . $headers; | 
| 52 |  |  |  |  |  |  | } | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | # Check for encrypted content | 
| 55 |  |  |  |  |  |  |  | 
| 56 | 0 |  |  |  |  |  | my $content_ref; | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | # patch provided by Martial Chartoire | 
| 59 | 0 | 0 | 0 |  |  |  | if (   $metadata->{encrypted} | 
| 60 |  |  |  |  |  |  | && $metadata->{encrypted} =~ /yes\.*\scopy:no\s\.*/i ) | 
| 61 |  |  |  |  |  |  | { | 
| 62 | 0 |  |  |  |  |  | $content_ref = \''; | 
| 63 |  |  |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | } | 
| 65 |  |  |  |  |  |  | else { | 
| 66 | 0 |  |  |  |  |  | $content_ref = $self->get_pdf_content_ref($file); | 
| 67 |  |  |  |  |  |  | } | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | # update the document's content type | 
| 70 | 0 |  |  |  |  |  | $doc->set_content_type('text/xml'); | 
| 71 |  |  |  |  |  |  |  | 
| 72 | 0 |  |  |  |  |  | my $txt = < | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | $headers | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  |   | 
| 79 |  |  |  |  |  |  | $$content_ref | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | EOF | 
| 84 |  |  |  |  |  |  |  | 
| 85 | 0 |  |  |  |  |  | return ( \$txt, $metadata ); | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | } | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | sub get_pdf_headers { | 
| 90 | 0 |  |  | 0 | 0 |  | my ( $self, $file ) = @_; | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | # cribbed mostly from | 
| 93 |  |  |  |  |  |  | # http://api.metacpan.org/source/CDOLAN/CAM-PDF-1.60/bin/pdfinfo.pl | 
| 94 | 0 |  |  |  |  |  | my %metadata; | 
| 95 | 0 | 0 |  |  |  |  | my $pdfdoc = CAM::PDF->new( $file, q{}, q{}, 1 ) | 
| 96 |  |  |  |  |  |  | or die "$CAM::PDF::errstr\n"; | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | # basic meta | 
| 99 | 0 |  |  |  |  |  | $metadata{pdf_version} = $pdfdoc->{pdfversion}; | 
| 100 | 0 |  |  |  |  |  | $metadata{size}        = length $pdfdoc->{content}; | 
| 101 | 0 |  |  |  |  |  | $metadata{pages}       = $pdfdoc->numPages(); | 
| 102 | 0 |  |  |  |  |  | $metadata{file}        = $file; | 
| 103 | 0 |  |  |  |  |  | my $pdfinfo = $pdfdoc->{trailer}->{Info}; | 
| 104 | 0 |  | 0 |  |  |  | $pdfinfo &&= $pdfdoc->getValue($pdfinfo); | 
| 105 | 0 | 0 |  |  |  |  | if ( !$pdfinfo ) { | 
| 106 | 0 |  |  |  |  |  | return \%metadata; | 
| 107 |  |  |  |  |  |  | } | 
| 108 | 0 |  |  |  |  |  | for my $key ( sort keys %$pdfinfo ) { | 
| 109 | 0 |  |  |  |  |  | my $metaname = lc $key; | 
| 110 | 0 |  |  |  |  |  | $metaname =~ s/ /_/g; | 
| 111 | 0 |  |  |  |  |  | my $val = $pdfinfo->{$key}->{value}; | 
| 112 | 0 | 0 | 0 |  |  |  | if (   $pdfinfo->{$key}->{type} eq 'string' | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 113 |  |  |  |  |  |  | && $self->{_has_time_local} | 
| 114 |  |  |  |  |  |  | && $val | 
| 115 |  |  |  |  |  |  | && $val =~ m{ \A | 
| 116 |  |  |  |  |  |  | D: $self->{_re}->{date} $self->{_re}->{time} $self->{_re}->{tz} | 
| 117 |  |  |  |  |  |  | \z | 
| 118 |  |  |  |  |  |  | }xms | 
| 119 |  |  |  |  |  |  | ) | 
| 120 |  |  |  |  |  |  | { | 
| 121 | 0 |  |  |  |  |  | my ( $Y, $M, $D, $h, $m, $s, $sign, $tzh, $tzm ) | 
| 122 |  |  |  |  |  |  | = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 ); | 
| 123 | 0 | 0 |  |  |  |  | if ( $sign eq 'Z' ) { | 
| 124 | 0 |  |  |  |  |  | $sign = q{+}; | 
| 125 |  |  |  |  |  |  | } | 
| 126 | 0 |  |  |  |  |  | my $timegm | 
| 127 |  |  |  |  |  |  | = Time::Local::timegm( $s, $m, $h, $D, $M - 1, $Y - 1900 ); | 
| 128 | 0 |  |  |  |  |  | my $tzshift = $sign . ( $tzh * 3600 + $tzm * 60 ); | 
| 129 | 0 |  |  |  |  |  | $timegm += $tzshift; | 
| 130 | 0 |  |  |  |  |  | $val = localtime $timegm; | 
| 131 |  |  |  |  |  |  | } | 
| 132 | 0 |  |  |  |  |  | $metadata{$metaname} = $val; | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  |  | 
| 135 | 0 |  |  |  |  |  | return \%metadata; | 
| 136 |  |  |  |  |  |  | } | 
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | sub get_pdf_content_ref { | 
| 139 | 0 |  |  | 0 | 0 |  | my ( $self, $file ) = @_; | 
| 140 |  |  |  |  |  |  |  | 
| 141 | 0 | 0 |  |  |  |  | my $pdfdoc = CAM::PDF->new($file) or die "$CAM::PDF::errstr\n"; | 
| 142 | 0 |  |  |  |  |  | my $content = ''; | 
| 143 | 0 |  |  |  |  |  | for my $page ( $pdfdoc->rangeToArray( 1, $pdfdoc->numPages() ) ) { | 
| 144 | 0 |  |  |  |  |  | my $str = $pdfdoc->getPageText($page); | 
| 145 | 0 | 0 |  |  |  |  | if ( defined $str ) { | 
| 146 | 0 |  |  |  |  |  | CAM::PDF->asciify( \$str );    # TODO encodings? | 
| 147 | 0 |  |  |  |  |  | $content .= $str; | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  |  | 
| 151 | 0 |  |  |  |  |  | return \$content; | 
| 152 |  |  |  |  |  |  | } | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | 1; | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | __END__ |