File Coverage

blib/lib/Image/ExifTool/PDF.pm
Criterion Covered Total %
statement 526 1029 51.1
branch 289 724 39.9
condition 75 247 30.3
subroutine 22 27 81.4
pod 0 23 0.0
total 912 2050 44.4


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: PDF.pm
3             #
4             # Description: Read PDF meta information
5             #
6             # Revisions: 07/11/2005 - P. Harvey Created
7             # 07/25/2005 - P. Harvey Add support for encrypted documents
8             #
9             # References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
10             # 2) http://search.cpan.org/dist/Crypt-RC4/
11             # 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
12             # 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
13             # 5) http://tools.ietf.org/search/rfc3454
14             # 6) http://www.armware.dk/RFC/rfc/rfc4013.html
15             #------------------------------------------------------------------------------
16              
17             package Image::ExifTool::PDF;
18              
19 25     25   4885 use strict;
  25         80  
  25         1048  
20 25     25   164 use vars qw($VERSION $AUTOLOAD $lastFetched);
  25         73  
  25         1658  
21 25     25   193 use Image::ExifTool qw(:DataAccess :Utils);
  25         70  
  25         356088  
22             require Exporter;
23              
24             $VERSION = '1.55';
25              
26             sub FetchObject($$$$);
27             sub ExtractObject($$;$$);
28             sub ReadToNested($;$);
29             sub ProcessDict($$$$;$$);
30             sub ProcessAcroForm($$$$;$$);
31             sub ExpandArray($);
32             sub ReadPDFValue($);
33             sub CheckPDF($$$);
34              
35             # $lastFetched - last fetched object reference (used for decryption)
36             # (undefined if fetched object was already decrypted, eg. object from stream)
37              
38             my $cryptInfo; # encryption object reference (plus additional information)
39             my $cryptString; # flag that strings are encrypted
40             my $cryptStream; # flag that streams are encrypted
41             my $lastOffset; # last fetched object offset
42             my %streamObjs; # hash of stream objects
43             my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
44             my $pdfVer; # version of PDF file being processed
45              
46             # filters supported in DecodeStream()
47             my %supportedFilter = (
48             '/FlateDecode' => 1,
49             '/Crypt' => 1,
50             '/Identity' => 1, # (not filtered)
51             '/DCTDecode' => 1, # (JPEG image - not filtered)
52             '/JPXDecode' => 1, # (Jpeg2000 image - not filtered)
53             '/LZWDecode' => 1, # (usually a bitmapped image)
54             '/ASCIIHexDecode' => 1,
55             '/ASCII85Decode' => 1,
56             # other standard filters that we currently don't support
57             #'/JBIG2Decode' => 0, # (JBIG2 image format not supported)
58             #'/CCITTFaxDecode' => 0,
59             #'/RunLengthDecode' => 0,
60             );
61              
62             # tags in main PDF directories
63             %Image::ExifTool::PDF::Main = (
64             GROUPS => { 2 => 'Document' },
65             VARS => { CAPTURE => ['Main','Prev'] },
66             Info => {
67             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
68             # Adobe Acrobat 10.1.5 will create a duplicate Info dictionary with
69             # a different object number when metadata is edited. This flag
70             # is part of a patch to ignore this duplicate information (unless
71             # the IgnoreMinorErrors option is used)
72             IgnoreDuplicates => 1,
73             },
74             Root => {
75             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
76             },
77             Encrypt => {
78             NoProcess => 1, # don't process normally (processed in advance)
79             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
80             },
81             _linearized => {
82             Name => 'Linearized',
83             Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
84             PrintConv => { 'true' => 'Yes', 'false' => 'No' },
85             },
86             );
87              
88             # tags in PDF Info dictionary
89             %Image::ExifTool::PDF::Info = (
90             GROUPS => { 2 => 'Document' },
91             VARS => { CAPTURE => ['Info'] },
92             EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
93             WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
94             CHECK_PROC => \&CheckPDF,
95             WRITABLE => 'string',
96             # set PRIORITY to 0 so most recent Info dictionary takes precedence
97             # (Acrobat Pro bug? doesn't use same object/generation number for
98             # new Info dictionary when doing incremental update)
99             PRIORITY => 0,
100             NOTES => q{
101             As well as the tags listed below, the PDF specification allows for
102             user-defined tags to exist in the Info dictionary. These tags, which should
103             have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
104             object, are also extracted by ExifTool.
105              
106             B specifies the value format, and may be C, C,
107             C, C, C or C for PDF tags.
108             },
109             Title => { },
110             Author => { Groups => { 2 => 'Author' } },
111             Subject => { },
112             Keywords => { List => 'string' }, # this is a string list
113             Creator => { },
114             Producer => { },
115             CreationDate => {
116             Name => 'CreateDate',
117             Writable => 'date',
118             Groups => { 2 => 'Time' },
119             Shift => 'Time',
120             PrintConv => '$self->ConvertDateTime($val)',
121             PrintConvInv => '$self->InverseDateTime($val)',
122             },
123             ModDate => {
124             Name => 'ModifyDate',
125             Writable => 'date',
126             Groups => { 2 => 'Time' },
127             Shift => 'Time',
128             PrintConv => '$self->ConvertDateTime($val)',
129             PrintConvInv => '$self->InverseDateTime($val)',
130             },
131             Trapped => {
132             Protected => 1,
133             # remove leading '/' from '/True' or '/False'
134             ValueConv => '$val=~s{^/}{}; $val',
135             ValueConvInv => '"/$val"',
136             },
137             'AAPL:Keywords' => { #PH
138             Name => 'AppleKeywords',
139             List => 'array', # this is an array of values
140             Notes => q{
141             keywords written by Apple utilities, although they seem to use PDF:Keywords
142             when reading
143             },
144             },
145             );
146              
147             # tags in the PDF Root document catalog
148             %Image::ExifTool::PDF::Root = (
149             GROUPS => { 2 => 'Document' },
150             # note: can't capture previous versions of Root since they are not parsed
151             VARS => { CAPTURE => ['Root'] },
152             NOTES => 'This is the PDF document catalog.',
153             MarkInfo => {
154             SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
155             },
156             Metadata => {
157             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
158             },
159             Pages => {
160             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
161             },
162             Perms => {
163             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
164             },
165             AcroForm => {
166             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AcroForm' },
167             },
168             Lang => 'Language',
169             PageLayout => { },
170             PageMode => { },
171             Version => 'PDFVersion',
172             );
173              
174             # tags extracted from the PDF Encrypt dictionary
175             %Image::ExifTool::PDF::Encrypt = (
176             GROUPS => { 2 => 'Document' },
177             NOTES => 'Tags extracted from the document Encrypt dictionary.',
178             Filter => {
179             Name => 'Encryption',
180             Notes => q{
181             extracted value is actually a combination of the Filter, SubFilter, V, R and
182             Length information from the Encrypt dictionary
183             },
184             },
185             P => {
186             Name => 'UserAccess',
187             ValueConv => '$val & 0x0f3c', # ignore reserved bits
188             PrintConvColumns => 2,
189             PrintConv => { BITMASK => {
190             2 => 'Print',
191             3 => 'Modify',
192             4 => 'Copy',
193             5 => 'Annotate',
194             8 => 'Fill forms',
195             9 => 'Extract',
196             10 => 'Assemble',
197             11 => 'Print high-res',
198             }},
199             },
200             );
201              
202             # tags in PDF Pages dictionary
203             %Image::ExifTool::PDF::Pages = (
204             GROUPS => { 2 => 'Document' },
205             Count => 'PageCount',
206             Kids => {
207             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
208             },
209             );
210              
211             # tags in PDF Perms dictionary
212             %Image::ExifTool::PDF::Perms = (
213             NOTES => 'Additional document permissions imposed by digital signatures.',
214             DocMDP => {
215             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
216             },
217             FieldMDP => {
218             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
219             },
220             UR3 => {
221             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
222             },
223             );
224              
225             # tags in PDF Perms dictionary
226             %Image::ExifTool::PDF::AcroForm = (
227             PROCESS_PROC => \&ProcessAcroForm,
228             _has_xfa => {
229             Name => 'HasXFA',
230             Notes => q{
231             this tag is defined if a document contains form fields, and is true if it
232             uses XML Forms Architecture; not a real Tag ID
233             },
234             PrintConv => { 'true' => 'Yes', 'false' => 'No' },
235             },
236             );
237              
238             # tags in PDF Kids dictionary
239             %Image::ExifTool::PDF::Kids = (
240             Metadata => {
241             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
242             },
243             PieceInfo => {
244             SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
245             },
246             Resources => {
247             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
248             },
249             Kids => {
250             Condition => '$self->Options("ExtractEmbedded")',
251             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
252             },
253             );
254              
255             # tags in PDF Resources dictionary
256             %Image::ExifTool::PDF::Resources = (
257             ColorSpace => {
258             SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
259             },
260             XObject => {
261             Condition => '$self->Options("ExtractEmbedded")',
262             SubDirectory => { TagTable => 'Image::ExifTool::PDF::XObject' },
263             },
264             Properties => {
265             Condition => '$self->Options("ExtractEmbedded")',
266             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Properties' },
267             },
268             );
269              
270             # tags in PDF ColorSpace dictionary
271             %Image::ExifTool::PDF::ColorSpace = (
272             DefaultRGB => {
273             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
274             ConvertToDict => 1, # (not seen yet, but just in case)
275             },
276             DefaultCMYK => {
277             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
278             # hack: this is stored as an array instead of a dictionary in my
279             # sample, so convert to a dictionary to extract the ICCBased element
280             ConvertToDict => 1,
281             },
282             Cs1 => {
283             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
284             ConvertToDict => 1, # (just in case)
285             },
286             CS0 => {
287             SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
288             ConvertToDict => 1, # (just in case)
289             },
290             );
291              
292             # tags in PDF DefaultRGB dictionary
293             %Image::ExifTool::PDF::DefaultRGB = (
294             ICCBased => {
295             SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
296             },
297             );
298              
299             # tags in PDF ICCBased, Cs1 and CS0 dictionaries
300             %Image::ExifTool::PDF::ICCBased = (
301             _stream => {
302             SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
303             },
304             );
305              
306             # tags in PDF XObject dictionary (parsed only if ExtractEmbedded is enabled)
307             %Image::ExifTool::PDF::XObject = (
308             EXTRACT_UNKNOWN => 0, # extract known but numbered tags (Im1, Im2, etc)
309             Im => {
310             Notes => q{
311             the L option enables information to be extracted from these
312             embedded images
313             },
314             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Im' },
315             },
316             );
317              
318             # tags in PDF Im# dictionary
319             %Image::ExifTool::PDF::Im = (
320             NOTES => q{
321             Information extracted from embedded images with the L option.
322             The EmbeddedImage and its metadata are extracted only for JPEG and Jpeg2000
323             image formats.
324             },
325             Width => 'EmbeddedImageWidth',
326             Height => 'EmbeddedImageHeight',
327             Filter => { Name => 'EmbeddedImageFilter', List => 1 },
328             ColorSpace => {
329             Name => 'EmbeddedImageColorSpace',
330             List => 1,
331             RawConv => 'ref $val ? undef : $val', # (ignore color space data)
332             },
333             Image_stream => {
334             Name => 'EmbeddedImage',
335             Groups => { 2 => 'Preview' },
336             Binary => 1,
337             },
338             );
339              
340             # tags in PDF Properties dictionary
341             %Image::ExifTool::PDF::Properties = (
342             EXTRACT_UNKNOWN => 0, # extract known but numbered tags (MC0, MC1, etc)
343             MC => {
344             Notes => q{
345             the L option enables information to be extracted from these
346             embedded metadata dictionaries
347             },
348             SubDirectory => { TagTable => 'Image::ExifTool::PDF::MC' },
349             }
350             );
351              
352             # tags in PDF MC# dictionary
353             %Image::ExifTool::PDF::MC = (
354             Metadata => {
355             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
356             }
357             );
358              
359             # tags in PDF PieceInfo dictionary
360             %Image::ExifTool::PDF::PieceInfo = (
361             AdobePhotoshop => {
362             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
363             },
364             Illustrator => {
365             # assume this is an illustrator file if it contains this directory
366             # and doesn't have a ".PDF" extension
367             Condition => q{
368             $self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
369             return 1;
370             },
371             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
372             },
373             );
374              
375             # tags in PDF AdobePhotoshop dictionary
376             %Image::ExifTool::PDF::AdobePhotoshop = (
377             Private => {
378             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
379             },
380             );
381              
382             # tags in PDF Illustrator dictionary
383             %Image::ExifTool::PDF::Illustrator = (
384             Private => {
385             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
386             },
387             );
388              
389             # tags in PDF Private dictionary
390             %Image::ExifTool::PDF::Private = (
391             ImageResources => {
392             SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
393             },
394             );
395              
396             # tags in PDF AI Private dictionary
397             %Image::ExifTool::PDF::AIPrivate = (
398             GROUPS => { 2 => 'Document' },
399             EXTRACT_UNKNOWN => 0, # extract known but numbered tags
400             AIMetaData => {
401             SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
402             },
403             AIPrivateData => {
404             Notes => q{
405             the L option enables information to be extracted from embedded
406             PostScript documents in the AIPrivateData# and AIPDFPrivateData# streams
407             },
408             JoinStreams => 1, # join streams from numbered tags and process as one
409             SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
410             },
411             AIPDFPrivateData => {
412             JoinStreams => 1, # join streams from numbered tags and process as one
413             SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
414             },
415             RoundTripVersion => { },
416             ContainerVersion => { },
417             CreatorVersion => { },
418             );
419              
420             # tags in PDF AIMetaData dictionary
421             %Image::ExifTool::PDF::AIMetaData = (
422             _stream => {
423             SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
424             },
425             );
426              
427             # tags in PDF ImageResources dictionary
428             %Image::ExifTool::PDF::ImageResources = (
429             _stream => {
430             SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
431             },
432             );
433              
434             # tags in PDF MarkInfo dictionary
435             %Image::ExifTool::PDF::MarkInfo = (
436             GROUPS => { 2 => 'Document' },
437             Marked => {
438             Name => 'TaggedPDF',
439             Notes => "not a Tagged PDF if this tag is missing",
440             PrintConv => { 'true' => 'Yes', 'false' => 'No' },
441             },
442             );
443              
444             # tags in PDF Metadata dictionary
445             %Image::ExifTool::PDF::Metadata = (
446             GROUPS => { 2 => 'Document' },
447             XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
448             Name => 'XMP',
449             SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
450             },
451             );
452              
453             # tags in PDF signature directories (DocMDP, FieldMDP or UR3)
454             %Image::ExifTool::PDF::Signature = (
455             GROUPS => { 2 => 'Document' },
456             ContactInfo => 'SignerContactInfo',
457             Location => 'SigningLocation',
458             M => {
459             Name => 'SigningDate',
460             Format => 'date',
461             Groups => { 2 => 'Time' },
462             PrintConv => '$self->ConvertDateTime($val)',
463             },
464             Name => 'SigningAuthority',
465             Reason => 'SigningReason',
466             Reference => {
467             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
468             },
469             Prop_AuthTime => {
470             Name => 'AuthenticationTime',
471             PrintConv => 'ConvertTimeSpan($val) . " ago"',
472             },
473             Prop_AuthType => 'AuthenticationType',
474             );
475              
476             # tags in PDF Reference dictionary
477             %Image::ExifTool::PDF::Reference = (
478             TransformParams => {
479             SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
480             },
481             );
482              
483             # tags in PDF TransformParams dictionary
484             %Image::ExifTool::PDF::TransformParams = (
485             GROUPS => { 2 => 'Document' },
486             Annots => {
487             Name => 'AnnotationUsageRights',
488             Notes => q{
489             possible values are Create, Delete, Modify, Copy, Import and Export;
490             additional values for UR3 signatures are Online and SummaryView
491             },
492             List => 1,
493             },
494             Document => {
495             Name => 'DocumentUsageRights',
496             Notes => 'only possible value is FullSave',
497             List => 1,
498             },
499             Form => {
500             Name => 'FormUsageRights',
501             Notes => q{
502             possible values are FillIn, Import, Export, SubmitStandalone and
503             SpawnTemplate; additional values for UR3 signatures are BarcodePlaintext and
504             Online
505             },
506             List => 1,
507             },
508             FormEX => {
509             Name => 'FormExtraUsageRights',
510             Notes => 'UR signatures only; only possible value is BarcodePlaintext',
511             List => 1,
512             },
513             Signature => {
514             Name => 'SignatureUsageRights',
515             Notes => 'only possible value is Modify',
516             List => 1,
517             },
518             EF => {
519             Name => 'EmbeddedFileUsageRights',
520             Notes => 'possible values are Create, Delete, Modify and Import',
521             List => 1,
522             },
523             Msg => 'UsageRightsMessage',
524             P => {
525             Name => 'ModificationPermissions',
526             Notes => q{
527             1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
528             false
529             },
530             PrintConv => {
531             1 => 'No changes permitted',
532             2 => 'Fill forms, Create page templates, Sign',
533             3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
534             'true' => 'Restrict all applications to reader permissions',
535             'false' => 'Do not restrict applications to reader permissions',
536             },
537             },
538             Action => {
539             Name => 'FieldPermissions',
540             Notes => 'FieldMDP signatures only',
541             PrintConv => {
542             'All' => 'Disallow changes to all form fields',
543             'Include' => 'Disallow changes to specified form fields',
544             'Exclude' => 'Allow changes to specified form fields',
545             },
546             },
547             Fields => {
548             Notes => 'FieldMDP signatures only',
549             Name => 'FormFields',
550             List => 1,
551             },
552             );
553              
554             # unknown tags for use in verbose option
555             %Image::ExifTool::PDF::Unknown = (
556             GROUPS => { 2 => 'Unknown' },
557             );
558              
559             #------------------------------------------------------------------------------
560             # AutoLoad our writer routines when necessary
561             #
562             sub AUTOLOAD
563             {
564 19     19   141 return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
565             }
566              
567             #------------------------------------------------------------------------------
568             # Convert from PDF to EXIF-style date/time
569             # Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
570             # Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
571             sub ConvertPDFDate($)
572             {
573 10     10 0 24 my $date = shift;
574             # remove optional 'D:' prefix
575 10         56 $date =~ s/^D://;
576             # fill in default values if necessary
577             # YYYYmmddHHMMSS
578 10         24 my $default = '00000101000000';
579 10 50       35 if (length $date < length $default) {
580 0         0 $date .= substr($default, length $date);
581             }
582 10 50       50 $date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
583 10         73 $date = "$1:$2:$3 $4:$5:$6";
584 10 50       32 if ($7) {
585 10         23 my $tz = $7;
586 10 50       78 if ($tz =~ /^\s*Z/i) {
    50          
587             # ignore any "HH'mm'" after the Z (OS X 10.6 does this)
588 0         0 $date .= 'Z';
589             # tolerate some improper formatting in timezone specification
590             } elsif ($tz =~ /^\s*([-+])\s*(\d+)[': ]+(\d*)/) {
591 10   50     54 $date .= $1 . $2 . ':' . ($3 || '00');
592             }
593             }
594 10         23 return $date;
595             }
596              
597             #------------------------------------------------------------------------------
598             # Locate any object in the XRef tables (including compressed objects)
599             # Inputs: 0) XRef reference, 1) object reference string (or free object number)
600             # Returns: offset to object in file or compressed object reference string,
601             # 0 if object is free, or undefined on error
602             sub LocateAnyObject($$)
603             {
604 238     238 0 500 my ($xref, $ref) = @_;
605 238 50       580 return undef unless $xref;
606 238 100       811 return $$xref{$ref} if exists $$xref{$ref};
607             # get the object number
608 7 50       66 return undef unless $ref =~ /^(\d+)/;
609 7         27 my $objNum = $1;
610             # return 0 if the object number has been reused (old object is free)
611 7 100       37 return 0 if defined $$xref{$objNum};
612             #
613             # scan our XRef stream dictionaries for this object
614             #
615 1 50       7 return undef unless $$xref{dicts};
616 0         0 my $dict;
617 0         0 foreach $dict (@{$$xref{dicts}}) {
  0         0  
618             # quick check to see if the object is in the range for this xref stream
619 0 0       0 next if $objNum >= $$dict{Size};
620 0         0 my $index = $$dict{Index};
621 0 0       0 next if $objNum < $$index[0];
622             # scan the tables for the specified object
623 0         0 my $size = $$dict{_entry_size};
624 0         0 my $num = scalar(@$index) / 2;
625 0         0 my $tot = 0;
626 0         0 my $i;
627 0         0 for ($i=0; $i<$num; ++$i) {
628 0         0 my $start = $$index[$i*2];
629 0         0 my $count = $$index[$i*2+1];
630             # table is in ascending order, so quit if we have passed the object
631 0 0       0 last if $objNum < $start;
632 0 0       0 if ($objNum < $start + $count) {
633 0         0 my $offset = $size * ($objNum - $start + $tot);
634 0 0       0 last if $offset + $size > length $$dict{_stream};
635 0         0 my @c = unpack("x$offset C$size", $$dict{_stream});
636             # extract values from this table entry
637             # (can be 1, 2, 3, 4, etc.. bytes per value)
638 0         0 my (@t, $j, $k);
639 0         0 my $w = $$dict{W};
640 0         0 for ($j=0; $j<3; ++$j) {
641             # use default value if W entry is 0 (as per spec)
642             # - 0th element defaults to 1, others default to 0
643 0 0       0 $$w[$j] or $t[$j] = ($j ? 0 : 1), next;
    0          
644 0         0 $t[$j] = shift(@c);
645 0         0 for ($k=1; $k < $$w[$j]; ++$k) {
646 0         0 $t[$j] = 256 * $t[$j] + shift(@c);
647             }
648             }
649             # by default, use "o g R" as the xref key
650             # (o = object number, g = generation number)
651 0         0 my $ref2 = "$objNum $t[2] R";
652 0 0       0 if ($t[0] == 1) {
    0          
    0          
653             # normal object reference:
654             # $t[1]=offset of object from start, $t[2]=generation number
655 0         0 $$xref{$ref2} = $t[1];
656             } elsif ($t[0] == 2) {
657             # compressed object reference:
658             # $t[1]=stream object number, $t[2]=index of object in stream
659 0         0 $ref2 = "$objNum 0 R";
660 0         0 $$xref{$ref2} = "I$t[2] $t[1] 0 R";
661             } elsif ($t[0] == 0) {
662             # free object:
663             # $t[1]=next free object in linked list, $t[2]=generation number
664 0         0 $$xref{$ref2} = 0;
665             } else {
666             # treat as a null object
667 0         0 $$xref{$ref2} = undef;
668             }
669 0         0 $$xref{$objNum} = $t[1]; # remember offsets by object number too
670 0 0       0 return $$xref{$ref} if $ref eq $ref2;
671 0         0 return 0; # object is free or was reused
672             }
673 0         0 $tot += $count;
674             }
675             }
676 0         0 return undef;
677             }
678              
679             #------------------------------------------------------------------------------
680             # Locate a regular object in the XRef tables (does not include compressed objects)
681             # Inputs: 0) XRef reference, 1) object reference string (or free object number)
682             # Returns: offset to object in file, 0 if object is free,
683             # or undef on error or if object was compressed
684             sub LocateObject($$)
685             {
686 41     41 0 154 my ($xref, $ref) = @_;
687 41         137 my $offset = LocateAnyObject($xref, $ref);
688 41 50 66     271 return undef if $offset and $offset =~ /^I/;
689 41         197 return $offset;
690             }
691              
692             #------------------------------------------------------------------------------
693             # Check that the correct object is located at the specified file offset
694             # Inputs: 0) ExifTool ref, 1) object name, 2) object reference string, 3) file offset
695             # Returns: first non-blank line at start of object, or undef on error
696             sub CheckObject($$$$)
697             {
698 217     217 0 520 my ($et, $tag, $ref, $offset) = @_;
699 217         367 my ($data, $obj, $dat, $pat);
700              
701 217         383 my $raf = $$et{RAF};
702 217 50       772 $raf->Seek($offset+$$et{PDFBase}, 0) or $et->Warn("Bad $tag offset"), return undef;
703             # verify that we are reading the expected object
704 217         1091 ($obj = $ref) =~ s/R/obj/;
705 217         381 for (;;) {
706 217 50       656 $raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
707 217 50       4501 last if $data =~ s/^$obj//;
708 0 0       0 next if $data =~ /^\s+$/; # keep reading if this was a blank line
709             # handle cases where other whitespace characters are used in the object ID string
710 0         0 while ($data =~ /^\d+(\s+\d+)?\s*$/) {
711 0         0 $raf->ReadLine($dat);
712 0         0 $data .= $dat;
713             }
714 0         0 ($pat = $obj) =~ s/ /\\s+/g;
715 0 0       0 unless ($data =~ s/$pat//) {
716 0         0 $tag = ucfirst $tag;
717 0         0 $et->Warn("$tag object ($obj) not found at offset $offset");
718 0         0 return undef;
719             }
720 0         0 last;
721             }
722             # read the first line of data from the object (ignoring blank lines and comments)
723 217         543 for (;;) {
724 434 100 66     1930 last if $data =~ /\S/ and $data !~ /^\s*%/;
725 217 50       751 $raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
726             }
727 217         599 return $data;
728             }
729              
730             #------------------------------------------------------------------------------
731             # Fetch indirect object from file (from inside a stream if required)
732             # Inputs: 0) ExifTool object reference, 1) object reference string,
733             # 2) xref lookup, 3) object name (for warning messages)
734             # Returns: object data or undefined on error
735             # Notes: sets $lastFetched to the object reference, or undef if the object
736             # was extracted from an encrypted stream
737             sub FetchObject($$$$)
738             {
739 197     197 0 507 my ($et, $ref, $xref, $tag) = @_;
740 197         345 $lastFetched = $ref; # save this for decoding if necessary
741 197         470 my $offset = LocateAnyObject($xref, $ref);
742 197         337 $lastOffset = $offset;
743 197 100       567 unless ($offset) {
744 5 50       20 $et->Warn("Bad $tag reference") unless defined $offset;
745 5         15 return undef;
746             }
747 192         313 my ($data, $obj);
748 192 50       753 if ($offset =~ s/^I(\d+) //) {
749 0         0 my $index = $1; # object index in stream
750 0         0 my ($objNum) = split ' ', $ref; # save original object number
751 0         0 $ref = $offset; # now a reference to the containing stream object
752 0         0 $obj = $streamObjs{$ref};
753 0 0       0 unless ($obj) {
754             # don't try to load the same object stream twice
755 0 0       0 return undef if defined $obj;
756 0         0 $streamObjs{$ref} = '';
757             # load the parent object stream
758 0         0 $obj = FetchObject($et, $ref, $xref, $tag);
759             # make sure it contains everything we need
760 0 0 0     0 return undef unless defined $obj and ref($obj) eq 'HASH';
761 0 0 0     0 return undef unless $$obj{First} and $$obj{N};
762 0 0       0 return undef unless DecodeStream($et, $obj);
763             # add a special '_table' entry to this dictionary which contains
764             # the list of object number/offset pairs from the stream header
765 0         0 my $num = $$obj{N} * 2;
766 0         0 my @table = split ' ', $$obj{_stream}, $num;
767 0 0       0 return undef unless @table == $num;
768             # remove everything before first object in stream
769 0         0 $$obj{_stream} = substr($$obj{_stream}, $$obj{First});
770 0         0 $table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
771 0         0 $$obj{_table} = \@table;
772             # save the object stream so we don't have to re-load it later
773 0         0 $streamObjs{$ref} = $obj;
774             }
775             # verify that we have the specified object
776 0         0 my $i = 2 * $index;
777 0         0 my $table = $$obj{_table};
778 0 0 0     0 unless ($index < $$obj{N} and $$table[$i] == $objNum) {
779 0         0 $et->Warn("Bad index for stream object $tag");
780 0         0 return undef;
781             }
782             # extract the object at the specified index in the stream
783             # (offsets in table are in sequential order, so we can subtract from
784             # the next offset to get the object length)
785 0         0 $offset = $$table[$i + 1];
786 0   0     0 my $len = ($$table[$i + 3] || length($$obj{_stream})) - $offset;
787 0         0 $data = substr($$obj{_stream}, $offset, $len);
788             # avoid re-decrypting data in already decrypted streams
789 0 0       0 undef $lastFetched if $cryptStream;
790 0         0 return ExtractObject($et, \$data);
791             }
792             # load the start of the object
793 192         467 $data = CheckObject($et, $tag, $ref, $offset);
794 192 50       426 return undef unless defined $data;
795              
796 192         646 return ExtractObject($et, \$data, $$et{RAF}, $xref);
797             }
798              
799             #------------------------------------------------------------------------------
800             # Convert PDF value to something readable
801             # Inputs: 0) PDF object data
802             # Returns: converted object
803             sub ReadPDFValue($)
804             {
805 148     148 0 265 my $str = shift;
806             # decode all strings in an array
807 148 100       354 if (ref $str eq 'ARRAY') {
808             # create new list to not alter the original data when rewriting
809 12         39 my ($val, @vals);
810 12         34 foreach $val (@$str) {
811 20         70 push @vals, ReadPDFValue($val);
812             }
813 12         45 return \@vals;
814             }
815 136 50       323 length $str or return $str;
816 136         328 my $delim = substr($str, 0, 1);
817 136 100       510 if ($delim eq '(') { # literal string
    100          
    50          
818 58 50       363 $str = $1 if $str =~ /^.*?\((.*)\)/s; # remove brackets
819             # decode escape sequences in literal strings
820 58         187 while ($str =~ /\\(.)/sg) {
821 0         0 my $n = pos($str) - 2;
822 0         0 my $c = $1;
823 0         0 my $r;
824 0 0       0 if ($c =~ /[0-7]/) {
    0          
    0          
825             # get up to 2 more octal digits
826 0 0       0 $c .= $1 if $str =~ /\G([0-7]{1,2})/g;
827             # convert octal escape code
828 0         0 $r = chr(oct($c) & 0xff);
829             } elsif ($c eq "\x0d") {
830             # the string is continued if the line ends with '\'
831             # (also remove "\x0d\x0a")
832 0 0       0 $c .= $1 if $str =~ /\G(\x0a)/g;
833 0         0 $r = '';
834             } elsif ($c eq "\x0a") {
835 0         0 $r = '';
836             } else {
837             # convert escaped characters
838 0         0 ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
839             }
840 0         0 substr($str, $n, length($c)+1) = $r;
841             # continue search after this character
842 0         0 pos($str) = $n + length($r);
843             }
844 58 50       142 Crypt(\$str, $lastFetched) if $cryptString;
845             } elsif ($delim eq '<') { # hex string
846             # decode hex data
847 41         120 $str =~ tr/0-9A-Fa-f//dc;
848 41 50       112 $str .= '0' if length($str) & 0x01; # (by the spec)
849 41         174 $str = pack('H*', $str);
850 41 100       119 Crypt(\$str, $lastFetched) if $cryptString;
851             } elsif ($delim eq '/') { # name
852 0         0 $str = substr($str, 1);
853             # convert escape codes (PDF 1.2 or later)
854 0 0       0 $str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
  0         0  
855             }
856 136         336 return $str;
857             }
858              
859             #------------------------------------------------------------------------------
860             # Extract PDF object from combination of buffered data and file
861             # Inputs: 0) ExifTool object reference, 1) data reference,
862             # 2) optional raf reference, 3) optional xref table
863             # Returns: converted PDF object or undef on error
864             # a) dictionary object --> hash reference
865             # b) array object --> array reference
866             # c) indirect reference --> scalar reference
867             # d) string, name, integer, boolean, null --> scalar value
868             # - updates $$dataPt on return to contain unused data
869             # - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
870             # the stream data and a list of the tags (not including '_stream' and '_tags')
871             # in their original order
872             sub ExtractObject($$;$$)
873             {
874 754     754 0 1555 my ($et, $dataPt, $raf, $xref) = @_;
875 754         1157 my (@tags, $data, $objData);
876 754         1215 my $dict = { };
877 754         1115 my $delim;
878              
879 754         1001 for (;;) {
880 824 100       2975 if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/s) {
    50          
881 754         1654 $delim = $1;
882 754         1598 $$dataPt =~ s/^\s+//; # remove leading white space
883 754         1549 $objData = ReadToNested($dataPt, $raf);
884 754 50       1532 return undef unless defined $objData;
885 754         1103 last;
886             } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{}s) {
887             #
888             # extract boolean, numerical, string, name, null object or indirect reference
889             #
890 0         0 $objData = $1;
891             # look for an indirect reference
892 0 0 0     0 if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
893 0         0 $objData .= "$1 R";
894 0         0 $objData = \$objData; # return scalar reference
895             }
896 0         0 return $objData; # return simple scalar or scalar reference
897             }
898 70 50 33     299 $raf and $raf->ReadLine($data) or return undef;
899 70         165 $$dataPt .= $data;
900             }
901             #
902             # return literal string or hex string without parsing
903             #
904 754 100 100     2966 if ($delim eq '(' or $delim eq '<') {
    100          
905 160         486 return $objData;
906             #
907             # extract array
908             #
909             } elsif ($delim eq '[') {
910 167 50       789 $objData =~ /^.*?\[(.*)\]/s or return undef;
911 167         396 my $data = $1; # brackets removed
912 167         273 my @list;
913 167         248 for (;;) {
914 594 100       1794 last unless $data =~ m{\s*(\S[^[(/<>\s]*)}sg;
915 427         952 my $val = $1;
916 427 100       1479 if ($val =~ /^(<{1,2}|\[|\()/) {
    100          
917 78         164 my $pos = pos($data) - length($val);
918             # nested dict, array, literal string or hex string
919 78         164 my $buff = substr($data, $pos);
920 78         163 $val = ReadToNested(\$buff);
921 78 50       229 last unless defined $val;
922 78         272 pos($data) = $pos + length($val);
923 78         218 $val = ExtractObject($et, \$val);
924             } elsif ($val =~ /^\d/) {
925 245         368 my $pos = pos($data);
926 245 100       614 if ($data =~ /\G\s+(\d+)\s+R/g) {
927 37         179 $val = \ "$val $1 R"; # make a reference
928             } else {
929 208         390 pos($data) = $pos;
930             }
931             }
932 427         989 push @list, $val;
933             }
934 167         563 return \@list;
935             }
936             #
937             # extract dictionary
938             #
939             # Note: entries are not necessarily separated by whitespace (doh!)
940             # eg) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
941             # Also, they may be separated by a comment (eg. "/Tag%comment\nValue"),
942             # but comments have already been removed
943 427         2335 while ($objData =~ m{(\s*)/([^/[\]()<>{}\s]+)\s*(\S[^[(/<>\s]*)}sg) {
944 1229         2781 my $tag = $2;
945 1229         2303 my $val = $3;
946 1229 100       3861 if ($val =~ /^(<{1,2}|\[|\()/) {
    100          
947             # nested dict, array, literal string or hex string
948 396         1125 $objData = substr($objData, pos($objData)-length($val));
949 396         835 $val = ReadToNested(\$objData, $raf);
950 396 50       841 last unless defined $val;
951 396         893 $val = ExtractObject($et, \$val);
952 396         810 pos($objData) = 0;
953             } elsif ($val =~ /^\d/) {
954 618         972 my $pos = pos($objData);
955 618 100       1956 if ($objData =~ /\G\s+(\d+)\s+R/sg) {
956 416         1370 $val = \ "$val $1 R"; # make a reference
957             } else {
958 202         450 pos($objData) = $pos;
959             }
960             }
961 1229 50       2624 if ($$dict{$tag}) {
962             # duplicate dictionary entries are not allowed
963 0         0 $et->Warn("Duplicate '${tag}' entry in dictionary (ignored)");
964             } else {
965             # save the entry
966 1229         2323 push @tags, $tag;
967 1229         5661 $$dict{$tag} = $val;
968             }
969             }
970 427 50       1011 return undef unless @tags;
971 427         916 $$dict{_tags} = \@tags;
972 427 100       1114 return $dict unless $raf; # direct objects can not have streams
973             #
974             # extract the stream object
975             #
976             # dictionary must specify stream Length
977 262 100       1085 my $length = $$dict{Length} or return $dict;
978 43 100       174 if (ref $length) {
979 25         58 $length = $$length;
980 25         95 my $oldpos = $raf->Tell();
981             # get the location of the object specifying the length
982             # (compressed objects are not allowed)
983 25 50       120 my $offset = LocateObject($xref, $length) or return $dict;
984 25 50       81 $offset or $et->Warn('Bad stream Length object'), return $dict;
985 25         69 $data = CheckObject($et, 'stream Length', $length, $offset);
986 25 50       107 defined $data or return $dict;
987 25 50       149 $data =~ /^\s*(\d+)/ or $et->Warn('Stream Length not found'), return $dict;
988 25         76 $length = $1;
989 25         77 $raf->Seek($oldpos, 0); # restore position to start of stream
990             }
991             # extract the trailing stream data
992 43         119 for (;;) {
993             # find the stream token
994 86 100       407 if ($$dataPt =~ /(\S+)/) {
995 43 50       241 last unless $1 eq 'stream';
996             # read an extra line because it may contain our \x0a
997 43 50       166 $$dataPt .= $data if $raf->ReadLine($data);
998             # remove our stream header
999 43         442 $$dataPt =~ s/^\s*stream(\x0a|\x0d\x0a)//s;
1000 43         207 my $more = $length - length($$dataPt);
1001 43 100       176 if ($more > 0) {
    50          
1002 28 50       103 unless ($raf->Read($data, $more) == $more) {
1003 0         0 $et->Warn('Error reading stream data');
1004 0         0 $$dataPt = '';
1005 0         0 return $dict;
1006             }
1007 28         211 $$dict{_stream} = $$dataPt . $data;
1008 28         69 $$dataPt = '';
1009             } elsif ($more < 0) {
1010 15         136 $$dict{_stream} = substr($$dataPt, 0, $length);
1011 15         72 $$dataPt = substr($$dataPt, $length);
1012             } else {
1013 0         0 $$dict{_stream} = $$dataPt;
1014 0         0 $$dataPt = '';
1015             }
1016 43         141 last;
1017             }
1018 43 50       195 $raf->ReadLine($data) or last;
1019 43         194 $$dataPt .= $data;
1020             }
1021 43         176 return $dict;
1022             }
1023              
1024             #------------------------------------------------------------------------------
1025             # Read to nested delimiter
1026             # Inputs: 0) data reference, 1) optional raf reference
1027             # Returns: data up to and including matching delimiter (or undef on error)
1028             # - updates data reference with trailing data
1029             # - unescapes characters in literal strings
1030             my %closingDelim = ( # lookup for matching delimiter
1031             '(' => ')',
1032             '[' => ']',
1033             '<' => '>',
1034             '<<' => '>>',
1035             );
1036             sub ReadToNested($;$)
1037             {
1038 1228     1228 0 2259 my ($dataPt, $raf) = @_;
1039 1228         2325 my @delim = (''); # closing delimiter list, most deeply nested first
1040 1228         2711 pos($$dataPt) = 0; # begin at start of data
1041 1228         2126 for (;;) {
1042 5744 100       37279 unless ($$dataPt =~ /(\\*)(\(|\)|<{1,2}|>{1,2}|\[|\]|%)/g) {
1043             # must read some more data
1044 1148         1521 my $buff;
1045 1148 50 33     3357 last unless $raf and $raf->ReadLine($buff);
1046 1148         2785 $$dataPt .= $buff;
1047 1148         2432 pos($$dataPt) = length($$dataPt) - length($buff);
1048 1148         1993 next;
1049             }
1050             # are we in a literal string?
1051 4596 100       11431 if ($delim[0] eq ')') {
    50          
1052             # ignore escaped delimiters (preceded by odd number of \'s)
1053 434 50       977 next if length($1) & 0x01;
1054             # ignore all delimiters but unescaped braces
1055 434 50 33     1576 next unless $2 eq '(' or $2 eq ')';
1056             } elsif ($2 eq '%') {
1057             # ignore the comment
1058 0         0 my $pos = pos($$dataPt) - 1;
1059             # remove everything from '%' up to but not including newline
1060 0         0 $$dataPt =~ /.*/g;
1061 0         0 my $end = pos($$dataPt);
1062 0         0 $$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
1063 0         0 pos($$dataPt) = $pos;
1064 0         0 next;
1065             }
1066 4596 100       9196 if ($closingDelim{$2}) {
1067             # push the corresponding closing delimiter
1068 2298         4776 unshift @delim, $closingDelim{$2};
1069 2298         3371 next;
1070             }
1071 2298 50       4428 unless ($2 eq $delim[0]) {
1072             # handle the case where we find a ">>>" and interpret it
1073             # as ">> >" instead of "> >>"
1074 0 0 0     0 next unless $2 eq '>>' and $delim[0] eq '>';
1075 0         0 pos($$dataPt) = pos($$dataPt) - 1;
1076             }
1077 2298         3135 shift @delim; # remove from nesting list
1078 2298 100       4604 next if $delim[0]; # keep going if we have more nested delimiters
1079 1228         1949 my $pos = pos($$dataPt);
1080 1228         2410 my $buff = substr($$dataPt, 0, $pos);
1081 1228         2408 $$dataPt = substr($$dataPt, $pos);
1082 1228         2856 return $buff; # success!
1083             }
1084 0         0 return undef; # didn't find matching delimiter
1085             }
1086              
1087             #------------------------------------------------------------------------------
1088             # Decode LZW-encoded data (ref 1)
1089             # Inputs: 0) data reference
1090             # Returns: true on success and data is decoded, or false and data is untouched
1091             sub DecodeLZW($)
1092             {
1093 0     0 0 0 my $dataPt = shift;
1094 0 0       0 return 0 if length $$dataPt < 4;
1095 0         0 my @lzw = (map(chr, 0..255), undef, undef); # LZW code table
1096 0         0 my $mask = 0x01ff; # mask for least-significant 9 bits
1097 0         0 my @dat = unpack 'n*', $$dataPt . "\0";
1098 0         0 my $word = ($dat[0] << 16) | $dat[1];
1099 0         0 my ($bit, $pos, $bits, $out) = (0, 2, 9, '');
1100 0         0 my $lastVal;
1101 0         0 for (;;) {
1102             # bits are packed MSB first in PDF LZW (the PDF spec doesn't mention this)
1103 0         0 my $shift = 32 - ($bit + $bits);
1104 0 0       0 if ($shift < 0) {
1105 0 0       0 return 0 if $pos >= @dat; # missing EOD marker
1106 0         0 $word = (($word & 0xffff) << 16) | $dat[$pos++]; # read next word
1107 0         0 $bit -= 16;
1108 0         0 $shift += 16;
1109             };
1110 0         0 my $code = ($word >> $shift) & $mask;
1111 0         0 $bit += $bits;
1112 0         0 my $val = $lzw[$code];
1113 0 0       0 if (defined $val) {
    0          
    0          
    0          
1114             # store new code as previous sequence plus 1st char of new sequence
1115 0 0       0 push @lzw, $lastVal . substr($val, 0, 1) if defined $lastVal;
1116             } elsif ($code == @lzw) { # new code
1117 0 0       0 return 0 unless defined $lastVal;
1118             # we are using the code that we are about to generate, so the last
1119             # character in the new sequence must be the same as the first
1120             # character in the previous sequence (makes sense if you think about it)
1121 0         0 $val = $lastVal . substr($lastVal, 0, 1);
1122 0         0 push @lzw, $val;
1123             } elsif ($code == 256) { # clear table
1124 0         0 splice @lzw, 258;
1125 0         0 $bits = 9;
1126 0         0 $mask = 0x1ff;
1127 0         0 undef $lastVal;
1128 0         0 next;
1129             } elsif ($code == 257) { # EOD marker
1130 0         0 last; # all done!
1131             } else {
1132 0         0 return 0;
1133             }
1134 0         0 $out .= $val; # add this byte sequence to the output
1135             # we added a new entry to the LZW table, so we must increase
1136             # the bit width if necessary, up to a maximum of 12
1137 0 0 0     0 @lzw >= $mask and $bits < 12 and ++$bits, $mask |= $mask << 1;
1138 0         0 $lastVal = $val;
1139             }
1140 0         0 $$dataPt = $out; # return decompressed data
1141 0         0 return 1;
1142             }
1143              
1144             #------------------------------------------------------------------------------
1145             # Decode filtered stream
1146             # Inputs: 0) ExifTool object reference, 1) dictionary reference
1147             # Returns: true if stream has been decoded OK
1148             sub DecodeStream($$)
1149             {
1150 43     43 0 92 local $_;
1151 43         106 my ($et, $dict) = @_;
1152              
1153 43 50       182 return 0 unless $$dict{_stream}; # no stream to decode
1154              
1155             # get list of filters
1156 43         108 my (@filters, @decodeParms, $filter);
1157 43 50       242 if (ref $$dict{Filter} eq 'ARRAY') {
    50          
1158 0         0 @filters = @{$$dict{Filter}};
  0         0  
1159             } elsif (defined $$dict{Filter}) {
1160 0         0 @filters = ($$dict{Filter});
1161             }
1162             # be sure we can process all the filters before we take the time to do the decryption
1163 43         126 foreach $filter (@filters) {
1164 0 0       0 next if $supportedFilter{$filter};
1165 0         0 $et->WarnOnce("Unsupported Filter $filter");
1166 0         0 return 0;
1167             }
1168             # apply decryption first if required (and if the default encryption
1169             # has not been overridden by a Crypt filter. Note: the Crypt filter
1170             # must be first in the Filter array: ref 3, page 38)
1171 43 50 33     225 unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
      33        
1172 43         138 CryptStream($dict, $lastFetched);
1173             }
1174 43 50       205 return 1 unless $$dict{Filter}; # Filter entry is mandatory
1175 0 0       0 return 0 if defined $$dict{_filtered}; # avoid double-filtering
1176 0         0 $$dict{_filtered} = 1; # set flag to prevent double-filtering
1177              
1178             # get array of DecodeParms dictionaries
1179 0 0       0 if (ref $$dict{DecodeParms} eq 'ARRAY') {
1180 0         0 @decodeParms = @{$$dict{DecodeParms}};
  0         0  
1181             } else {
1182 0         0 @decodeParms = ($$dict{DecodeParms});
1183             }
1184              
1185 0         0 foreach $filter (@filters) {
1186 0         0 my $decodeParms = shift @decodeParms;
1187              
1188 0 0       0 if ($filter eq '/FlateDecode') {
    0          
    0          
    0          
    0          
1189             # make sure we support the predictor (if used) before decoding
1190 0         0 my $pre;
1191 0 0       0 if (ref $decodeParms eq 'HASH') {
1192 0         0 $pre = $$decodeParms{Predictor};
1193 0 0 0     0 if ($pre and $pre ne '1' and $pre ne '12') {
      0        
1194 0         0 $et->WarnOnce("FlateDecode Predictor $pre currently not supported");
1195 0         0 return 0;
1196             }
1197             }
1198 0 0       0 if (eval { require Compress::Zlib }) {
  0         0  
1199 0         0 my $inflate = Compress::Zlib::inflateInit();
1200 0         0 my ($buff, $stat);
1201 0 0       0 $inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
1202 0 0 0     0 if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
1203 0         0 $$dict{_stream} = $buff;
1204             } else {
1205 0         0 $et->Warn('Error inflating stream');
1206 0         0 return 0;
1207             }
1208             } else {
1209 0         0 $et->WarnOnce('Install Compress::Zlib to process filtered streams');
1210 0         0 return 0;
1211             }
1212 0 0 0     0 next unless $pre and $pre eq '12'; # 12 = 'up' prediction
1213              
1214             # apply anti-predictor
1215 0         0 my $cols = $$decodeParms{Columns};
1216 0 0       0 unless ($cols) {
1217             # currently only support 'up' prediction
1218 0         0 $et->WarnOnce('No Columns for decoding stream');
1219 0         0 return 0;
1220             }
1221 0         0 my @bytes = unpack('C*', $$dict{_stream});
1222 0         0 my @pre = (0) x $cols; # initialize predictor array
1223 0         0 my $buff = '';
1224 0         0 while (@bytes > $cols) {
1225 0 0       0 unless (($_ = shift @bytes) == 2) {
1226 0         0 $et->WarnOnce("Unsupported PNG filter $_"); # (yes, PNG)
1227 0         0 return 0;
1228             }
1229 0         0 foreach (@pre) {
1230 0         0 $_ = ($_ + shift(@bytes)) & 0xff;
1231             }
1232 0         0 $buff .= pack('C*', @pre);
1233             }
1234 0         0 $$dict{_stream} = $buff;
1235              
1236             } elsif ($filter eq '/Crypt') {
1237              
1238             # (we shouldn't have to check the _decrypted flag since we
1239             # already checked the _filtered flag, but what the heck...)
1240 0 0       0 next if defined $$dict{_decrypted};
1241             # assume Identity filter (the default) if DecodeParms are missing
1242 0 0       0 next unless ref $decodeParms eq 'HASH';
1243 0         0 my $name = $$decodeParms{Name};
1244 0 0 0     0 next unless defined $name or $name eq 'Identity';
1245 0 0       0 if ($name ne 'StdCF') {
1246 0         0 $et->WarnOnce("Unsupported Crypt Filter $name");
1247 0         0 return 0;
1248             }
1249 0 0       0 unless ($cryptInfo) {
1250 0         0 $et->WarnOnce('Missing Encrypt StdCF entry');
1251 0         0 return 0;
1252             }
1253             # decrypt the stream manually because we want to:
1254             # 1) ignore $cryptStream (StmF) setting
1255             # 2) ignore EncryptMetadata setting (I can't find mention of how to
1256             # reconcile this in the spec., but this would make sense)
1257             # 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
1258             # 4) set _decrypted flag so we will recrypt according to StmF when
1259             # writing (since we don't yet write Filter'd streams)
1260 0         0 Crypt(\$$dict{_stream}, 'none');
1261 0 0       0 $$dict{_decrypted} = ($cryptStream ? 1 : 0);
1262              
1263             } elsif ($filter eq '/LZWDecode') {
1264              
1265             # make sure we don't have any unsupported decoding parameters
1266 0 0       0 if (ref $decodeParms eq 'HASH') {
1267 0 0       0 if ($$decodeParms{Predictor}) {
    0          
1268 0         0 $et->WarnOnce("LZWDecode Predictor $$decodeParms{Predictor} currently not supported");
1269 0         0 return 0;
1270             } elsif ($$decodeParms{EarlyChange}) {
1271 0         0 $et->WarnOnce("LZWDecode EarlyChange currently not supported");
1272 0         0 return 0;
1273             }
1274             }
1275 0 0       0 unless (DecodeLZW(\$$dict{_stream})) {
1276 0         0 $et->WarnOnce('LZW decompress error');
1277 0         0 return 0;
1278             }
1279              
1280             } elsif ($filter eq '/ASCIIHexDecode') {
1281              
1282 0         0 $$dict{_stream} =~ s/>.*//; # truncate at '>' (end of data mark)
1283 0         0 $$dict{_stream} =~ tr/0-9a-zA-Z//d; # remove illegal characters
1284 0         0 $$dict{_stream} = pack 'H*', $$dict{_stream};
1285              
1286             } elsif ($filter eq '/ASCII85Decode') {
1287              
1288 0         0 my ($err, @out, $i);
1289 0         0 my ($n, $val) = (0, 0);
1290 0         0 foreach (split //, $$dict{_stream}) {
1291 0 0 0     0 if ($_ ge '!' and $_ le 'u') {;
    0          
    0          
1292 0         0 $val = 85 * $val + ord($_) - 33;
1293 0 0       0 next unless ++$n == 5;
1294             } elsif ($_ eq '~') {
1295 0 0       0 $n == 1 and $err = 1; # error to have a single char in the last group of 5
1296 0         0 for ($i=$n; $i<5; ++$i) { $val *= 85; }
  0         0  
1297             } elsif ($_ eq 'z') {
1298 0 0       0 $n and $err = 2, last; # error if 'z' isn't the first char
1299 0         0 $n = 5;
1300             } else {
1301 0 0       0 next if /^\s$/; # ignore white space
1302 0         0 $err = 3, last; # any other character is an error
1303             }
1304 0         0 $val = unpack('V', pack('N', $val)); # reverse byte order
1305 0         0 while (--$n > 0) {
1306 0         0 push @out, $val & 0xff;
1307 0         0 $val >>= 8;
1308             }
1309 0 0       0 last if $_ eq '~';
1310             # (both $n and $val are zero again now)
1311             }
1312 0 0       0 $err and $et->WarnOnce("ASCII85Decode error $err");
1313 0         0 $$dict{_stream} = pack('C*', @out);
1314             }
1315             }
1316 0         0 return 1;
1317             }
1318              
1319             #------------------------------------------------------------------------------
1320             # Initialize state for RC4 en/decryption (ref 2)
1321             # Inputs: 0) RC4 key string
1322             # Returns: RC4 key hash reference
1323             sub RC4Init($)
1324             {
1325 22     22 0 58 my @key = unpack('C*', shift);
1326 22         208 my @state = (0 .. 255);
1327 22         50 my ($i, $j) = (0, 0);
1328 22         52 while ($i < 256) {
1329 5632         7214 my $st = $state[$i];
1330 5632         8169 $j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
1331 5632         7454 $state[$i++] = $state[$j];
1332 5632         9119 $state[$j] = $st;
1333             }
1334 22         119 return { State => \@state, XY => [ 0, 0 ] };
1335             }
1336              
1337             #------------------------------------------------------------------------------
1338             # Apply RC4 en/decryption (ref 2)
1339             # Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
1340             # - can call this method directly with a key string, or with with the key
1341             # reference returned by RC4Init
1342             # - RC4 is a symmetric algorithm, so encryption is the same as decryption
1343             sub RC4Crypt($$)
1344             {
1345 22     22 0 44 my ($dataPt, $key) = @_;
1346 22 50       62 $key = RC4Init($key) unless ref $key eq 'HASH';
1347 22         37 my $state = $$key{State};
1348 22         32 my ($x, $y) = @{$$key{XY}};
  22         51  
1349              
1350 22         70 my @data = unpack('C*', $$dataPt);
1351 22         48 foreach (@data) {
1352 356         478 $x = ($x + 1) & 0xff;
1353 356         457 my $stx = $$state[$x];
1354 356         451 $y = ($stx + $y) & 0xff;
1355 356         481 my $sty = $$state[$x] = $$state[$y];
1356 356         435 $$state[$y] = $stx;
1357 356         573 $_ ^= $$state[($stx + $sty) & 0xff];
1358             }
1359 22         70 $$key{XY} = [ $x, $y ];
1360 22         191 $$dataPt = pack('C*', @data);
1361             }
1362              
1363             #------------------------------------------------------------------------------
1364             # Update AES cipher with a bit of data
1365             # Inputs: 0) data
1366             # Returns: encrypted data
1367             my $cipherMore;
1368             sub CipherUpdate($)
1369             {
1370 0     0 0 0 my $dat = shift;
1371 0         0 my $pos = 0;
1372 0 0       0 $dat = $cipherMore . $dat if length $dat;
1373 0         0 while ($pos + 16 <= length($dat)) {
1374 0         0 substr($dat,$pos,16) = Image::ExifTool::AES::Cipher(substr($dat,$pos,16));
1375 0         0 $pos += 16;
1376             }
1377 0 0       0 if ($pos < length $dat) {
1378 0         0 $cipherMore = substr($dat,$pos);
1379 0         0 $dat = substr($dat,0,$pos);
1380             } else {
1381 0         0 $cipherMore = '';
1382             }
1383 0         0 return $dat;
1384             }
1385              
1386             #------------------------------------------------------------------------------
1387             # Get encrypted hash
1388             # Inputs: 0) Password, 1) salt, 2) vector, 3) encryption revision
1389             # Returns: hash
1390             sub GetHash($$$$)
1391             {
1392 6     6 0 24 my ($password, $salt, $vector, $rev) = @_;
1393              
1394             # return Rev 5 hash
1395 6 50       52 return Digest::SHA::sha256($password, $salt, $vector) if $rev == 5;
1396              
1397             # compute Rev 6 hardened hash
1398             # (ref http://code.google.com/p/origami-pdf/source/browse/lib/origami/encryption.rb)
1399 0         0 my $blockSize = 32;
1400 0         0 my $input = Digest::SHA::sha256($password, $salt, $vector) . ("\0" x 32);
1401 0         0 my $key = substr($input, 0, 16);
1402 0         0 my $iv = substr($input, 16, 16);
1403 0         0 my $h;
1404 0         0 my $x = '';
1405 0         0 my $i = 0;
1406 0   0     0 while ($i < 64 or $i < ord(substr($x,-1,1))+32) {
1407              
1408 0         0 my $block = substr($input, 0, $blockSize);
1409 0         0 $x = '';
1410 0         0 Image::ExifTool::AES::Crypt(\$x, $key, $iv, 1);
1411 0         0 $cipherMore = '';
1412              
1413 0         0 my ($j, $digest);
1414 0         0 for ($j=0; $j<64; ++$j) {
1415 0         0 $x = '';
1416 0 0       0 $x .= CipherUpdate($password) if length $password;
1417 0         0 $x .= CipherUpdate($block);
1418 0 0       0 $x .= CipherUpdate($vector) if length $vector;
1419 0 0       0 if ($j == 0) {
1420 0         0 my @a = unpack('C16', $x);
1421 0         0 my $sum = 0;
1422 0         0 $sum += $_ foreach @a;
1423             # set SHA block size (32, 48 or 64 bytes = SHA-256, 384 or 512)
1424 0         0 $blockSize = 32 + ($sum % 3) * 16;
1425 0         0 $digest = Digest::SHA->new($blockSize * 8);
1426             }
1427 0         0 $digest->add($x);
1428             }
1429              
1430 0         0 $h = $digest->digest();
1431 0         0 $key = substr($h, 0, 16);
1432 0         0 substr($input,0,16) = $h;
1433 0         0 $iv = substr($h, 16, 16);
1434 0         0 ++$i;
1435             }
1436 0         0 return substr($h, 0, 32);
1437             }
1438              
1439             #------------------------------------------------------------------------------
1440             # Initialize decryption
1441             # Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
1442             # 2) ID from file trailer dictionary
1443             # Returns: error string or undef on success (and sets $cryptInfo)
1444             sub DecryptInit($$$)
1445             {
1446 4     4 0 31 local $_;
1447 4         14 my ($et, $encrypt, $id) = @_;
1448              
1449 4         8 undef $cryptInfo;
1450 4 50 33     22 unless ($encrypt and ref $encrypt eq 'HASH') {
1451 0         0 return 'Error loading Encrypt object';
1452             }
1453 4         10 my $filt = $$encrypt{Filter};
1454 4 50 33     38 unless ($filt and $filt =~ s/^\///) {
1455 0         0 return 'Encrypt dictionary has no Filter!';
1456             }
1457             # extract some interesting tags
1458 4   50     17 my $ver = $$encrypt{V} || 0;
1459 4   100     15 my $rev = $$encrypt{R} || 0;
1460 4         12 my $enc = "$filt V$ver";
1461 4 50       16 $enc .= ".$rev" if $filt eq 'Standard';
1462 4 50 33     12 $enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
1463 4 50 100     26 $enc .= ' (' . ($$encrypt{Length} || 40) . '-bit)' if $filt eq 'Standard';
1464 4         13 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
1465 4         21 $et->HandleTag($tagTablePtr, 'Filter', $enc);
1466 4 50       22 if ($filt ne 'Standard') {
    50          
1467 0         0 return "Encryption filter $filt currently not supported";
1468             } elsif (not defined $$encrypt{R}) {
1469 0         0 return 'Standard security handler missing revision';
1470             }
1471 4 50 33     29 unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
      33        
1472 0         0 return 'Incomplete Encrypt specification';
1473             }
1474 4 50       43 if ("$ver.$rev" >= 5.6) {
1475             # apologize for poor performance (AES is a pure Perl implementation)
1476 0         0 $et->Warn('Decryption is very slow for encryption V5.6 or higher', 3);
1477             }
1478 4         18 $et->HandleTag($tagTablePtr, 'P', $$encrypt{P});
1479              
1480 4         12 my %parm; # optional parameters extracted from Encrypt dictionary
1481              
1482 4 100 66     31 if ($ver == 1 or $ver == 2) {
    50 66        
1483 1         3 $cryptString = $cryptStream = 1;
1484             } elsif ($ver == 4 or $ver == 5) {
1485             # initialize our $cryptString and $cryptStream flags
1486 3         8 foreach ('StrF', 'StmF') {
1487 6 100       15 my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
1488 6         16 $$flagPt = $$encrypt{$_};
1489 6 50 33     24 undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
1490 6 50 33     23 return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
1491             }
1492 3 50 33     11 if ($cryptString or $cryptStream) {
1493             return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
1494 3 50 33     27 ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
      33        
1495 3         8 my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
1496 3 50       19 unless ($cryptMeth =~ /^\/(V2|AESV2|AESV3)$/) {
1497 0         0 return "Unsupported encryption method $cryptMeth";
1498             }
1499             # set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
1500 3 50       25 $$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2|AESV3)$/;
1501             }
1502 3 100       11 if ($ver == 5) {
1503             # validate OE and UE entries
1504 2         6 foreach ('OE', 'UE') {
1505 4 50       12 return "Missing Encrypt $_ entry" unless $$encrypt{$_};
1506 4         12 $parm{$_} = ReadPDFValue($$encrypt{$_});
1507 4 50       12 return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
1508             }
1509 2         14 require Image::ExifTool::AES; # will need this later
1510             }
1511             } else {
1512 0         0 return "Encryption version $ver currently not supported";
1513             }
1514 4 50       14 $id or return "Can't decrypt (no document ID)";
1515              
1516             # make sure we have the necessary libraries available
1517 4 100       13 if ($ver < 5) {
1518 2 50       4 unless (eval { require Digest::MD5 }) {
  2         20  
1519 0         0 return "Install Digest::MD5 to process encrypted PDF";
1520             }
1521             } else {
1522 2 50       6 unless (eval { require Digest::SHA }) {
  2         12  
1523 0         0 return "Install Digest::SHA to process AES-256 encrypted PDF";
1524             }
1525             }
1526              
1527             # calculate file-level en/decryption key
1528 4         12 my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
1529             "\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
1530 4         15 my $o = ReadPDFValue($$encrypt{O});
1531 4         14 my $u = ReadPDFValue($$encrypt{U});
1532              
1533             # set flag indicating whether metadata is encrypted
1534             # (in version 4 and higher, metadata streams may not be encrypted)
1535 4 100 100     55 if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
      66        
1536 3         8 $$encrypt{_meta} = 1;
1537             }
1538             # try no password first, then try provided password if available
1539 4         11 my ($try, $key);
1540 4         12 for ($try=0; ; ++$try) {
1541 5         11 my $password;
1542 5 100       16 if ($try == 0) {
    50          
1543 4         9 $password = '';
1544             } elsif ($try == 1) {
1545 1         6 $password = $et->Options('Password');
1546 1 50       5 return 'Document is password protected (use Password option)' unless defined $password;
1547             # make sure there is no UTF-8 flag on the password
1548 1 50 33     6 if ($] >= 5.006 and (eval { require Encode; Encode::is_utf8($password) } or $@)) {
      33        
1549             # repack by hand if Encode isn't available
1550 0 0       0 $password = $@ ? pack('C*',unpack($] < 5.010000 ? 'U0C*' : 'C0C*',$password)) : Encode::encode('utf8',$password);
    0          
1551             }
1552             } else {
1553 0         0 return 'Incorrect password';
1554             }
1555 5 100       21 if ($ver < 5) {
1556 2 50       6 if (length $password) {
1557             # password must be encoding in PDFDocEncoding (ref iso32000)
1558 0         0 $password = $et->Encode($password, 'PDFDoc');
1559             # truncate or pad the password to exactly 32 bytes
1560 0 0       0 if (length($password) > 32) {
    0          
1561 0         0 $password = substr($password, 0, 32);
1562             } elsif (length($password) < 32) {
1563 0         0 $password .= substr($pad, 0, 32-length($password));
1564             }
1565             } else {
1566 2         5 $password = $pad;
1567             }
1568 2         14 $key = $password . $o . pack('V', $$encrypt{P}) . $id;
1569 2         6 my $rep = 1;
1570 2 100 66     14 if ($rev == 3 or $rev == 4) {
1571             # must add this if metadata not encrypted
1572 1 50       7 $key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
1573 1         5 $rep += 50; # repeat MD5 50 more times if revision is 3 or greater
1574             }
1575 2         6 my ($len, $i, $dat);
1576 2 100       7 if ($ver == 1) {
1577 1         2 $len = 5;
1578             } else {
1579 1   50     6 $len = $$encrypt{Length} || 40;
1580 1 50       4 $len >= 40 or return 'Bad Encrypt Length';
1581 1         4 $len = int($len / 8);
1582             }
1583 2         11 for ($i=0; $i<$rep; ++$i) {
1584 52         149 $key = substr(Digest::MD5::md5($key), 0, $len);
1585             }
1586             # decrypt U to see if a user password is required
1587 2 100       15 if ($rev >= 3) {
1588 1         4 $dat = Digest::MD5::md5($pad . $id);
1589 1         7 RC4Crypt(\$dat, $key);
1590 1         5 for ($i=1; $i<=19; ++$i) {
1591 19         49 my @key = unpack('C*', $key);
1592 19         33 foreach (@key) { $_ ^= $i; }
  304         399  
1593 19         57 RC4Crypt(\$dat, pack('C*', @key));
1594             }
1595 1         6 $dat .= substr($u, 16);
1596             } else {
1597 1         2 $dat = $pad;
1598 1         5 RC4Crypt(\$dat, $key);
1599             }
1600 2 50       12 last if $dat eq $u; # all done if this was the correct key
1601             } else {
1602 3 50 33     16 return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
1603 3 100       12 if (length $password) {
1604             # Note: this should be good for passwords containing reasonable characters,
1605             # but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
1606             # of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
1607 1         7 $password = $et->Encode($password, 'UTF8');
1608 1 50       3 $password = substr($password, 0, 127) if length($password) > 127;
1609             }
1610             # test for the owner password
1611 3         16 my $sha = GetHash($password, substr($o,32,8), substr($u,0,48), $rev);
1612 3 100       14 if ($sha eq substr($o, 0, 32)) {
1613 2         10 $key = GetHash($password, substr($o,40,8), substr($u,0,48), $rev);
1614 2         9 my $dat = ("\0" x 16) . $parm{OE};
1615             # decrypt with no padding
1616 2         8 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1617 2 50       8 return $err if $err;
1618 2         5 $key = $dat; # use this as the file decryption key
1619 2         6 last;
1620             }
1621             # test for the user password
1622 1         5 $sha = GetHash($password, substr($u,32,8), '', $rev);
1623 1 50       7 if ($sha eq substr($u, 0, 32)) {
1624 0         0 $key = GetHash($password, substr($u,40,8), '', $rev);
1625 0         0 my $dat = ("\0" x 16) . $parm{UE};
1626 0         0 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1627 0 0       0 return $err if $err;
1628 0         0 $key = $dat; # use this as the file decryption key
1629 0         0 last;
1630             }
1631             }
1632             }
1633 4         14 $$encrypt{_key} = $key; # save the file-level encryption key
1634 4         11 $cryptInfo = $encrypt; # save reference to the file-level Encrypt object
1635 4         22 return undef; # success!
1636             }
1637              
1638             #------------------------------------------------------------------------------
1639             # Decrypt/Encrypt data
1640             # Inputs: 0) data ref
1641             # 1) PDF object reference to use as crypt key extension (may be 'none' to
1642             # avoid extending the encryption key, as for streams with Crypt Filter)
1643             # 2) encrypt flag (false for decryption)
1644             sub Crypt($$;$)
1645             {
1646 29 100   29 0 118 return unless $cryptInfo;
1647 4         16 my ($dataPt, $keyExt, $encrypt) = @_;
1648             # do not decrypt if the key extension object is undefined
1649             # (this doubles as a flag to disable decryption/encryption)
1650 4 50       12 return unless defined $keyExt;
1651 4         9 my $key = $$cryptInfo{_key};
1652             # apply the necessary crypt key extension
1653 4 100       12 unless ($$cryptInfo{_aesv3}) {
1654 2 50       7 unless ($keyExt eq 'none') {
1655             # extend crypt key using object and generation number
1656 2 50       20 unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
1657 0         0 $$cryptInfo{_error} = 'Invalid object reference for encryption';
1658 0         0 return;
1659             }
1660 2         19 $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
1661             }
1662             # add AES-128 salt if necessary (this little gem is conveniently
1663             # omitted from the Adobe PDF 1.6 documentation, causing me to
1664             # waste 12 hours trying to figure out why this wasn't working --
1665             # it appears in ISO32000 though, so I should have been using that)
1666 2 100       9 $key .= 'sAlT' if $$cryptInfo{_aesv2};
1667 2         7 my $len = length($key);
1668 2         10 $key = Digest::MD5::md5($key); # get 16-byte MD5 digest
1669 2 100       11 $key = substr($key, 0, $len) if $len < 16; # trim if necessary
1670             }
1671             # perform the decryption/encryption
1672 4 100 100     27 if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
1673 3         24 require Image::ExifTool::AES;
1674 3         12 my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
1675 3 50       15 $err and $$cryptInfo{_error} = $err;
1676             } else {
1677 1         4 RC4Crypt($dataPt, $key);
1678             }
1679             }
1680              
1681             #------------------------------------------------------------------------------
1682             # Decrypt/Encrypt stream data
1683             # Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
1684             sub CryptStream($$)
1685             {
1686 52 50   52 0 145 return unless $cryptStream;
1687 0         0 my ($dict, $keyExt) = @_;
1688 0   0     0 my $type = $$dict{Type} || '';
1689             # XRef streams are not encrypted (ref 3, page 50),
1690             # and Metadata may or may not be encrypted
1691 0 0 0     0 if ($cryptInfo and $type ne '/XRef' and
      0        
      0        
1692             ($$cryptInfo{_meta} or $type ne '/Metadata'))
1693             {
1694 0         0 Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
1695             # toggle _decrypted flag
1696 0 0       0 $$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
1697             } else {
1698 0         0 $$dict{_decrypted} = 0; # stream should never be encrypted
1699             }
1700             }
1701              
1702             #------------------------------------------------------------------------------
1703             # Generate a new PDF tag (based on its ID) and add it to a tag table
1704             # Inputs: 0) tag table ref, 1) tag ID
1705             # Returns: tag info ref
1706             sub NewPDFTag($$)
1707             {
1708 0     0 0 0 my ($tagTablePtr, $tag) = @_;
1709 0         0 my $name = $tag;
1710             # translate URL-like escape sequences
1711 0         0 $name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
  0         0  
1712 0         0 $name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
1713 0         0 $name =~ s/(^|_)([a-z])/\U$2/g; # start words with upper case
1714 0         0 my $tagInfo = { Name => $name };
1715 0         0 AddTagToTable($tagTablePtr, $tag, $tagInfo);
1716 0         0 return $tagInfo;
1717             }
1718              
1719             #------------------------------------------------------------------------------
1720             # Process AcroForm dictionary to set HasXMLFormsArchitecture flag
1721             # Inputs: Same as ProcessDict
1722             sub ProcessAcroForm($$$$;$$)
1723             {
1724 0     0 0 0 my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1725 0 0       0 $et->HandleTag($tagTablePtr, '_has_xfa', $$dict{XFA} ? 'true' : 'false');
1726 0         0 return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
1727             }
1728              
1729             #------------------------------------------------------------------------------
1730             # Expand array into a string
1731             # Inputs: 0) array ref
1732             # Return: string
1733             sub ExpandArray($)
1734             {
1735 0     0 0 0 my $val = shift;
1736 0         0 my @list = @$val;
1737 0         0 foreach (@list) {
1738 0 0       0 ref $_ eq 'SCALAR' and $_ = "ref($$_)", next;
1739 0 0       0 ref $_ eq 'ARRAY' and $_ = ExpandArray($_), next;
1740 0 0       0 defined $_ or $_ = '', next;
1741             }
1742 0         0 return '[' . join(',',@list) . ']';
1743             }
1744              
1745             #------------------------------------------------------------------------------
1746             # Process PDF dictionary extract tag values
1747             # Inputs: 0) ExifTool object reference, 1) tag table reference
1748             # 2) dictionary reference, 3) cross-reference table reference,
1749             # 4) nesting depth, 5) dictionary capture type
1750             sub ProcessDict($$$$;$$)
1751             {
1752 350     350 0 944 my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1753 350         1083 my $verbose = $et->Options('Verbose');
1754 350         864 my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
1755 350   33     1068 my $embedded = (defined $unknown and not $unknown and $et->Options('ExtractEmbedded'));
1756 350         532 my @tags = @{$$dict{_tags}};
  350         1360  
1757 350         669 my ($next, %join);
1758 350         543 my $index = 0;
1759              
1760 350   100     1019 $nesting = ($nesting || 0) + 1;
1761 350 50       778 if ($nesting > 50) {
1762 0         0 $et->WarnOnce('Nesting too deep (directory ignored)');
1763 0         0 return;
1764             }
1765             # save entire dictionary for rewriting if specified
1766 350 50 100     1376 if ($$et{PDF_CAPTURE} and $$tagTablePtr{VARS} and
      66        
1767             $tagTablePtr->{VARS}->{CAPTURE})
1768             {
1769 66         114 my $name;
1770 66         97 foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
  66         204  
1771 82 100       228 next if $$et{PDF_CAPTURE}{$name};
1772             # make sure we load the right type if indicated
1773 66 50 66     274 next if $type and $type ne $name;
1774 66         196 $$et{PDF_CAPTURE}{$name} = $dict;
1775 66         121 last;
1776             }
1777             }
1778             #
1779             # extract information from all tags in the dictionary
1780             #
1781 350         481 for (;;) {
1782 1473         2120 my ($tag, $isSubDoc);
1783 1473 100 33     2982 if (@tags) {
    50          
1784 1123         2166 $tag = shift @tags;
1785             } elsif (defined $next and not $next) {
1786 0         0 $tag = 'Next';
1787 0         0 $next = 1;
1788             } else {
1789 350         951 last;
1790             }
1791 1123         2408 my $val = $$dict{$tag};
1792 1123         2669 my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag);
1793 1123 100 33     3170 if ($tagInfo) {
    50 33        
      0        
      0        
1794 419 50       975 undef $tagInfo if $$tagInfo{NoProcess};
1795             } elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
1796             $$tagTablePtr{$1} and (ref $val ne 'SCALAR' or not $fetched{$$val}))
1797             {
1798 0         0 my ($name, $num) = ($1, $2);
1799 0         0 $tagInfo = $et->GetTagInfo($tagTablePtr, $name);
1800 0 0 0     0 if (ref $tagInfo eq 'HASH' and $$tagInfo{JoinStreams}) {
1801 0         0 $fetched{$$val} = 1;
1802 0         0 my $obj = FetchObject($et, $$val, $xref, $tag);
1803 0 0       0 $join{$name} = [] unless $join{$name};
1804 0 0 0     0 next unless ref $obj eq 'HASH' and $$obj{_stream};
1805             # save all the stream data to join later
1806 0         0 DecodeStream($et, $obj);
1807 0         0 $join{$name}->[$num] = $$obj{_stream};
1808 0         0 undef $tagInfo; # don't process
1809             } else {
1810 0         0 $isSubDoc = 1; # treat as a sub-document
1811             }
1812             }
1813 1123 50       2248 if ($verbose) {
1814 0         0 my ($val2, $extra);
1815 0 0       0 if (ref $val eq 'SCALAR') {
    0          
    0          
1816 0         0 $extra = ", indirect object ($$val)";
1817 0 0 0     0 if ($fetched{$$val}) {
    0          
1818 0         0 $val2 = "ref($$val)";
1819             } elsif ($tag eq 'Next' and not $next) {
1820             # handle 'Next' links after all others
1821 0         0 $next = 0;
1822 0         0 next;
1823             } else {
1824 0         0 $fetched{$$val} = 1;
1825 0         0 $val = FetchObject($et, $$val, $xref, $tag);
1826 0 0       0 unless (defined $val) {
1827 0         0 my $str;
1828 0 0       0 if (defined $lastOffset) {
1829 0         0 $val2 = '';
1830 0         0 $str = 'Object was freed';
1831             } else {
1832 0         0 $val2 = '';
1833 0         0 $str = 'Error reading object';
1834             }
1835 0         0 $et->VPrint(0, "$$et{INDENT}${str}:\n");
1836             }
1837             }
1838             } elsif (ref $val eq 'HASH') {
1839 0         0 $extra = ', direct dictionary';
1840             } elsif (ref $val eq 'ARRAY') {
1841 0         0 $extra = ', direct array of ' . scalar(@$val) . ' objects';
1842             } else {
1843 0         0 $extra = ', direct object';
1844             }
1845 0         0 my $isSubdir;
1846 0 0       0 if (ref $val eq 'HASH') {
    0          
1847 0         0 $isSubdir = 1;
1848             } elsif (ref $val eq 'ARRAY') {
1849             # recurse into objects in arrays only if they are lists of
1850             # dictionaries or indirect objects which could be dictionaries
1851 0 0       0 $isSubdir = 1 if @$val;
1852 0         0 foreach (@$val) {
1853 0 0 0     0 next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
1854 0         0 undef $isSubdir;
1855 0         0 last;
1856             }
1857             }
1858 0 0       0 if ($isSubdir) {
1859             # create bogus subdirectory to recurse into this dict
1860 0 0       0 $tagInfo or $tagInfo = {
1861             Name => $tag,
1862             SubDirectory => { TagTable => 'Image::ExifTool::PDF::Unknown' },
1863             };
1864             } else {
1865 0 0       0 $val2 = ExpandArray($val) if ref $val eq 'ARRAY';
1866             # generate tag info if we will use it later
1867 0 0 0     0 if (not $tagInfo and defined $val and $unknown) {
      0        
1868 0         0 $tagInfo = NewPDFTag($tagTablePtr, $tag);
1869             }
1870             }
1871 0   0     0 $et->VerboseInfo($tag, $tagInfo,
1872             Value => $val2 || $val,
1873             Extra => $extra,
1874             Index => $index++,
1875             );
1876 0 0       0 next unless defined $val;
1877             }
1878 1123 100       2111 unless ($tagInfo) {
1879             # add any tag found in Info dictionary to table
1880 704 50       1424 next unless $unknown;
1881 0         0 $tagInfo = NewPDFTag($tagTablePtr, $tag);
1882             }
1883             # increment document number if necessary
1884 419         818 my ($oldDocNum, $oldNumTags);
1885 419 50       881 if ($isSubDoc) {
1886 0         0 $oldDocNum = $$et{DOC_NUM};
1887 0         0 $oldNumTags = $$et{NUM_FOUND};
1888 0         0 $$et{DOC_NUM} = ++$$et{DOC_COUNT};
1889             }
1890 419 100       1040 if ($$tagInfo{SubDirectory}) {
1891             # process the subdirectory
1892 332         543 my @subDicts;
1893 332 100       748 if (ref $val eq 'ARRAY') {
1894             # hack to convert array to dictionary if necessary
1895 37 50 33     195 if ($$tagInfo{ConvertToDict} and @$val == 2 and not ref $$val[0]) {
      33        
1896 0         0 my $tg = $$val[0];
1897 0         0 $tg =~ s(^/)(); # remove name
1898 0         0 my %dict = ( _tags => [ $tg ], $tg => $$val[1] );
1899 0         0 @subDicts = ( \%dict );
1900             } else {
1901 37         58 @subDicts = @{$val};
  37         131  
1902             }
1903             } else {
1904 295         566 @subDicts = ( $val );
1905             }
1906             # loop through all values of this tag
1907 332         477 for (;;) {
1908 664 100       1559 my $subDict = shift @subDicts or last;
1909             # save last fetched object in case we fetch another one here
1910 332         648 my $prevFetched = $lastFetched;
1911 332 100       989 if (ref $subDict eq 'SCALAR') {
1912             # only fetch once (other copies are obsolete)
1913 244 100       673 next if $fetched{$$subDict};
1914 197 100       522 if ($$tagInfo{IgnoreDuplicates}) {
1915 28         103 my $flag = "ProcessedPDF_$tag";
1916 28 50       97 if ($$et{$flag}) {
1917 0 0       0 next if $et->WarnOnce("Ignored duplicate $tag dictionary", 2);
1918             } else {
1919 28         101 $$et{$flag} = 1;
1920             }
1921             }
1922             # load dictionary via an indirect reference
1923 197         457 $fetched{$$subDict} = 1;
1924 197         521 my $obj = FetchObject($et, $$subDict, $xref, $tag);
1925 197 100       654 unless (defined $obj) {
1926 5 50       28 unless (defined $lastOffset) {
1927 0         0 $et->Warn("Error reading $tag object ($$subDict)");
1928             }
1929 5         15 next;
1930             }
1931 192         357 $subDict = $obj;
1932             }
1933 280 50       774 if (ref $subDict eq 'ARRAY') {
1934             # convert array of key/value pairs to a hash
1935 0 0       0 next if @$subDict < 2;
1936 0         0 my %hash = ( _tags => [] );
1937 0         0 while (@$subDict >= 2) {
1938 0         0 my $key = shift @$subDict;
1939 0         0 $key =~ s/^\///;
1940 0         0 push @{$hash{_tags}}, $key;
  0         0  
1941 0         0 $hash{$key} = shift @$subDict;
1942             }
1943 0         0 $subDict = \%hash;
1944             } else {
1945 280 50       712 next unless ref $subDict eq 'HASH';
1946             }
1947             # set flag to re-crypt all strings when rewriting if the dictionary
1948             # came from an encrypted stream
1949 280 50       616 $$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
1950 280         1011 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
1951 280 50       704 if (not $verbose) {
    0          
1952 280   50     1132 my $proc = $$subTablePtr{PROCESS_PROC} || \&ProcessDict;
1953 280         897 &$proc($et, $subTablePtr, $subDict, $xref, $nesting);
1954             } elsif ($next) {
1955             # handle 'Next' links at this level to avoid deep recursion
1956 0         0 undef $next;
1957 0         0 $index = 0;
1958 0         0 $tagTablePtr = $subTablePtr;
1959 0         0 $dict = $subDict;
1960 0         0 @tags = @{$$subDict{_tags}};
  0         0  
1961 0         0 $et->VerboseDir($tag, scalar(@tags));
1962             } else {
1963 0         0 my $oldIndent = $$et{INDENT};
1964 0         0 my $oldDir = $$et{DIR_NAME};
1965 0         0 $$et{INDENT} .= '| ';
1966 0         0 $$et{DIR_NAME} = $tag;
1967 0         0 $et->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
  0         0  
1968 0         0 ProcessDict($et, $subTablePtr, $subDict, $xref, $nesting);
1969 0         0 $$et{INDENT} = $oldIndent;
1970 0         0 $$et{DIR_NAME} = $oldDir;
1971             }
1972 280         1548 $lastFetched = $prevFetched;
1973             }
1974             } else {
1975             # fetch object if necessary
1976             # (OS X 10.6 writes indirect objects in the Info dictionary!)
1977 87 50       247 if (ref $val eq 'SCALAR') {
1978 0         0 my $prevFetched = $lastFetched;
1979             # (note: fetching the same object multiple times is OK here)
1980 0         0 $val = FetchObject($et, $$val, $xref, $tag);
1981 0 0       0 if (defined $val) {
1982 0         0 $val = ReadPDFValue($val);
1983             # set flag to re-encrypt if necessary if rewritten
1984 0 0       0 $$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
    0          
1985 0         0 $lastFetched = $prevFetched; # restore last fetched object reference
1986             }
1987             } else {
1988 87         262 $val = ReadPDFValue($val);
1989             }
1990 87 100       279 if (ref $val) {
    50          
1991 12 50       58 if (ref $val eq 'ARRAY') {
1992 12 50       66 delete $$et{LIST_TAGS}{$tagInfo} if $$tagInfo{List};
1993 12         21 my $v;
1994 12         77 foreach $v (@$val) {
1995 20         63 $et->FoundTag($tagInfo, $v);
1996             }
1997             }
1998             } elsif (defined $val) {
1999             # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
2000             # unless this is binary data (hex-encoded strings would not have been converted)
2001 75   100     519 my $format = $$tagInfo{Format} || $$tagInfo{Writable} || 'string';
2002 75 100       232 $val = ConvertPDFDate($val) if $format eq 'date';
2003 75 50 33     428 if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
2004             # text string is already in Unicode if it starts with "\xfe\xff",
2005             # otherwise we must first convert from PDFDocEncoding
2006 0 0       0 $val = $et->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
2007             }
2008 75 100 66     291 if ($$tagInfo{List} and not $$et{OPTIONS}{NoPDFList}) {
2009             # separate tokens in comma or whitespace delimited lists
2010 12 50       112 my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val;
2011 12         36 foreach $val (@values) {
2012 28         88 $et->FoundTag($tagInfo, $val);
2013             }
2014             } else {
2015             # a simple tag value
2016 63         196 $et->FoundTag($tagInfo, $val);
2017             }
2018             }
2019             }
2020 419 50       1135 if ($isSubDoc) {
2021             # restore original document number
2022 0         0 $$et{DOC_NUM} = $oldDocNum;
2023 0 0       0 --$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2024             }
2025             }
2026             #
2027             # extract information from joined streams if necessary
2028             #
2029              
2030 350 50       800 if (%join) {
2031 0         0 my ($tag, $i);
2032 0         0 foreach $tag (sort keys %join) {
2033 0         0 my $list = $join{$tag};
2034 0 0 0     0 last unless defined $$list[1] and $$list[1] =~ /^%.*?([\x0d\x0a]*)/;
2035 0         0 my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
2036 0         0 for ($i=1; defined $$list[$i]; ++$i) {
2037 0         0 $buff .= $$list[$i];
2038 0         0 undef $$list[$i]; # free memory
2039             }
2040             # increment document number for tags extracted from embedded EPS
2041 0         0 my $oldDocNum = $$et{DOC_NUM};
2042 0         0 my $oldNumTags = $$et{NUM_FOUND};
2043 0         0 $$et{DOC_NUM} = ++$$et{DOC_COUNT};
2044             # extract PostScript information
2045 0         0 $et->HandleTag($tagTablePtr, $tag, $buff);
2046 0         0 $$et{DOC_NUM} = $oldDocNum;
2047             # revert document counter if we didn't add any new tags
2048 0 0       0 --$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
2049 0         0 delete $$et{DOC_NUM};
2050             }
2051             }
2052             #
2053             # extract information from stream object if it exists (eg. Metadata stream)
2054             #
2055 350         508 for (;;) { # (cheap goto)
2056 350 100       1344 last unless $$dict{_stream};
2057 43         125 my $tag = '_stream';
2058             # add Subtype (if it exists) to stream name and remove leading '/'
2059 43 100       289 ($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
2060 43 50       164 last unless $$tagTablePtr{$tag};
2061 43 50       126 my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag) or last;
2062 43 50       174 unless ($$tagInfo{SubDirectory}) {
2063             # don't build filter lists across different images
2064 0         0 delete $$et{LIST_TAGS}{$$tagTablePtr{Filter}};
2065             # we arrive here only when extracting embedded images
2066             # - only extract known image types and ignore others
2067 0   0     0 my $filter = $$dict{Filter} || '';
2068 0 0       0 $filter = @$filter[-1] if ref $filter eq 'ARRAY'; # (get last Filter type)
2069 0         0 my $result;
2070 0 0 0     0 if ($filter eq '/DCTDecode' or $filter eq '/JPXDecode') {
2071 0 0       0 DecodeStream($et, $dict) or last;
2072             # save the image itself
2073 0         0 $et->FoundTag($tagInfo, \$$dict{_stream});
2074             # extract information from embedded image
2075 0         0 $result = $et->ExtractInfo(\$$dict{_stream}, { ReEntry => 1 });
2076             }
2077 0 0       0 unless ($result) {
2078 0 0       0 $et->FoundTag('FileType', defined $result ? '(unknown)' : '(unsupported)');
2079             }
2080 0         0 last;
2081             }
2082             # decode stream if necessary
2083 43 50       187 DecodeStream($et, $dict) or last;
2084 43 50       185 if ($verbose > 2) {
2085 0         0 $et->VPrint(2,"$$et{INDENT}$$et{DIR_NAME} stream data\n");
2086 0         0 $et->VerboseDump(\$$dict{_stream});
2087             }
2088             # extract information from stream
2089             my %dirInfo = (
2090             DataPt => \$$dict{_stream},
2091             DataLen => length $$dict{_stream},
2092             DirStart => 0,
2093             DirLen => length $$dict{_stream},
2094 43         398 Parent => 'PDF',
2095             );
2096 43         196 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
2097 43 50       268 unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
2098 0         0 $et->Warn("Error processing $$tagInfo{Name} information");
2099             }
2100 43         275 last;
2101             }
2102             }
2103              
2104             #------------------------------------------------------------------------------
2105             # Extract information from PDF file
2106             # Inputs: 0) ExifTool object reference, 1) dirInfo reference
2107             # Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
2108             sub ReadPDF($$)
2109             {
2110 37     37 0 116 my ($et, $dirInfo) = @_;
2111 37         91 my $raf = $$dirInfo{RAF};
2112 37         142 my $verbose = $et->Options('Verbose');
2113 37         93 my ($buff, $encrypt, $id);
2114             #
2115             # validate PDF file
2116             #
2117             # (linearization dictionary must be in the first 1024 bytes of the file)
2118 37 50       117 $raf->Read($buff, 1024) >= 8 or return 0;
2119 37 50       357 $buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
2120 37 50       227 $$et{PDFBase} = length $1 and $et->Warn('PDF header is not at start of file',1);
2121 37         95 $pdfVer = $2;
2122 37         204 $et->SetFileType(); # set the FileType tag
2123 37 50       252 $et->Warn("The PDF $pdfVer specification is held hostage by the ISO") if $pdfVer >= 2.0;
2124             # store PDFVersion tag
2125 37         120 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
2126 37         210 $et->HandleTag($tagTablePtr, 'Version', $pdfVer);
2127 37         116 $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
2128             #
2129             # check for a linearized PDF (only if reading)
2130             #
2131 37         100 my $capture = $$et{PDF_CAPTURE};
2132 37 100       115 unless ($capture) {
2133 18         54 my $lin = 'false';
2134 18 50       108 if ($buff =~ /<
2135 18         82 $buff = substr($buff, pos($buff) - 2);
2136 18         73 my $dict = ExtractObject($et, \$buff);
2137 18 0 33     159 if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
      33        
2138 0 0       0 if (not $$et{VALUE}{FileSize}) {
    0          
2139 0         0 undef $lin; # can't determine if it is linearized
2140             } elsif ($$dict{L} == $$et{VALUE}{FileSize} - $$et{PDFBase}) {
2141 0         0 $lin = 'true';
2142             }
2143             }
2144             }
2145 18 50       103 $et->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
2146             }
2147             #
2148             # read the xref tables referenced from startxref at the end of the file
2149             #
2150 37         90 my @xrefOffsets;
2151 37 50       249 $raf->Seek(0, 2) or return -2;
2152             # the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
2153 37         197 my $len = $raf->Tell();
2154 37 50       122 $len = 1024 if $len > 1024;
2155 37 50       107 $raf->Seek(-$len, 2) or return -2;
2156 37 50       167 $raf->Read($buff, $len) == $len or return -3;
2157             # find the LAST xref table in the file (may be multiple %%EOF marks,
2158             # and comments between "startxref" and "%%EOF")
2159 37 50       453 $buff =~ /^.*startxref(\s+)(\d+)(\s+)(%[^\x0d\x0a]*\s+)*%%EOF/s or return -4;
2160 37         179 my $ws = $1 . $3;
2161 37         96 my $xr = $2;
2162 37         181 push @xrefOffsets, $xr, 'Main';
2163             # set input record separator
2164 37 50       385 local $/ = $ws =~ /(\x0d\x0a|\x0d|\x0a)/ ? $1 : "\x0a";
2165 37         89 my (%xref, @mainDicts, %loaded, $mainFree);
2166 37         91 my ($xrefSize, $mainDictSize) = (0, 0);
2167             # initialize variables to capture when rewriting
2168 37 100       107 if ($capture) {
2169 19         79 $capture->{startxref} = $xr;
2170 19         75 $capture->{xref} = \%xref;
2171 19         73 $capture->{newline} = $/;
2172 19         54 $capture->{mainFree} = $mainFree = { };
2173             }
2174             XRef:
2175 37         141 while (@xrefOffsets) {
2176 70         145 my $offset = shift @xrefOffsets;
2177 70         121 my $type = shift @xrefOffsets;
2178 70 50       179 next if $loaded{$offset}; # avoid infinite recursion
2179 70 50       300 unless ($raf->Seek($offset+$$et{PDFBase}, 0)) {
2180 0 0       0 %loaded or return -5;
2181 0         0 $et->Warn('Bad offset for secondary xref table');
2182 0         0 next;
2183             }
2184             # Note: care must be taken because ReadLine may read more than we want if
2185             # the newline sequence for this table is different than the rest of the file
2186 70         170 for (;;) {
2187 70 50       258 unless ($raf->ReadLine($buff)) {
2188 0 0       0 %loaded or return -6;
2189 0         0 $et->Warn('Bad offset for secondary xref table');
2190 0         0 next XRef;
2191             }
2192 70 50       416 last if $buff =~/\S/; # skip blank lines
2193             }
2194 70         127 my $loadXRefStream;
2195 70 50       465 if ($buff =~ s/^\s*xref\s+//s) {
    0          
2196             # load xref table
2197 70         133 for (;;) {
2198             # read another line if necessary (skipping blank lines)
2199 177   50     670 $raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
2200 177 100       725 last if $buff =~ s/^\s*trailer([\s<[(])/$1/s;
2201 107 50       505 $buff =~ s/^\s*(\d+)\s+(\d+)\s+//s or return -4;
2202 107         340 my ($start, $num) = ($1, $2);
2203 107 50       303 $raf->Seek(-length($buff), 1) or return -4;
2204 107         213 my $i;
2205 107         406 for ($i=0; $i<$num; ++$i) {
2206 622 50       1516 $raf->Read($buff, 20) == 20 or return -6;
2207 622 50       2482 $buff =~ /^\s*(\d{10}) (\d{5}) (f|n)/s or return -4;
2208 622         1059 my $num = $start + $i;
2209 622 100       1191 $xrefSize = $num if $num > $xrefSize;
2210             # locate object to generate entry from stream if necessary
2211             # (must do this before we test $xref{$num})
2212 622 50       1218 LocateAnyObject(\%xref, $num) if $xref{dicts};
2213             # save offset for newest copy of all objects
2214             # (or next object number for free objects)
2215 622 100       1264 unless (defined $xref{$num}) {
2216 526         1542 my ($offset, $gen) = (int($1), int($2));
2217 526         1088 $xref{$num} = $offset;
2218 526 100       1101 if ($3 eq 'f') {
2219             # save free objects in last xref table for rewriting
2220 52 100       185 $$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
2221 52         163 next;
2222             }
2223             # also save offset keyed by object reference string
2224 474         1695 $xref{"$num $gen R"} = $offset;
2225             }
2226             }
2227             # (I have a sample from Adobe which has an empty xref table)
2228             # %xref or return -4; # xref table may not be empty
2229 107         255 $buff = '';
2230             }
2231 70         181 undef $mainFree; # only do this for the last xref table
2232             } elsif ($buff =~ s/^\s*(\d+)\s+(\d+)\s+obj//s) {
2233             # this is a PDF-1.5 cross-reference stream dictionary
2234 0         0 $loadXRefStream = 1;
2235             } else {
2236 0 0       0 %loaded or return -4;
2237 0         0 $et->Warn('Invalid secondary xref table');
2238 0         0 next;
2239             }
2240 70         293 my $mainDict = ExtractObject($et, \$buff, $raf, \%xref);
2241 70 50       235 unless (ref $mainDict eq 'HASH') {
2242 0 0       0 %loaded or return -8;
2243 0         0 $et->Warn('Error loading secondary dictionary');
2244 0         0 next;
2245             }
2246             # keep track of total trailer dictionary Size
2247 70 100 66     423 $mainDictSize = $$mainDict{Size} if $$mainDict{Size} and $$mainDict{Size} > $mainDictSize;
2248 70 50       182 if ($loadXRefStream) {
2249             # decode and save our XRef stream from PDF-1.5 file
2250             # (but parse it later as required to save time)
2251             # Note: this technique can potentially result in an old object
2252             # being used if the file was incrementally updated and an older
2253             # object from an xref table was replaced by a newer object in an
2254             # xref stream. But doing so isn't a good idea (if allowed at all)
2255             # because a PDF 1.4 consumer would also make this same mistake.
2256 0 0 0     0 if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
      0        
      0        
      0        
2257 0         0 @{$$mainDict{W}} > 2 and $$mainDict{Size} and
2258             DecodeStream($et, $mainDict))
2259             {
2260             # create Index entry if it doesn't exist
2261 0 0       0 $$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
2262             # create '_entry_size' entry for internal use
2263 0         0 my $w = $$mainDict{W};
2264 0         0 my $size = 0;
2265 0         0 foreach (@$w) { $size += $_; }
  0         0  
2266 0         0 $$mainDict{_entry_size} = $size;
2267             # save this stream dictionary to use later if required
2268 0 0       0 $xref{dicts} = [] unless $xref{dicts};
2269 0         0 push @{$xref{dicts}}, $mainDict;
  0         0  
2270             } else {
2271 0 0       0 %loaded or return -9;
2272 0         0 $et->Warn('Invalid xref stream in secondary dictionary');
2273             }
2274             }
2275 70         186 $loaded{$offset} = 1;
2276             # load XRef stream in hybrid file if it exists
2277 70 50       187 push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
2278 70 50       198 $encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
2279 70 50 33     162 undef $encrypt if $encrypt and $encrypt eq 'null'; # (have seen "null")
2280 70 100 66     256 if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
2281 29         77 $id = ReadPDFValue($mainDict->{ID}->[0]);
2282             }
2283 70         171 push @mainDicts, $mainDict, $type;
2284             # load previous xref table if it exists
2285 70 100       311 push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
2286             }
2287 37 50       136 if ($xrefSize > $mainDictSize) {
2288 0         0 my $str = "Objects in xref table ($xrefSize) exceed trailer dictionary Size ($mainDictSize)";
2289 0 0       0 $capture ? $et->Error($str) : $et->Warn($str);
2290             }
2291             #
2292             # extract encryption information if necessary
2293             #
2294 37 50       127 if ($encrypt) {
2295 0 0       0 if (ref $encrypt eq 'SCALAR') {
2296 0         0 $encrypt = FetchObject($et, $$encrypt, \%xref, 'Encrypt');
2297             }
2298             # generate Encryption tag information
2299 0         0 my $err = DecryptInit($et, $encrypt, $id);
2300 0 0       0 if ($err) {
2301 0         0 $et->Warn($err);
2302 0 0       0 $$capture{Error} = $err if $capture;
2303 0         0 return -1;
2304             }
2305             }
2306             #
2307             # extract the information beginning with each of the main dictionaries
2308             #
2309 37         79 my $i = 0;
2310 37         127 my $num = (scalar @mainDicts) / 2;
2311 37         92 while (@mainDicts) {
2312 70         141 my $dict = shift @mainDicts;
2313 70         127 my $type = shift @mainDicts;
2314 70 50       182 if ($verbose) {
2315 0         0 ++$i;
2316 0         0 my $n = scalar(@{$$dict{_tags}});
  0         0  
2317 0         0 $et->VPrint(0, "PDF dictionary ($i of $num) with $n entries:\n");
2318             }
2319 70         261 ProcessDict($et, $tagTablePtr, $dict, \%xref, 0, $type);
2320             }
2321             # handle any decryption errors
2322 37 50       135 if ($encrypt) {
2323 0         0 my $err = $$encrypt{_error};
2324 0 0       0 if ($err) {
2325 0         0 $et->Warn($err);
2326 0 0       0 $$capture{Error} = $err if $capture;
2327 0         0 return -1;
2328             }
2329             }
2330 37         451 return 1;
2331             }
2332              
2333             #------------------------------------------------------------------------------
2334             # ReadPDF() warning strings for each error return value
2335             my %pdfWarning = (
2336             # -1 is reserved as error return value with no associated warning
2337             -2 => 'Error seeking in file',
2338             -3 => 'Error reading file',
2339             -4 => 'Invalid xref table',
2340             -5 => 'Invalid xref offset',
2341             -6 => 'Error reading xref table',
2342             -7 => 'Error reading trailer',
2343             -8 => 'Error reading main dictionary',
2344             -9 => 'Invalid xref stream in main dictionary',
2345             );
2346              
2347             #------------------------------------------------------------------------------
2348             # Extract information from PDF file
2349             # Inputs: 0) ExifTool object reference, 1) dirInfo reference
2350             # Returns: 1 if this was a valid PDF file
2351             sub ProcessPDF($$)
2352             {
2353 37     37 0 129 my ($et, $dirInfo) = @_;
2354              
2355 37         85 undef $cryptInfo; # (must not delete after returning so writer can use it)
2356 37         79 undef $cryptStream;
2357 37         53 undef $cryptString;
2358 37         161 my $result = ReadPDF($et, $dirInfo);
2359 37 50       125 if ($result < 0) {
2360 0 0       0 $et->Warn($pdfWarning{$result}) if $pdfWarning{$result};
2361 0         0 $result = 1;
2362             }
2363             # clean up and return
2364 37         83 undef %streamObjs;
2365 37         112 undef %fetched;
2366 37         119 return $result;
2367             }
2368              
2369             1; # end
2370              
2371              
2372             __END__