File Coverage

blib/lib/PDF/API2/Basic/PDF/File.pm
Criterion Covered Total %
statement 542 717 75.5
branch 187 322 58.0
condition 41 90 45.5
subroutine 45 48 93.7
pod 27 27 100.0
total 842 1204 69.9


line stmt bran cond sub pod time code
1             # Code in the PDF::API2::Basic::PDF namespace was originally copied from the
2             # Text::PDF distribution.
3             #
4             # Copyright Martin Hosken
5             #
6             # Martin Hosken's code may be used under the terms of the MIT license.
7             # Subsequent versions of the code have the same license as PDF::API2.
8              
9             package PDF::API2::Basic::PDF::File;
10              
11 39     39   69274 use strict;
  39         108  
  39         2695  
12              
13             our $VERSION = '2.044'; # VERSION
14              
15             =head1 NAME
16              
17             PDF::API2::Basic::PDF::File - Low-level PDF file access
18              
19             =head1 SYNOPSIS
20              
21             $p = PDF::API2::Basic::PDF::File->open("filename.pdf", 1);
22             $p->new_obj($obj_ref);
23             $p->free_obj($obj_ref);
24             $p->append_file;
25             $p->close_file;
26             $p->release; # IMPORTANT!
27              
28             =head1 DESCRIPTION
29              
30             This class keeps track of the directory aspects of a PDF file. There are two
31             parts to the directory: the main directory object which is the parent to all
32             other objects and a chain of cross-reference tables and corresponding trailer
33             dictionaries starting with the main directory object.
34              
35             =head1 INSTANCE VARIABLES
36              
37             Within this class hierarchy, rather than making everything visible via methods,
38             which would be a lot of work, there are various instance variables which are
39             accessible via associative array referencing. To distinguish instance variables
40             from content variables (which may come from the PDF content itself), each such
41             variable will start with a space.
42              
43             Variables which do not start with a space directly reflect elements in a PDF
44             dictionary. In the case of a PDF::API2::Basic::PDF::File, the elements reflect those in the
45             trailer dictionary.
46              
47             Since some variables are not designed for class users to access, variables are
48             marked in the documentation with (R) to indicate that such an entry should only
49             be used as read-only information. (P) indicates that the information is private
50             and not designed for user use at all, but is included in the documentation for
51             completeness and to ensure that nobody else tries to use it.
52              
53             =over
54              
55             =item newroot
56              
57             This variable allows the user to create a new root entry to occur in the trailer
58             dictionary which is output when the file is written or appended. If you wish to
59             over-ride the root element in the dictionary you have, use this entry to indicate
60             that without losing the current Root entry. Notice that newroot should point to
61             a PDF level object and not just to a dictionary which does not have object status.
62              
63             =item INFILE (R)
64              
65             Contains the filehandle used to read this information into this PDF directory. Is
66             an IO object.
67              
68             =item fname (R)
69              
70             This is the filename which is reflected by INFILE, or the original IO object passed
71             in.
72              
73             =item update (R)
74              
75             This indicates that the read file has been opened for update and that at some
76             point, $p->appendfile() can be called to update the file with the changes that
77             have been made to the memory representation.
78              
79             =item maxobj (R)
80              
81             Contains the first usable object number above any that have already appeared
82             in the file so far.
83              
84             =item outlist (P)
85              
86             This is a list of Objind which are to be output when the next appendfile or outfile
87             occurs.
88              
89             =item firstfree (P)
90              
91             Contains the first free object in the free object list. Free objects are removed
92             from the front of the list and added to the end.
93              
94             =item lastfree (P)
95              
96             Contains the last free object in the free list. It may be the same as the firstfree
97             if there is only one free object.
98              
99             =item objcache (P)
100              
101             All objects are held in the cache to ensure that a system only has one occurrence of
102             each object. In effect, the objind class acts as a container type class to hold the
103             PDF object structure and it would be unfortunate if there were two identical
104             place-holders floating around a system.
105              
106             =item epos (P)
107              
108             The end location of the read-file.
109              
110             =back
111              
112             Each trailer dictionary contains a number of private instance variables which
113             hold the chain together.
114              
115             =over
116              
117             =item loc (P)
118              
119             Contains the location of the start of the cross-reference table preceding the
120             trailer.
121              
122             =item xref (P)
123              
124             Contains an anonymous array of each cross-reference table entry.
125              
126             =item prev (P)
127              
128             A reference to the previous table. Note this differs from the Prev entry which
129             is in PDF which contains the location of the previous cross-reference table.
130              
131             =back
132              
133             =head1 METHODS
134              
135             =cut
136              
137 39     39   273 use Scalar::Util qw(blessed weaken);
  39         97  
  39         2588  
138              
139 39     39   252 use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types);
  39         90  
  39         6263  
140              
141             $ws_char = '[ \t\r\n\f\0]';
142             $delim_char = '[][<>{}()/%]';
143             $reg_char = '[^][<>{}()/% \t\r\n\f\0]';
144             $irreg_char = '[][<>{}()/% \t\r\n\f\0]';
145             $cr = '\s*(?:\015|\012|(?:\015\012))';
146              
147             my $re_comment = qr/(?:\%[^\r\n]*)/;
148             my $re_whitespace = qr/(?:[ \t\r\n\f\0]|$re_comment)/;
149              
150             %types = (
151             'Page' => 'PDF::API2::Basic::PDF::Page',
152             'Pages' => 'PDF::API2::Basic::PDF::Pages',
153             );
154              
155             my $readDebug = 0;
156              
157 39     39   305 use Carp;
  39         107  
  39         2477  
158 39     39   822 use IO::File;
  39         8742  
  39         5487  
159              
160             # Now for the basic PDF types
161 39     39   745 use PDF::API2::Basic::PDF::Utils;
  39         108  
  39         2974  
162              
163 39     39   282 use PDF::API2::Basic::PDF::Array;
  39         101  
  39         1109  
164 39     39   245 use PDF::API2::Basic::PDF::Bool;
  39         88  
  39         982  
165 39     39   244 use PDF::API2::Basic::PDF::Dict;
  39         103  
  39         953  
166 39     39   231 use PDF::API2::Basic::PDF::Name;
  39         97  
  39         1000  
167 39     39   295 use PDF::API2::Basic::PDF::Number;
  39         138  
  39         1023  
168 39     39   213 use PDF::API2::Basic::PDF::Objind;
  39         93  
  39         1175  
169 39     39   235 use PDF::API2::Basic::PDF::String;
  39         92  
  39         944  
170 39     39   17421 use PDF::API2::Basic::PDF::Page;
  39         122  
  39         1320  
171 39     39   272 use PDF::API2::Basic::PDF::Pages;
  39         85  
  39         782  
172 39     39   198 use PDF::API2::Basic::PDF::Null;
  39         95  
  39         931  
173 39     39   198 use POSIX qw(ceil floor);
  39         165  
  39         252  
174              
175 39     39   3254 no warnings qw[ deprecated recursion uninitialized ];
  39         92  
  39         43240  
176              
177              
178             =head2 PDF::API2::Basic::PDF::File->new
179              
180             Creates a new, empty file object which can act as the host to other PDF objects.
181             Since there is no file associated with this object, it is assumed that the
182             object is created in readiness for creating a new PDF file.
183              
184             =cut
185              
186             sub new {
187 164     164 1 437 my ($class, $root) = @_;
188 164         545 my $self = $class->_new();
189              
190 164 50       504 unless ($root) {
191 164         570 $root = PDFDict();
192 164         527 $root->{'Type'} = PDFName('Catalog');
193             }
194 164         602 $self->new_obj($root);
195 164         388 $self->{'Root'} = $root;
196 164         593 return $self;
197             }
198              
199              
200             =head2 $p = PDF::API2::Basic::PDF::File->open($filename, $update)
201              
202             Opens the file and reads all the trailers and cross reference tables to build
203             a complete directory of objects.
204              
205             $update specifies whether this file is being opened for updating and editing,
206             or simply to be read.
207              
208             $filename may be an IO object
209              
210             =cut
211              
212             sub open {
213 16     16 1 59 my ($class, $filename, $update) = @_;
214 16         41 my ($fh, $buffer);
215              
216 16         63 my $self = $class->_new();
217 16 100       87 if (ref $filename) {
218 8         26 $self->{' INFILE'} = $filename;
219 8 50       25 if ($update) {
220 8         21 $self->{' update'} = 1;
221 8         29 $self->{' OUTFILE'} = $filename;
222             }
223 8         18 $fh = $filename;
224             }
225             else {
226 8 50       111 die "File '$filename' does not exist" unless -f $filename;
227 8 50       121 die "File '$filename' is not readable" unless -r $filename;
228 8 50       49 if ($update) {
229 8 50       109 die "File '$filename' is not writable" unless -w $filename;
230             }
231 8   50     121 $fh = IO::File->new(($update ? '+' : '') . "<$filename")
232             || die "Error opening '$filename': $!";
233 8         900 $self->{' INFILE'} = $fh;
234 8 50       33 if ($update) {
235 8         26 $self->{' update'} = 1;
236 8         33 $self->{' OUTFILE'} = $fh;
237 8         26 $self->{' fname'} = $filename;
238             }
239             }
240 16         109 binmode $fh, ':raw';
241 16         176 $fh->seek(0, 0); # go to start of file
242 16         266 $fh->read($buffer, 255);
243 16 50       1040 unless ($buffer =~ /^\%PDF\-([12]\.\d+)\s*$cr/m) {
244 0         0 croak "$filename does not appear to be a valid PDF";
245             }
246 16         104 $self->{' version'} = $1;
247              
248 16         92 $fh->seek(0, 2); # go to end of file
249 16         276 my $end = $fh->tell();
250 16         126 $self->{' epos'} = $end;
251 16         66 foreach my $offset (1..64) {
252 32         185 $fh->seek($end - 16 * $offset, 0);
253 32         363 $fh->read($buffer, 16 * $offset);
254 32 100       987 last if $buffer =~ m/startxref($cr|\s*)\d+($cr|\s*)\%\%eof.*?/i;
255             }
256 16 50       465 unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr|\s*)\%\%eof.*?/i) {
257 0         0 die "Malformed PDF file $filename";
258             }
259 16         74 my $xpos = $1;
260 16         53 $self->{' xref_position'} = $xpos;
261              
262 16         94 my $tdict = $self->readxrtr($xpos, $self);
263 16         89 foreach my $key (keys %$tdict) {
264 121         274 $self->{$key} = $tdict->{$key};
265             }
266 16         132 return $self;
267             }
268              
269             =head2 $p->version($version)
270              
271             Gets/sets the PDF version (e.g. 1.4)
272              
273             =cut
274              
275             sub version {
276 9     9 1 14 my $self = shift();
277              
278 9 100       21 if (@_) {
279 2         5 my $version = shift();
280 2 50       22 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
281 2         6 $self->header_version($version);
282 2 50       9 if ($version >= 1.4) {
283 2         5 $self->trailer_version($version);
284             }
285             else {
286 0         0 delete $self->{'Root'}->{'Version'};
287 0         0 $self->out_obj($self->{'Root'});
288             }
289 2         4 return $version;
290             }
291              
292 7         15 my $header_version = $self->header_version();
293 7         14 my $trailer_version = $self->trailer_version();
294 7 100       35 return $trailer_version if $trailer_version > $header_version;
295 3         15 return $header_version;
296             }
297              
298             =head2 $version = $p->header_version($version)
299              
300             Gets/sets the PDF version stored in the file header.
301              
302             =cut
303              
304             sub header_version {
305 14     14 1 31 my $self = shift();
306              
307 14 100       29 if (@_) {
308 5         10 my $version = shift();
309 5 50       28 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
310 5         12 $self->{' version'} = $version;
311             }
312              
313 14         33 return $self->{' version'};
314             }
315              
316             =head2 $version = $p->trailer_version($version)
317              
318             Gets/sets the PDF version stored in the document catalog.
319              
320             =cut
321              
322             sub trailer_version {
323 12     12 1 27 my $self = shift();
324              
325 12 100       25 if (@_) {
326 4         7 my $version = shift();
327 4 50       15 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
328 4         10 $self->{'Root'}->{'Version'} = PDFName($version);
329 4         15 $self->out_obj($self->{'Root'});
330 4         8 return $version;
331             }
332              
333 8 50       20 return unless $self->{'Root'}->{'Version'};
334 8         33 $self->{'Root'}->{'Version'}->realise();
335 8         23 return $self->{'Root'}->{'Version'}->val();
336             }
337              
338             =head2 $prev_version = $p->require_version($version)
339              
340             Ensures that the PDF version is at least C<$version>.
341              
342             =cut
343              
344             sub require_version {
345 3     3 1 712 my ($self, $min_version) = @_;
346 3         8 my $current_version = $self->version();
347 3 100       12 $self->version($min_version) if $current_version < $min_version;
348 3         8 return $current_version;
349             }
350              
351             =head2 $p->release()
352              
353             Releases ALL of the memory used by the PDF document and all of its
354             component objects. After calling this method, do B expect to
355             have anything left in the C object (so if
356             you need to save, be sure to do it before calling this method).
357              
358             B, that it is important that you call this method on any
359             C object when you wish to destruct it and
360             free up its memory. Internally, PDF files have an enormous number of
361             cross-references and this causes circular references within the
362             internal data structures. Calling 'C' forces a brute-force
363             cleanup of the data structures, freeing up all of the memory. Once
364             you've called this method, though, don't expect to be able to do
365             anything else with the C object; it'll
366             have B internal state whatsoever.
367              
368             =cut
369              
370             # Maintainer's Question: Couldn't this be handled by a DESTROY method
371             # instead of requiring an explicit call to release()?
372             sub release {
373 615     615 1 966 my $self = shift();
374              
375 615 50       1284 return $self unless ref($self);
376 615         1349 my @tofree = values %$self;
377              
378 615         1427 foreach my $key (keys %$self) {
379 2271         2993 $self->{$key} = undef;
380 2271         3624 delete $self->{$key};
381             }
382              
383             # PDFs with highly-interconnected page trees or outlines can hit Perl's
384             # recursion limit pretty easily, so disable the warning for this specific
385             # loop.
386 39     39   338 no warnings 'recursion';
  39         122  
  39         296566  
387              
388 615         2323 while (my $item = shift @tofree) {
389 7103 100 100     24799 if (blessed($item) and $item->can('release')) {
    100          
    100          
390 2115         4413 $item->release(1);
391             }
392             elsif (ref($item) eq 'ARRAY') {
393 1552         4394 push @tofree, @$item;
394             }
395             elsif (ref($item) eq 'HASH') {
396 877         2265 push @tofree, values %$item;
397 877         2182 foreach my $key (keys %$item) {
398 3822         4962 $item->{$key} = undef;
399 3822         6757 delete $item->{$key};
400             }
401             }
402             else {
403 2559         6142 $item = undef;
404             }
405             }
406             }
407              
408             =head2 $p->append_file()
409              
410             Appends the objects for output to the read file and then appends the appropriate table.
411              
412             =cut
413              
414             sub append_file {
415 3     3 1 7 my $self = shift();
416 3 50       13 return unless $self->{' update'};
417              
418 3         7 my $fh = $self->{' INFILE'};
419              
420 3         11 my $tdict = PDFDict();
421 3         10 $tdict->{'Prev'} = PDFNum($self->{' loc'});
422 3         8 $tdict->{'Info'} = $self->{'Info'};
423 3 50       11 if (defined $self->{' newroot'}) {
424 0         0 $tdict->{'Root'} = $self->{' newroot'};
425             }
426             else {
427 3         9 $tdict->{'Root'} = $self->{'Root'};
428             }
429 3         8 $tdict->{'Size'} = $self->{'Size'};
430              
431 3         18 foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) {
  54         118  
432 9 50       27 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
433             }
434              
435 3         22 $fh->seek($self->{' epos'}, 0);
436 3         32 $self->out_trailer($tdict, $self->{' update'});
437 3         43 close $self->{' OUTFILE'};
438             }
439              
440              
441             =head2 $p->out_file($fname)
442              
443             Writes a PDF file to a file of the given filename based on the current list of
444             objects to be output. It creates the trailer dictionary based on information
445             in $self.
446              
447             $fname may be an IO object;
448              
449             =cut
450              
451             sub out_file {
452 140     140 1 362 my ($self, $fname) = @_;
453              
454 140         477 $self->create_file($fname);
455 140         462 $self->close_file();
456 140         350 return $self;
457             }
458              
459              
460             =head2 $p->create_file($fname)
461              
462             Creates a new output file (no check is made of an existing open file) of
463             the given filename or IO object. Note, make sure that $p->{' version'} is set
464             correctly before calling this function.
465              
466             =cut
467              
468             sub create_file {
469 140     140 1 304 my ($self, $filename) = @_;
470 140         223 my $fh;
471              
472 140         318 $self->{' fname'} = $filename;
473 140 50       360 if (ref $filename) {
474 140         235 $fh = $filename;
475             }
476             else {
477 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
478 0         0 binmode($fh,':raw');
479             }
480              
481 140         333 $self->{' OUTFILE'} = $fh;
482 140   50     848 $fh->print('%PDF-' . ($self->{' version'} // '1.4') . "\n");
483 140         1228 $fh->print("%\xC6\xCD\xCD\xB5\n"); # and some binary stuff in a comment
484 140         789 return $self;
485             }
486              
487              
488             =head2 $p->clone_file($fname)
489              
490             Creates a copy of the input file at the specified filename and sets it as the
491             output file for future writes. A file handle may be passed instead of a
492             filename.
493              
494             =cut
495              
496             sub clone_file {
497 4     4 1 14 my ($self, $filename) = @_;
498 4         11 my $fh;
499              
500 4         16 $self->{' fname'} = $filename;
501 4 50       22 if (ref $filename) {
502 4         10 $fh = $filename;
503             }
504             else {
505 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
506 0         0 binmode($fh,':raw');
507             }
508              
509 4         9 $self->{' OUTFILE'} = $fh;
510              
511 4         11 my $in = $self->{' INFILE'};
512 4         20 $in->seek(0, 0);
513 4         62 my $data;
514 4         20 while (not $in->eof()) {
515 4         93 $in->read($data, 1024 * 1024);
516 4         136 $fh->print($data);
517             }
518 4         95 return $self;
519             }
520              
521             =head2 $p->close_file
522              
523             Closes up the open file for output by outputting the trailer etc.
524              
525             =cut
526              
527             sub close_file {
528 145     145 1 262 my $self = shift();
529              
530 145         424 my $tdict = PDFDict();
531 145 50       616 $tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'};
532 145 50 33     607 $tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'};
533              
534             # remove all freed objects from the outlist, AND the outlist_cache if not updating
535             # NO! Don't do that thing! In fact, let out_trailer do the opposite!
536              
537 145   66     538 $tdict->{'Size'} = $self->{'Size'} || PDFNum(1);
538 145 100       441 $tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'};
539 145 100       433 if ($self->{' update'}) {
540 5         32 foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) {
  97         219  
541 16 50       40 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
542             }
543              
544 5         16 my $fh = $self->{' INFILE'};
545 5         23 $fh->seek($self->{' epos'}, 0);
546             }
547              
548 145         849 $self->out_trailer($tdict, $self->{' update'});
549 145         1497 close($self->{' OUTFILE'});
550 145 50 33     668 if ($^O eq 'MacOS' and not ref($self->{' fname'})) {
551 0         0 MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'});
552             }
553              
554 145         711 return $self;
555             }
556              
557             =head2 ($value, $str) = $p->readval($str, %opts)
558              
559             Reads a PDF value from the current position in the file. If $str is too short
560             then read some more from the current location in the file until the whole object
561             is read. This is a recursive call which may slurp in a whole big stream (unprocessed).
562              
563             Returns the recursive data structure read and also the current $str that has been
564             read from the file.
565              
566             =cut
567              
568             sub readval {
569 842     842 1 14532 my ($self, $str, %opts) = @_;
570 842         1392 my $fh = $self->{' INFILE'};
571 842         1166 my ($result, $value);
572              
573 842 100       1673 my $update = defined($opts{update}) ? $opts{update} : 1;
574 842 100       1886 $str = update($fh, $str) if $update;
575              
576 842         2537 $str =~ s/^$ws_char+//; # Ignore initial white space
577 842         2200 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
578              
579             # Dictionary
580 842 100       9951 if ($str =~ m/^<
    100          
    100          
    100          
    100          
    50          
    100          
    50          
    50          
    0          
581 146         400 $str = substr ($str, 2);
582 146 100       390 $str = update($fh, $str) if $update;
583 146         478 $result = PDFDict();
584              
585 146         415 while ($str !~ m/^>>/) {
586 398         1321 $str =~ s/^$ws_char+//; # Ignore initial white space
587 398         1140 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
588              
589 398 50       2136 if ($str =~ s|^/($reg_char+)||) {
    0          
    0          
590 398         1116 my $key = PDF::API2::Basic::PDF::Name::name_to_string($1, $self);
591 398         1400 ($value, $str) = $self->readval($str, %opts);
592 398 50 50     1274 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
593 398         1027 $result->{$key} = $value;
594             }
595             }
596             elsif ($str =~ s|^/$ws_char+||) {
597             # fixes a broken key problem of acrobat. -- fredo
598 0         0 ($value, $str) = $self->readval($str, %opts);
599 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
600 0         0 $result->{'null'} = $value;
601             }
602             }
603             elsif ($str =~ s|^//|/|) {
604             # fixes again a broken key problem of illustrator/enfocus. -- fredo
605 0         0 ($value, $str) = $self->readval($str, %opts);
606 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
607 0         0 $result->{'null'} = $value;
608             }
609             }
610             else {
611 0         0 die "Invalid dictionary key";
612             }
613 398 100       1058 $str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk
614             }
615 146         485 $str =~ s/^>>//;
616 146 100       400 $str = update($fh, $str) if $update;
617             # streams can't be followed by a lone carriage-return.
618             # fredo: yes they can !!! -- use the MacOS Luke.
619 146 100 66     482 if (($str =~ s/^stream(?:(?:\015\012)|\012|\015)//) and ($result->{'Length'}->val != 0)) { # stream
620 10         38 my $length = $result->{'Length'}->val;
621 10         30 $result->{' streamsrc'} = $fh;
622 10         40 $result->{' streamloc'} = $fh->tell - length($str);
623              
624 10 50       127 unless ($opts{'nostreams'}) {
625 10 50       38 if ($length > length($str)) {
626 0         0 $value = $str;
627 0         0 $length -= length($str);
628 0         0 read $fh, $str, $length + 11; # slurp the whole stream!
629             }
630             else {
631 10         21 $value = '';
632             }
633 10         38 $value .= substr($str, 0, $length);
634 10         28 $result->{' stream'} = $value;
635 10         37 $result->{' nofilt'} = 1;
636 10 50       36 $str = update($fh, $str, 1) if $update; # tell update we are in-stream and only need an endstream
637 10         52 $str = substr($str, index($str, 'endstream') + 9);
638             }
639             }
640              
641 146 100 100     580 if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val}) {
642 34         121 bless $result, $types{$result->{'Type'}->val};
643             }
644             # gdj: FIXME: if any of the ws chars were crs, then the whole
645             # string might not have been read.
646             }
647              
648             # Indirect Object
649             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R/s) {
650 152         401 my $num = $1;
651 152         272 $value = $2;
652 152         1434 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R//s;
653 152 100       453 unless ($result = $self->test_obj($num, $value)) {
654 119         476 $result = PDF::API2::Basic::PDF::Objind->new();
655 119         381 $result->{' objnum'} = $num;
656 119         285 $result->{' objgen'} = $value;
657 119         266 $self->add_obj($result, $num, $value);
658             }
659 152         324 $result->{' parent'} = $self;
660 152         491 weaken $result->{' parent'};
661              
662             # Removed to address changes being lost when an indirect object is realised twice
663             # $result->{' realised'} = 0;
664              
665             # gdj: FIXME: if any of the ws chars were crs, then the whole
666             # string might not have been read.
667             }
668              
669             # Object
670             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj/s) {
671 95         187 my $obj;
672 95         264 my $num = $1;
673 95         190 $value = $2;
674 95         1065 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj//s;
675 95         397 ($obj, $str) = $self->readval($str, %opts);
676 95 100       269 if ($result = $self->test_obj($num, $value)) {
677 81         271 $result->merge($obj);
678             }
679             else {
680 14         25 $result = $obj;
681 14         35 $self->add_obj($result, $num, $value);
682 14         25 $result->{' realised'} = 1;
683             }
684 95 100       309 $str = update($fh, $str) if $update; # thanks to kundrat@kundrat.sk
685 95         630 $str =~ s/^endobj//;
686             }
687              
688             # Name
689             elsif ($str =~ m|^/($reg_char*)|s) {
690 287         740 $value = $1;
691 287         1440 $str =~ s|^/($reg_char*)||s;
692 287         1049 $result = PDF::API2::Basic::PDF::Name->from_pdf($value, $self);
693             }
694              
695             # Literal String
696             elsif ($str =~ m/^\(/) {
697             # We now need to find an unbalanced, unescaped right-paren.
698             # This can't be done with a regex.
699 2         10 my $value = '(';
700 2         7 $str = substr($str, 1);
701              
702 2         6 my $nested_level = 1;
703 2         6 while (1) {
704             # Ignore everything up to the first escaped or parenthesis character
705 2 50       11 if ($str =~ /^([^\\()]+)(.*)/s) {
706 2         8 $value .= $1;
707 2         6 $str = $2;
708             }
709              
710             # Ignore escaped parentheses
711 2 50       20 if ($str =~ /^(\\[()])/) {
    50          
    50          
    0          
712 0         0 $value .= $1;
713 0         0 $str = substr($str, 2);
714             }
715              
716             # Left parenthesis: increase nesting
717             elsif ($str =~ /^\(/) {
718 0         0 $value .= '(';
719 0         0 $str = substr($str, 1);
720 0         0 $nested_level++;
721             }
722              
723             # Right parenthesis: decrease nesting
724             elsif ($str =~ /^\)/) {
725 2         6 $value .= ')';
726 2         6 $str = substr($str, 1);
727 2         4 $nested_level--;
728 2 50       7 last unless $nested_level;
729             }
730              
731             # Other escaped character
732             elsif ($str =~ /^(\\[^()])/) {
733 0         0 $value .= $1;
734 0         0 $str = substr($str, 2);
735             }
736              
737             # If there wasn't an escaped or parenthesis character,
738             # read some more.
739             else {
740             # We don't use update because we don't want to remove
741             # whitespace or comments.
742 0 0       0 $fh->read($str, 255, length($str)) or die 'Unterminated string.';
743             }
744             }
745              
746 2         9 $result = PDF::API2::Basic::PDF::String->from_pdf($value);
747             }
748              
749             # Hex String
750             elsif ($str =~ m/^
751 0         0 $str =~ s/^
752 0         0 $fh->read($str, 255, length($str)) while (0 > index($str, '>'));
753 0         0 ($value, $str) = ($str =~ /^(.*?)>(.*)/s);
754 0         0 $result = PDF::API2::Basic::PDF::String->from_pdf('<' . $value . '>');
755             }
756              
757             # Array
758             elsif ($str =~ m/^\[/) {
759 74         267 $str =~ s/^\[//;
760 74 50       247 $str = update($fh, $str) if $update;
761 74         263 $result = PDFArray();
762 74         250 while ($str !~ m/^\]/) {
763 224         845 $str =~ s/^$ws_char+//; # Ignore initial white space
764 224         715 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
765              
766 224         682 ($value, $str) = $self->readval($str, %opts);
767 224         719 $result->add_elements($value);
768 224 50       551 $str = update($fh, $str) if $update; # str might just be exhausted!
769             }
770 74         287 $str =~ s/^\]//;
771             }
772              
773             # Boolean
774             elsif ($str =~ m/^(true|false)($irreg_char|$)/) {
775 0         0 $value = $1;
776 0         0 $str =~ s/^(?:true|false)//;
777 0         0 $result = PDF::API2::Basic::PDF::Bool->from_pdf($value);
778             }
779              
780             # Number
781             elsif ($str =~ m/^([+-.0-9]+)($irreg_char|$)/) {
782 86         270 $value = $1;
783 86         350 $str =~ s/^([+-.0-9]+)//;
784              
785             # If $str only consists of whitespace (or is empty), call update to
786             # see if this is the beginning of an indirect object or reference
787 86 100 100     1772 if ($update and ($str =~ /^$re_whitespace*$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace*$/s)) {
      100        
788 6         61 $str =~ s/^$re_whitespace+/ /s;
789 6         47 $str =~ s/$re_whitespace+$/ /s;
790 6         17 $str = update($fh, $str);
791 6 100       137 if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R|obj)/s) {
792 4         18 return $self->readval("$value $str", %opts);
793             }
794             }
795              
796 82         520 $result = PDF::API2::Basic::PDF::Number->from_pdf($value);
797             }
798              
799             # Null
800             elsif ($str =~ m/^null($irreg_char|$)/) {
801 0         0 $str =~ s/^null//;
802 0         0 $result = PDF::API2::Basic::PDF::Null->new;
803             }
804              
805             else {
806 0         0 die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . ".";
807             }
808              
809 838         3857 $str =~ s/^$ws_char+//s;
810 838         2841 return ($result, $str);
811             }
812              
813              
814             =head2 $ref = $p->read_obj($objind, %opts)
815              
816             Given an indirect object reference, locate it and read the object returning
817             the read in object.
818              
819             =cut
820              
821             sub read_obj {
822 77     77 1 174 my ($self, $objind, %opts) = @_;
823              
824 77   50     297 my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) || return;
825 77 50       267 $objind->merge($res) unless $objind eq $res;
826 77         321 return $objind;
827             }
828              
829              
830             =head2 $ref = $p->read_objnum($num, $gen, %opts)
831              
832             Returns a fully read object of given number and generation in this file
833              
834             =cut
835              
836             sub read_objnum {
837 85     85 1 2439 my ($self, $num, $gen, %opts) = @_;
838 85 50       194 croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num;
839 85 50       176 croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen;
840 85 50       428 croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/;
841 85 50       293 croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/;
842              
843 85   50     224 my $object_location = $self->locate_obj($num, $gen) || return;
844 85         151 my $object;
845              
846             # Compressed object
847 85 100       199 if (ref($object_location)) {
848 4         9 my ($object_stream_num, $object_stream_pos) = @{$object_location};
  4         9  
849              
850 4         32 my $object_stream = $self->read_objnum($object_stream_num, 0, %opts);
851 4 50       17 die 'Cannot find the compressed object stream' unless $object_stream;
852              
853 4 50       25 $object_stream->read_stream() if $object_stream->{' nofilt'};
854              
855             # An object stream starts with pairs of integers containing object numbers and
856             # stream offsets relative to the First key
857 4         8 my $fh;
858             my $pairs;
859 4 50       14 unless ($object_stream->{' streamfile'}) {
860 4         15 $pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val);
861             }
862             else {
863 0         0 CORE::open($fh, '<', $object_stream->{' streamfile'});
864 0         0 read($fh, $pairs, $object_stream->{'First'}->val());
865             }
866 4         31 my @map = split /\s+/, $pairs;
867              
868             # Find the offset of the object in the stream
869 4         11 my $index = $object_stream_pos * 2;
870 4 50       14 die "Objind $num does not exist at index $index" unless $map[$index] == $num;
871 4         10 my $start = $map[$index + 1];
872              
873             # Unless this is the last object in the stream, its length is determined by the
874             # offset of the next object
875 4         9 my $last_object_in_stream = $map[-2];
876 4         6 my $length;
877 4 100       15 if ($last_object_in_stream == $num) {
878 2 50       7 if ($object_stream->{' stream'}) {
879 2         20 $length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start;
880             }
881             else {
882 0         0 $length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start;
883             }
884             }
885             else {
886 2         7 my $next_start = $map[$index + 3];
887 2         6 $length = $next_start - $start;
888             }
889              
890             # Read the object from the stream
891 4         14 my $stream = "$num 0 obj ";
892 4 50       15 unless ($object_stream->{' streamfile'}) {
893 4         12 $stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length);
894             }
895             else {
896 0         0 seek($fh, $object_stream->{'First'}->val() + $start, 0);
897 0         0 read($fh, $stream, $length, length($stream));
898 0         0 close $fh;
899             }
900              
901 4         16 ($object) = $self->readval($stream, %opts, update => 0);
902 4         24 return $object;
903             }
904              
905 81         287 my $current_location = $self->{' INFILE'}->tell;
906 81         586 $self->{' INFILE'}->seek($object_location, 0);
907 81         816 ($object) = $self->readval('', %opts);
908 81         394 $self->{' INFILE'}->seek($current_location, 0);
909 81         1083 return $object;
910             }
911              
912              
913             =head2 $objind = $p->new_obj($obj)
914              
915             Creates a new, free object reference based on free space in the cross reference chain.
916             If nothing free then thinks up a new number. If $obj then turns that object into this
917             new object rather than returning a new object.
918              
919             =cut
920              
921             sub new_obj {
922 1072     1072 1 2094 my ($self, $base) = @_;
923 1072         1603 my $res;
924              
925 1072 50 66     2952 if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) {
  15         63  
926 0         0 $res = shift(@{$self->{' free'}});
  0         0  
927 0 0       0 if (defined $base) {
928 0         0 my ($num, $gen) = @{$self->{' objects'}{$res->uid}};
  0         0  
929 0         0 $self->remove_obj($res);
930 0         0 $self->add_obj($base, $num, $gen);
931 0         0 return $self->out_obj($base);
932             }
933             else {
934 0         0 $self->{' objects'}{$res->uid}[2] = 0;
935 0         0 return $res;
936             }
937             }
938              
939 1072         1722 my $tdict = $self;
940 1072         1502 my $i;
941 1072         2346 while (defined $tdict) {
942 1073 50       3241 $i = $tdict->{' xref'}{defined($i) ? $i : ''}[0];
943 1073   33     2798 while (defined $i and $i != 0) {
944 0         0 my ($ni, $ng) = @{$tdict->{' xref'}{$i}};
  0         0  
945 0 0       0 unless (defined $self->locate_obj($i, $ng)) {
946 0 0       0 if (defined $base) {
947 0         0 $self->add_obj($base, $i, $ng);
948 0         0 return $base;
949             }
950             else {
951 0   0     0 $res = $self->test_obj($i, $ng) || $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, $ng);
952 0         0 $self->out_obj($res);
953 0         0 return $res;
954             }
955             }
956 0         0 $i = $ni;
957             }
958 1073         2298 $tdict = $tdict->{' prev'};
959             }
960              
961 1072         1876 $i = $self->{' maxobj'}++;
962 1072 50       2015 if (defined $base) {
963 1072         2977 $self->add_obj($base, $i, 0);
964 1072         2788 $self->out_obj($base);
965 1072         2290 return $base;
966             }
967             else {
968 0         0 $res = $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, 0);
969 0         0 $self->out_obj($res);
970 0         0 return $res;
971             }
972             }
973              
974              
975             =head2 $p->out_obj($objind)
976              
977             Indicates that the given object reference should appear in the output xref
978             table whether with data or freed.
979              
980             =cut
981              
982             sub out_obj {
983 2676     2676 1 4534 my ($self, $obj) = @_;
984              
985             # This is why we've been keeping the outlist CACHE around; to speed
986             # up this method by orders of magnitude (it saves up from having to
987             # grep the full outlist each time through as we'll just do a lookup
988             # in the hash) (which is super-fast).
989 2676 100       7098 unless (exists $self->{' outlist_cache'}{$obj}) {
990 1087         1627 push @{$self->{' outlist'}}, $obj;
  1087         2436  
991             # weaken $self->{' outlist'}->[-1];
992 1087         3482 $self->{' outlist_cache'}{$obj} = 1;
993             }
994 2676         4879 return $obj;
995             }
996              
997              
998             =head2 $p->free_obj($objind)
999              
1000             Marks an object reference for output as being freed.
1001              
1002             =cut
1003              
1004             sub free_obj {
1005 0     0 1 0 my ($self, $obj) = @_;
1006              
1007 0         0 push @{$self->{' free'}}, $obj;
  0         0  
1008 0         0 $self->{' objects'}{$obj->uid()}[2] = 1;
1009 0         0 $self->out_obj($obj);
1010             }
1011              
1012              
1013             =head2 $p->remove_obj($objind)
1014              
1015             Removes the object from all places where we might remember it
1016              
1017             =cut
1018              
1019             sub remove_obj {
1020 0     0 1 0 my ($self, $objind) = @_;
1021              
1022             # who says it has to be fast
1023 0         0 delete $self->{' objects'}{$objind->uid()};
1024 0         0 delete $self->{' outlist_cache'}{$objind};
1025 0         0 delete $self->{' printed_cache'}{$objind};
1026 0         0 @{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}};
  0         0  
  0         0  
  0         0  
1027 0         0 @{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}};
  0         0  
  0         0  
  0         0  
1028             $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef
1029 0 0       0 if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind;
1030 0         0 return $self;
1031             }
1032              
1033              
1034             =head2 $p->ship_out(@objects)
1035              
1036             Ships the given objects (or all objects for output if @objects is empty) to
1037             the currently open output file (assuming there is one). Freed objects are not
1038             shipped, and once an object is shipped it is switched such that this file
1039             becomes its source and it will not be shipped again unless out_obj is called
1040             again. Notice that a shipped out object can be re-output or even freed, but
1041             that it will not cause the data already output to be changed.
1042              
1043             =cut
1044              
1045             sub ship_out {
1046 154     154 1 321 my ($self, @objs) = @_;
1047              
1048 154 50       402 die "No output file specified" unless defined $self->{' OUTFILE'};
1049 154         295 my $fh = $self->{' OUTFILE'};
1050 154         396 seek($fh, 0, 2); # go to the end of the file
1051              
1052 154 50       434 @objs = @{$self->{' outlist'}} unless scalar @objs > 0;
  154         437  
1053 154         386 foreach my $objind (@objs) {
1054 890 50       2481 next unless $objind->is_obj($self);
1055 890         1572 my $j = -1;
1056 890         1477 for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) {
  890         2058  
1057 890 50       2542 if ($self->{' outlist'}[$i] eq $objind) {
1058 890         1258 $j = $i;
1059 890         1605 last;
1060             }
1061             }
1062 890 50       1776 next if $j < 0;
1063 890         1217 splice(@{$self->{' outlist'}}, $j, 1);
  890         1715  
1064 890         2358 delete $self->{' outlist_cache'}{$objind};
1065 890 50       1216 next if grep { $_ eq $objind } @{$self->{' free'}};
  0         0  
  890         2279  
1066              
1067 890 50       1912 map { $fh->print("\% $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'};
  0         0  
1068 890         2501 $self->{' locs'}{$objind->uid()} = $fh->tell();
1069 890         1612 my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1];
  890         1856  
1070 890         2596 $fh->printf('%d %d obj ', $objnum, $objgen);
1071 890         10410 $objind->outobjdeep($fh, $self);
1072 890         3785 $fh->print(" endobj\n");
1073              
1074             # Note that we've output this obj, not forgetting to update
1075             # the cache of whats printed.
1076 890 50       6016 unless (exists $self->{' printed_cache'}{$objind}) {
1077 890         1323 push @{$self->{' printed'}}, $objind;
  890         1977  
1078 890         3070 $self->{' printed_cache'}{$objind}++;
1079             }
1080             }
1081 154         456 return $self;
1082             }
1083              
1084             =head2 $p->copy($outpdf, \&filter)
1085              
1086             Iterates over every object in the file reading the object, calling filter with the object
1087             and outputting the result. if filter is not defined, then just copies input to output.
1088              
1089             =cut
1090              
1091             sub copy {
1092 0     0 1 0 my ($self, $out, $filter) = @_;
1093 0         0 my ($obj, $minl, $mini, $ming);
1094              
1095 0         0 foreach my $key (grep { not m/^[\s\-]/ } keys %$self) {
  0         0  
1096 0 0       0 $out->{$key} = $self->{$key} unless defined $out->{$key};
1097             }
1098              
1099 0         0 my $tdict = $self;
1100 0         0 while (defined $tdict) {
1101 0         0 foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) {
  0         0  
  0         0  
1102 0         0 my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}};
  0         0  
1103 0 0       0 next unless $nt eq 'n';
1104              
1105 0 0 0     0 if ($nl < $minl or $mini == 0) {
1106 0         0 $mini = $i;
1107 0         0 $ming = $ng;
1108 0         0 $minl = $nl;
1109             }
1110 0 0       0 unless ($obj = $self->test_obj($i, $ng)) {
1111 0         0 $obj = PDF::API2::Basic::PDF::Objind->new();
1112 0         0 $obj->{' objnum'} = $i;
1113 0         0 $obj->{' objgen'} = $ng;
1114 0         0 $self->add_obj($obj, $i, $ng);
1115 0         0 $obj->{' parent'} = $self;
1116 0         0 weaken $obj->{' parent'};
1117 0         0 $obj->{' realised'} = 0;
1118             }
1119 0         0 $obj->realise;
1120 0 0       0 my $res = defined $filter ? &{$filter}($obj) : $obj;
  0         0  
1121 0 0 0     0 $out->new_obj($res) unless (!$res || $res->is_obj($out));
1122             }
1123 0         0 $tdict = $tdict->{' prev'};
1124             }
1125              
1126             # test for linearized and remove it from output
1127 0         0 $obj = $self->test_obj($mini, $ming);
1128 0 0 0     0 if ($obj->isa('PDF::API2::Basic::PDF::Dict') && $obj->{'Linearized'}) {
1129 0         0 $out->free_obj($obj);
1130             }
1131              
1132 0         0 return $self;
1133             }
1134              
1135              
1136             =head1 PRIVATE METHODS & FUNCTIONS
1137              
1138             The following methods and functions are considered private to this class. This
1139             does not mean you cannot use them if you have a need, just that they aren't really
1140             designed for users of this class.
1141              
1142             =head2 $offset = $p->locate_obj($num, $gen)
1143              
1144             Returns a file offset to the object asked for by following the chain of cross
1145             reference tables until it finds the one you want.
1146              
1147             =cut
1148              
1149             sub locate_obj {
1150 85     85 1 191 my ($self, $num, $gen) = @_;
1151              
1152 85         119 my $tdict = $self;
1153 85         199 while (defined $tdict) {
1154 94 100       293 if (ref $tdict->{' xref'}{$num}) {
1155 85         158 my $ref = $tdict->{' xref'}{$num};
1156 85 100       212 return $ref unless scalar(@$ref) == 3;
1157              
1158 81 50       243 if ($ref->[1] == $gen) {
1159 81 50       374 return $ref->[0] if $ref->[2] eq 'n';
1160 0         0 return; # if $ref->[2] eq 'f';
1161             }
1162             }
1163 9         20 $tdict = $tdict->{' prev'};
1164             }
1165 0         0 return;
1166             }
1167              
1168              
1169             =head2 update($fh, $str, $instream)
1170              
1171             Keeps reading $fh for more data to ensure that $str has at least a line full
1172             for C to work on. At this point we also take the opportunity to ignore
1173             comments.
1174              
1175             =cut
1176              
1177             sub update {
1178 1940     1940 1 3485 my ($fh, $str, $instream) = @_;
1179 1940 50       3456 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1180 1940 100       3124 if ($instream) {
1181             # we are inside a (possible binary) stream
1182             # so we fetch data till we see an 'endstream'
1183             # -- fredo/2004-09-03
1184 10   33     50 while ($str !~ m/endstream/ and not $fh->eof()) {
1185 0 0       0 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1186 0         0 $fh->read($str, 314, length($str));
1187             }
1188             }
1189             else {
1190 1930         8341 $str =~ s/^$ws_char*//;
1191 1930   100     16030 while ($str !~ m/$cr/ and not $fh->eof()) {
1192 107 50       1304 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1193 107         480 $fh->read($str, 314, length($str));
1194 107         2540 $str =~ s/^$ws_char*//so;
1195             }
1196 1930         4983 while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23
1197 1 50       6 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1198 1   33     42 $fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof());
1199 1         27 $str =~ s/^\%[^\015\012]*$ws_char*//so; # fixed for reportlab -- fredo
1200             }
1201             }
1202              
1203 1940         4564 return $str;
1204             }
1205              
1206             =head2 $objind = $p->test_obj($num, $gen)
1207              
1208             Tests the cache to see whether an object reference (which may or may not have
1209             been getobj()ed) has been cached. Returns it if it has.
1210              
1211             =cut
1212              
1213             sub test_obj {
1214 247     247 1 524 my ($self, $num, $gen) = @_;
1215 247         956 return $self->{' objcache'}{$num, $gen};
1216             }
1217              
1218              
1219             =head2 $p->add_obj($objind)
1220              
1221             Adds the given object to the internal object cache.
1222              
1223             =cut
1224              
1225             sub add_obj {
1226 1205     1205 1 2476 my ($self, $obj, $num, $gen) = @_;
1227              
1228 1205         4118 $self->{' objcache'}{$num, $gen} = $obj;
1229 1205         5410 $self->{' objects'}{$obj->uid()} = [$num, $gen];
1230             # weaken $self->{' objcache'}{$num, $gen};
1231 1205         2730 return $obj;
1232             }
1233              
1234              
1235             =head2 $tdict = $p->readxrtr($xpos)
1236              
1237             Recursive function which reads each of the cross-reference and trailer tables
1238             in turn until there are no more.
1239              
1240             Returns a dictionary corresponding to the trailer chain. Each trailer also
1241             includes the corresponding cross-reference table.
1242              
1243             The structure of the xref private element in a trailer dictionary is of an
1244             anonymous hash of cross reference elements by object number. Each element
1245             consists of an array of 3 elements corresponding to the three elements read
1246             in [location, generation number, free or used]. See the PDF specification
1247             for details.
1248              
1249             =cut
1250              
1251             sub _unpack_xref_stream {
1252 78     78   129 my ($self, $width, $data) = @_;
1253              
1254 78 100       157 return unpack('C', $data) if $width == 1;
1255 52 50       113 return unpack('n', $data) if $width == 2;
1256 0 0       0 return unpack('N', "\x00$data") if $width == 3;
1257 0 0       0 return unpack('N', $data) if $width == 4;
1258 0 0       0 return unpack('Q>', $data) if $width == 8;
1259              
1260 0         0 die "Unsupported xref stream entry width: $width";
1261             }
1262              
1263             sub readxrtr {
1264 19     19 1 69 my ($self, $xpos) = @_;
1265 19         67 my ($tdict, $buf, $xmin, $xnum, $xdiff);
1266              
1267 19         50 my $fh = $self->{' INFILE'};
1268 19         89 $fh->seek($xpos, 0);
1269 19         264 $fh->read($buf, 22);
1270 19         248 $buf = update($fh, $buf); # fix for broken JAWS xref calculation.
1271              
1272 19         70 my $xlist = {};
1273              
1274             ## seams that some products calculate wrong prev entries (short)
1275             ## so we seek ahead to find one -- fredo; save for now
1276             #while($buf !~ m/^xref$cr/i && !eof($fh))
1277             #{
1278             # $buf =~ s/^(\s+|\S+|.)//i;
1279             # $buf=update($fh,$buf);
1280             #}
1281              
1282 19 100       324 if ($buf =~ s/^xref$cr//i) {
    50          
1283             # Plain XRef tables.
1284 16         546 while ($buf =~ m/^$ws_char*([0-9]+)$ws_char+([0-9]+)$ws_char*$cr(.*?)$/s) {
1285 21         68 my $old_buf = $buf;
1286 21         59 $xmin = $1;
1287 21         57 $xnum = $2;
1288 21         52 $buf = $3;
1289 21 50       368 unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) {
1290             # See PDF 1.7 section 7.5.4: Cross-Reference Table
1291 0         0 warn q{Malformed xref in PDF file: subsection shall begin with a line containing two numbers separated by a SPACE (20h)};
1292             }
1293 21         62 $xdiff = length($buf);
1294              
1295 21         125 $fh->read($buf, 20 * $xnum - $xdiff + 15, $xdiff);
1296 21   66     736 while ($xnum-- > 0 and $buf =~ s/^0*([0-9]*)$ws_char+0*([0-9]+)$ws_char+([nf])$cr//) {
1297 118 50       680 $xlist->{$xmin} = [$1, $2, $3] unless exists $xlist->{$xmin};
1298 118         941 $xmin++;
1299             }
1300             }
1301              
1302 16 50       110 if ($buf !~ /^\s*trailer\b/i) {
1303 0         0 die "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell - length($buf));
1304             }
1305              
1306 16         70 $buf =~ s/^\s*trailer\b//i;
1307              
1308 16         81 ($tdict, $buf) = $self->readval($buf);
1309             }
1310             elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) {
1311 3         18 my ($xref_obj, $xref_gen) = ($1, $2);
1312              
1313             # XRef streams.
1314 3         15 ($tdict, $buf) = $self->readval($buf);
1315              
1316 3 50       13 unless ($tdict->{' stream'}) {
1317 0         0 die "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}";
1318             }
1319 3         19 $tdict->read_stream(1);
1320              
1321 3         8 my $stream = $tdict->{' stream'};
1322 3         5 my @widths = map { $_->val } @{$tdict->{W}->val};
  9         22  
  3         13  
1323              
1324 3         7 my $start = 0;
1325 3         6 my $last;
1326              
1327             my @index;
1328 3 100       10 if (defined $tdict->{Index}) {
1329 1         2 @index = map { $_->val() } @{$tdict->{Index}->val};
  2         6  
  1         3  
1330             }
1331             else {
1332 2         9 @index = (0, $tdict->{Size}->val);
1333             }
1334              
1335 3         11 while (scalar @index) {
1336 3         6 $start = shift(@index);
1337 3         9 $last = $start + shift(@index) - 1;
1338              
1339 3         11 for my $i ($start...$last) {
1340             # Replaced "for $xmin" because it creates a loop-specific local variable, and we
1341             # need $xmin to be correct for maxobj below.
1342 26         37 $xmin = $i;
1343              
1344 26         30 my @cols;
1345              
1346 26         43 for my $w (@widths) {
1347 78         100 my $data;
1348 78 50       202 $data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w;
1349              
1350 78         153 push @cols, $data;
1351             }
1352              
1353 26 100       51 $cols[0] = 1 unless defined $cols[0];
1354 26 50       48 if ($cols[0] > 2) {
1355 0         0 die "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj";
1356             }
1357              
1358 26 50       50 next if exists $xlist->{$xmin};
1359              
1360 26 50       60 my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535));
    100          
1361 26 100       60 push @objind, ($cols[0] == 0 ? 'f' : 'n') if $cols[0] < 2;
    100          
1362              
1363 26         79 $xlist->{$xmin} = \@objind;
1364             }
1365             }
1366             }
1367             else {
1368 0         0 die "Malformed xref in PDF file $self->{' fname'}";
1369             }
1370              
1371 19         68 $tdict->{' loc'} = $xpos;
1372 19         68 $tdict->{' xref'} = $xlist;
1373 19 100       115 $self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'};
1374             $tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val)
1375 19 100 66     92 if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val != 0);
1376 19 100       80 delete $tdict->{' prev'} unless defined $tdict->{' prev'};
1377 19         67 return $tdict;
1378             }
1379              
1380              
1381             =head2 $p->out_trailer($tdict)
1382              
1383             Outputs the body and trailer for a PDF file by outputting all the objects in
1384             the ' outlist' and then outputting a xref table for those objects and any
1385             freed ones. It then outputs the trailing dictionary and the trailer code.
1386              
1387             =cut
1388              
1389             sub out_trailer {
1390 148     148 1 553 my ($self, $tdict, $update) = @_;
1391 148         308 my $fh = $self->{' OUTFILE'};
1392              
1393 148         247 while (@{$self->{' outlist'}}) {
  302         829  
1394 154         457 $self->ship_out();
1395             }
1396              
1397             # When writing new trailers, most dictionary entries get copied from the
1398             # previous trailer, but entries related to cross-reference streams should
1399             # get removed (and possibly recreated below).
1400 148         939 delete $tdict->{$_} for (# Entries common to streams
1401             qw(Length Filter DecodeParms F FFilter FDecodeParms DL),
1402              
1403             # Entries specific to cross-reference streams
1404             qw(Index W XRefStm));
1405              
1406 148         514 $tdict->{'Size'} = PDFNum($self->{' maxobj'});
1407              
1408 148         456 my $tloc = $fh->tell();
1409 148         855 my @out;
1410              
1411 148 100       284 my @xreflist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } (@{$self->{' printed'} || []}, @{$self->{' free'} || []});
  1334 100       2745  
  148         509  
  148         676  
1412              
1413 148         391 my ($i, $j, $k);
1414 148 100       393 unless ($update) {
1415 140         248 $i = 1;
1416 140         437 for ($j = 0; $j < @xreflist; $j++) {
1417 868         1151 my @inserts;
1418 868         1205 $k = $xreflist[$j];
1419 868         1813 while ($i < $self->{' objects'}{$k->uid}[0]) {
1420 0         0 my ($n) = PDF::API2::Basic::PDF::Objind->new();
1421 0         0 $self->add_obj($n, $i, 0);
1422 0         0 $self->free_obj($n);
1423 0         0 push(@inserts, $n);
1424 0         0 $i++;
1425             }
1426 868         1504 splice(@xreflist, $j, 0, @inserts);
1427 868         1285 $j += @inserts;
1428 868         1779 $i++;
1429             }
1430             }
1431              
1432 148 100       332 my @freelist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } @{$self->{' free'} || []};
  0         0  
  148         553  
1433              
1434 148         272 $j = 0; my $first = -1; $k = 0;
  148         261  
  148         302  
1435 148         491 for ($i = 0; $i <= $#xreflist + 1; $i++) {
1436 1038 100 100     3049 if ($i > $#xreflist || $self->{' objects'}{$xreflist[$i]->uid}[0] != $j + 1) {
1437 160 100       744 push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid}[0] ") . ($i - $first) . "\n";
1438 160 100       447 if ($first == -1) {
1439 148 50       777 push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid}[0] : 0);
1440 148         314 $first = 0;
1441             }
1442 160         485 for ($j = $first; $j < $i; $j++) {
1443 890         1432 my $xref = $xreflist[$j];
1444 890 50 33     2060 if (defined($freelist[$k]) and defined($xref) and "$freelist[$k]" eq "$xref") {
      33        
1445 0         0 $k++;
1446             push @out, pack("A10AA5A4",
1447             sprintf("%010d", (defined $freelist[$k] ?
1448             $self->{' objects'}{$freelist[$k]->uid}[0] : 0)), " ",
1449 0 0       0 sprintf("%05d", $self->{' objects'}{$xref->uid}[1] + 1),
1450             " f \n");
1451             }
1452             else {
1453             push @out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid}), " ",
1454 890         2154 sprintf("%05d", $self->{' objects'}{$xref->uid}[1]),
1455             " n \n");
1456             }
1457             }
1458 160         352 $first = $i;
1459 160 100       645 $j = $self->{' objects'}{$xreflist[$i]->uid}[0] if ($i < scalar @xreflist);
1460             }
1461             else {
1462 878         1877 $j++;
1463             }
1464             }
1465 148 50 33     554 if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') {
1466 0         0 my (@index, @stream);
1467 0         0 for (@out) {
1468 0         0 my @a = split;
1469 0 0       0 @a == 2 ? push @index, @a : push @stream, \@a;
1470             }
1471 0         0 my $i = $self->{' maxobj'}++;
1472 0         0 $self->add_obj($tdict, $i, 0);
1473 0         0 $self->out_obj($tdict );
1474              
1475 0         0 push @index, $i, 1;
1476 0         0 push @stream, [$tloc, 0, 'n'];
1477              
1478 0 0       0 my $len = $tloc > 0xFFFF ? 4 : 2; # don't expect files > 4 Gb
1479 0 0       0 my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC'; # don't expect gennum > 255, it's absurd.
1480             # Adobe doesn't use them anymore anyway
1481 0         0 my $stream = '';
1482 0         0 my @prev = (0) x ($len + 2);
1483 0         0 for (@stream) {
1484 0 0 0     0 $_->[1] = 0 if $_->[2] eq 'f' and $_->[1] == 65535;
1485 0 0       0 my @line = unpack 'C*', pack $tpl, $_->[2] eq 'n' ? 1 : 0, @{$_}[0..1];
  0         0  
1486              
1487             $stream .= pack 'C*', 2, # prepend filtering method, "PNG Up"
1488 0         0 map {($line[$_] - $prev[$_] + 256) % 256 } 0 .. $#line;
  0         0  
1489 0         0 @prev = @line;
1490             }
1491 0         0 $tdict->{'Size'} = PDFNum($i + 1);
1492 0         0 $tdict->{'Index'} = PDFArray(map PDFNum( $_ ), @index);
1493 0         0 $tdict->{'W'} = PDFArray(map PDFNum( $_ ), 1, $len, 1);
1494 0         0 $tdict->{'Filter'} = PDFName('FlateDecode');
1495              
1496 0         0 $tdict->{'DecodeParms'} = PDFDict();
1497 0         0 $tdict->{'DecodeParms'}->val->{'Predictor'} = PDFNum(12);
1498 0         0 $tdict->{'DecodeParms'}->val->{'Columns'} = PDFNum($len + 2);
1499              
1500 0         0 $stream = PDF::API2::Basic::PDF::Filter::FlateDecode->new->outfilt($stream, 1);
1501 0         0 $tdict->{' stream'} = $stream;
1502 0         0 $tdict->{' nofilt'} = 1;
1503 0         0 delete $tdict->{'Length'};
1504 0         0 $self->ship_out();
1505             }
1506             else {
1507 148         609 $fh->print("xref\n", @out, "trailer\n");
1508 148         1601 $tdict->outobjdeep($fh, $self);
1509 148         487 $fh->print("\n");
1510             }
1511 148         1218 $fh->print("startxref\n$tloc\n%%EOF\n");
1512             }
1513              
1514              
1515             =head2 PDF::API2::Basic::PDF::File->_new
1516              
1517             Creates a very empty PDF file object (used by new and open)
1518              
1519             =cut
1520              
1521             sub _new {
1522 180     180   418 my $class = shift();
1523 180         366 my $self = {};
1524              
1525 180         402 bless $self, $class;
1526 180         640 $self->{' outlist'} = [];
1527 180         442 $self->{' outlist_cache'} = {}; # A cache of whats in the 'outlist'
1528 180         385 $self->{' maxobj'} = 1;
1529 180         398 $self->{' objcache'} = {};
1530 180         384 $self->{' objects'} = {};
1531              
1532 180         397 return $self;
1533             }
1534              
1535             1;
1536              
1537             =head1 AUTHOR
1538              
1539             Martin Hosken Martin_Hosken@sil.org
1540              
1541             Copyright Martin Hosken 1999 and onwards
1542              
1543             No warranty or expression of effectiveness, least of all regarding anyone's
1544             safety, is implied in this software or documentation.