File Coverage

blib/lib/PDF/API2/Basic/PDF/File.pm
Criterion Covered Total %
statement 542 717 75.5
branch 187 322 58.0
condition 41 90 45.5
subroutine 45 48 93.7
pod 27 27 100.0
total 842 1204 69.9


line stmt bran cond sub pod time code
1             # Code in the PDF::API2::Basic::PDF namespace was originally copied from the
2             # Text::PDF distribution.
3             #
4             # Copyright Martin Hosken
5             #
6             # Martin Hosken's code may be used under the terms of the MIT license.
7             # Subsequent versions of the code have the same license as PDF::API2.
8              
9             package PDF::API2::Basic::PDF::File;
10              
11 40     40   56169 use strict;
  40         113  
  40         2577  
12              
13             our $VERSION = '2.045'; # VERSION
14              
15             =head1 NAME
16              
17             PDF::API2::Basic::PDF::File - Low-level PDF file access
18              
19             =head1 SYNOPSIS
20              
21             $p = PDF::API2::Basic::PDF::File->open("filename.pdf", 1);
22             $p->new_obj($obj_ref);
23             $p->free_obj($obj_ref);
24             $p->append_file;
25             $p->close_file;
26             $p->release; # IMPORTANT!
27              
28             =head1 DESCRIPTION
29              
30             This class keeps track of the directory aspects of a PDF file. There are two
31             parts to the directory: the main directory object which is the parent to all
32             other objects and a chain of cross-reference tables and corresponding trailer
33             dictionaries starting with the main directory object.
34              
35             =head1 INSTANCE VARIABLES
36              
37             Within this class hierarchy, rather than making everything visible via methods,
38             which would be a lot of work, there are various instance variables which are
39             accessible via associative array referencing. To distinguish instance variables
40             from content variables (which may come from the PDF content itself), each such
41             variable will start with a space.
42              
43             Variables which do not start with a space directly reflect elements in a PDF
44             dictionary. In the case of a PDF::API2::Basic::PDF::File, the elements reflect those in the
45             trailer dictionary.
46              
47             Since some variables are not designed for class users to access, variables are
48             marked in the documentation with (R) to indicate that such an entry should only
49             be used as read-only information. (P) indicates that the information is private
50             and not designed for user use at all, but is included in the documentation for
51             completeness and to ensure that nobody else tries to use it.
52              
53             =over
54              
55             =item newroot
56              
57             This variable allows the user to create a new root entry to occur in the trailer
58             dictionary which is output when the file is written or appended. If you wish to
59             over-ride the root element in the dictionary you have, use this entry to indicate
60             that without losing the current Root entry. Notice that newroot should point to
61             a PDF level object and not just to a dictionary which does not have object status.
62              
63             =item INFILE (R)
64              
65             Contains the filehandle used to read this information into this PDF directory. Is
66             an IO object.
67              
68             =item fname (R)
69              
70             This is the filename which is reflected by INFILE, or the original IO object passed
71             in.
72              
73             =item update (R)
74              
75             This indicates that the read file has been opened for update and that at some
76             point, $p->appendfile() can be called to update the file with the changes that
77             have been made to the memory representation.
78              
79             =item maxobj (R)
80              
81             Contains the first usable object number above any that have already appeared
82             in the file so far.
83              
84             =item outlist (P)
85              
86             This is a list of Objind which are to be output when the next appendfile or outfile
87             occurs.
88              
89             =item firstfree (P)
90              
91             Contains the first free object in the free object list. Free objects are removed
92             from the front of the list and added to the end.
93              
94             =item lastfree (P)
95              
96             Contains the last free object in the free list. It may be the same as the firstfree
97             if there is only one free object.
98              
99             =item objcache (P)
100              
101             All objects are held in the cache to ensure that a system only has one occurrence of
102             each object. In effect, the objind class acts as a container type class to hold the
103             PDF object structure and it would be unfortunate if there were two identical
104             place-holders floating around a system.
105              
106             =item epos (P)
107              
108             The end location of the read-file.
109              
110             =back
111              
112             Each trailer dictionary contains a number of private instance variables which
113             hold the chain together.
114              
115             =over
116              
117             =item loc (P)
118              
119             Contains the location of the start of the cross-reference table preceding the
120             trailer.
121              
122             =item xref (P)
123              
124             Contains an anonymous array of each cross-reference table entry.
125              
126             =item prev (P)
127              
128             A reference to the previous table. Note this differs from the Prev entry which
129             is in PDF which contains the location of the previous cross-reference table.
130              
131             =back
132              
133             =head1 METHODS
134              
135             =cut
136              
137 40     40   345 use Scalar::Util qw(blessed weaken);
  40         90  
  40         2473  
138              
139 40     40   296 use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types);
  40         90  
  40         6152  
140              
141             $ws_char = '[ \t\r\n\f\0]';
142             $delim_char = '[][<>{}()/%]';
143             $reg_char = '[^][<>{}()/% \t\r\n\f\0]';
144             $irreg_char = '[][<>{}()/% \t\r\n\f\0]';
145             $cr = '\s*(?:\015|\012|(?:\015\012))';
146              
147             my $re_comment = qr/(?:\%[^\r\n]*)/;
148             my $re_whitespace = qr/(?:[ \t\r\n\f\0]|$re_comment)/;
149              
150             %types = (
151             'Page' => 'PDF::API2::Basic::PDF::Page',
152             'Pages' => 'PDF::API2::Basic::PDF::Pages',
153             );
154              
155             my $readDebug = 0;
156              
157 40     40   308 use Carp;
  40         91  
  40         2378  
158 40     40   1021 use IO::File;
  40         7267  
  40         5458  
159              
160             # Now for the basic PDF types
161 40     40   625 use PDF::API2::Basic::PDF::Utils;
  40         90  
  40         2830  
162              
163 40     40   260 use PDF::API2::Basic::PDF::Array;
  40         86  
  40         1129  
164 40     40   242 use PDF::API2::Basic::PDF::Bool;
  40         78  
  40         1003  
165 40     40   259 use PDF::API2::Basic::PDF::Dict;
  40         101  
  40         932  
166 40     40   223 use PDF::API2::Basic::PDF::Name;
  40         82  
  40         1007  
167 40     40   260 use PDF::API2::Basic::PDF::Number;
  40         115  
  40         976  
168 40     40   211 use PDF::API2::Basic::PDF::Objind;
  40         99  
  40         1145  
169 40     40   250 use PDF::API2::Basic::PDF::String;
  40         90  
  40         879  
170 40     40   16863 use PDF::API2::Basic::PDF::Page;
  40         108  
  40         1289  
171 40     40   251 use PDF::API2::Basic::PDF::Pages;
  40         97  
  40         762  
172 40     40   200 use PDF::API2::Basic::PDF::Null;
  40         82  
  40         907  
173 40     40   190 use POSIX qw(ceil floor);
  40         146  
  40         254  
174              
175 40     40   3166 no warnings qw[ deprecated recursion uninitialized ];
  40         95  
  40         42347  
176              
177              
178             =head2 PDF::API2::Basic::PDF::File->new
179              
180             Creates a new, empty file object which can act as the host to other PDF objects.
181             Since there is no file associated with this object, it is assumed that the
182             object is created in readiness for creating a new PDF file.
183              
184             =cut
185              
186             sub new {
187 164     164 1 413 my ($class, $root) = @_;
188 164         511 my $self = $class->_new();
189              
190 164 50       459 unless ($root) {
191 164         538 $root = PDFDict();
192 164         492 $root->{'Type'} = PDFName('Catalog');
193             }
194 164         573 $self->new_obj($root);
195 164         317 $self->{'Root'} = $root;
196 164         558 return $self;
197             }
198              
199              
200             =head2 $p = PDF::API2::Basic::PDF::File->open($filename, $update)
201              
202             Opens the file and reads all the trailers and cross reference tables to build
203             a complete directory of objects.
204              
205             $update specifies whether this file is being opened for updating and editing,
206             or simply to be read.
207              
208             $filename may be an IO object
209              
210             =cut
211              
212             sub open {
213 16     16 1 56 my ($class, $filename, $update) = @_;
214 16         36 my ($fh, $buffer);
215              
216 16         68 my $self = $class->_new();
217 16 100       78 if (ref $filename) {
218 8         21 $self->{' INFILE'} = $filename;
219 8 50       23 if ($update) {
220 8         19 $self->{' update'} = 1;
221 8         25 $self->{' OUTFILE'} = $filename;
222             }
223 8         13 $fh = $filename;
224             }
225             else {
226 8 50       97 die "File '$filename' does not exist" unless -f $filename;
227 8 50       103 die "File '$filename' is not readable" unless -r $filename;
228 8 50       42 if ($update) {
229 8 50       100 die "File '$filename' is not writable" unless -w $filename;
230             }
231 8   50     106 $fh = IO::File->new(($update ? '+' : '') . "<$filename")
232             || die "Error opening '$filename': $!";
233 8         804 $self->{' INFILE'} = $fh;
234 8 50       29 if ($update) {
235 8         31 $self->{' update'} = 1;
236 8         30 $self->{' OUTFILE'} = $fh;
237 8         22 $self->{' fname'} = $filename;
238             }
239             }
240 16         86 binmode $fh, ':raw';
241 16         162 $fh->seek(0, 0); # go to start of file
242 16         230 $fh->read($buffer, 255);
243 16 50       824 unless ($buffer =~ /^\%PDF\-([12]\.\d+)\s*$cr/m) {
244 0         0 croak "$filename does not appear to be a valid PDF";
245             }
246 16         115 $self->{' version'} = $1;
247              
248 16         89 $fh->seek(0, 2); # go to end of file
249 16         242 my $end = $fh->tell();
250 16         126 $self->{' epos'} = $end;
251 16         62 foreach my $offset (1..64) {
252 32         149 $fh->seek($end - 16 * $offset, 0);
253 32         315 $fh->read($buffer, 16 * $offset);
254 32 100       844 last if $buffer =~ m/startxref($cr|\s*)\d+($cr|\s*)\%\%eof.*?/i;
255             }
256 16 50       459 unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr|\s*)\%\%eof.*?/i) {
257 0         0 die "Malformed PDF file $filename";
258             }
259 16         69 my $xpos = $1;
260 16         50 $self->{' xref_position'} = $xpos;
261              
262 16         72 my $tdict = $self->readxrtr($xpos, $self);
263 16         81 foreach my $key (keys %$tdict) {
264 121         248 $self->{$key} = $tdict->{$key};
265             }
266 16         144 return $self;
267             }
268              
269             =head2 $p->version($version)
270              
271             Gets/sets the PDF version (e.g. 1.4)
272              
273             =cut
274              
275             sub version {
276 9     9 1 12 my $self = shift();
277              
278 9 100       16 if (@_) {
279 2         3 my $version = shift();
280 2 50       13 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
281 2         5 $self->header_version($version);
282 2 50       8 if ($version >= 1.4) {
283 2         4 $self->trailer_version($version);
284             }
285             else {
286 0         0 delete $self->{'Root'}->{'Version'};
287 0         0 $self->out_obj($self->{'Root'});
288             }
289 2         5 return $version;
290             }
291              
292 7         12 my $header_version = $self->header_version();
293 7         11 my $trailer_version = $self->trailer_version();
294 7 100       25 return $trailer_version if $trailer_version > $header_version;
295 3         11 return $header_version;
296             }
297              
298             =head2 $version = $p->header_version($version)
299              
300             Gets/sets the PDF version stored in the file header.
301              
302             =cut
303              
304             sub header_version {
305 14     14 1 30 my $self = shift();
306              
307 14 100       22 if (@_) {
308 5         8 my $version = shift();
309 5 50       20 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
310 5         10 $self->{' version'} = $version;
311             }
312              
313 14         25 return $self->{' version'};
314             }
315              
316             =head2 $version = $p->trailer_version($version)
317              
318             Gets/sets the PDF version stored in the document catalog.
319              
320             =cut
321              
322             sub trailer_version {
323 12     12 1 21 my $self = shift();
324              
325 12 100       21 if (@_) {
326 4         6 my $version = shift();
327 4 50       12 croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
328 4         12 $self->{'Root'}->{'Version'} = PDFName($version);
329 4         11 $self->out_obj($self->{'Root'});
330 4         5 return $version;
331             }
332              
333 8 50       15 return unless $self->{'Root'}->{'Version'};
334 8         26 $self->{'Root'}->{'Version'}->realise();
335 8         22 return $self->{'Root'}->{'Version'}->val();
336             }
337              
338             =head2 $prev_version = $p->require_version($version)
339              
340             Ensures that the PDF version is at least C<$version>.
341              
342             =cut
343              
344             sub require_version {
345 3     3 1 594 my ($self, $min_version) = @_;
346 3         6 my $current_version = $self->version();
347 3 100       12 $self->version($min_version) if $current_version < $min_version;
348 3         4 return $current_version;
349             }
350              
351             =head2 $p->release()
352              
353             Releases ALL of the memory used by the PDF document and all of its
354             component objects. After calling this method, do B expect to
355             have anything left in the C object (so if
356             you need to save, be sure to do it before calling this method).
357              
358             B, that it is important that you call this method on any
359             C object when you wish to destruct it and
360             free up its memory. Internally, PDF files have an enormous number of
361             cross-references and this causes circular references within the
362             internal data structures. Calling 'C' forces a brute-force
363             cleanup of the data structures, freeing up all of the memory. Once
364             you've called this method, though, don't expect to be able to do
365             anything else with the C object; it'll
366             have B internal state whatsoever.
367              
368             =cut
369              
370             # Maintainer's Question: Couldn't this be handled by a DESTROY method
371             # instead of requiring an explicit call to release()?
372             sub release {
373 609     609 1 871 my $self = shift();
374              
375 609 50       1251 return $self unless ref($self);
376 609         1322 my @tofree = values %$self;
377              
378 609         1401 foreach my $key (keys %$self) {
379 2271         2848 $self->{$key} = undef;
380 2271         3203 delete $self->{$key};
381             }
382              
383             # PDFs with highly-interconnected page trees or outlines can hit Perl's
384             # recursion limit pretty easily, so disable the warning for this specific
385             # loop.
386 40     40   335 no warnings 'recursion';
  40         92  
  40         297469  
387              
388 609         2131 while (my $item = shift @tofree) {
389 7109 100 100     23630 if (blessed($item) and $item->can('release')) {
    100          
    100          
390 2117         4194 $item->release(1);
391             }
392             elsif (ref($item) eq 'ARRAY') {
393 1554         4194 push @tofree, @$item;
394             }
395             elsif (ref($item) eq 'HASH') {
396 877         2105 push @tofree, values %$item;
397 877         2026 foreach my $key (keys %$item) {
398 3822         4649 $item->{$key} = undef;
399 3822         6661 delete $item->{$key};
400             }
401             }
402             else {
403 2561         5605 $item = undef;
404             }
405             }
406             }
407              
408             =head2 $p->append_file()
409              
410             Appends the objects for output to the read file and then appends the appropriate table.
411              
412             =cut
413              
414             sub append_file {
415 3     3 1 6 my $self = shift();
416 3 50       11 return unless $self->{' update'};
417              
418 3         7 my $fh = $self->{' INFILE'};
419              
420 3         10 my $tdict = PDFDict();
421 3         13 $tdict->{'Prev'} = PDFNum($self->{' loc'});
422 3         9 $tdict->{'Info'} = $self->{'Info'};
423 3 50       10 if (defined $self->{' newroot'}) {
424 0         0 $tdict->{'Root'} = $self->{' newroot'};
425             }
426             else {
427 3         9 $tdict->{'Root'} = $self->{'Root'};
428             }
429 3         7 $tdict->{'Size'} = $self->{'Size'};
430              
431 3         16 foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) {
  54         100  
432 9 50       24 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
433             }
434              
435 3         21 $fh->seek($self->{' epos'}, 0);
436 3         27 $self->out_trailer($tdict, $self->{' update'});
437 3         34 close $self->{' OUTFILE'};
438             }
439              
440              
441             =head2 $p->out_file($fname)
442              
443             Writes a PDF file to a file of the given filename based on the current list of
444             objects to be output. It creates the trailer dictionary based on information
445             in $self.
446              
447             $fname may be an IO object;
448              
449             =cut
450              
451             sub out_file {
452 140     140 1 352 my ($self, $fname) = @_;
453              
454 140         441 $self->create_file($fname);
455 140         418 $self->close_file();
456 140         338 return $self;
457             }
458              
459              
460             =head2 $p->create_file($fname)
461              
462             Creates a new output file (no check is made of an existing open file) of
463             the given filename or IO object. Note, make sure that $p->{' version'} is set
464             correctly before calling this function.
465              
466             =cut
467              
468             sub create_file {
469 140     140 1 294 my ($self, $filename) = @_;
470 140         222 my $fh;
471              
472 140         303 $self->{' fname'} = $filename;
473 140 50       355 if (ref $filename) {
474 140         249 $fh = $filename;
475             }
476             else {
477 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
478 0         0 binmode($fh,':raw');
479             }
480              
481 140         255 $self->{' OUTFILE'} = $fh;
482 140   50     824 $fh->print('%PDF-' . ($self->{' version'} // '1.4') . "\n");
483 140         1299 $fh->print("%\xC6\xCD\xCD\xB5\n"); # and some binary stuff in a comment
484 140         740 return $self;
485             }
486              
487              
488             =head2 $p->clone_file($fname)
489              
490             Creates a copy of the input file at the specified filename and sets it as the
491             output file for future writes. A file handle may be passed instead of a
492             filename.
493              
494             =cut
495              
496             sub clone_file {
497 4     4 1 12 my ($self, $filename) = @_;
498 4         8 my $fh;
499              
500 4         10 $self->{' fname'} = $filename;
501 4 50       12 if (ref $filename) {
502 4         7 $fh = $filename;
503             }
504             else {
505 0   0     0 $fh = IO::File->new(">$filename") || die "Unable to open $filename for writing";
506 0         0 binmode($fh,':raw');
507             }
508              
509 4         10 $self->{' OUTFILE'} = $fh;
510              
511 4         7 my $in = $self->{' INFILE'};
512 4         18 $in->seek(0, 0);
513 4         50 my $data;
514 4         24 while (not $in->eof()) {
515 4         74 $in->read($data, 1024 * 1024);
516 4         119 $fh->print($data);
517             }
518 4         62 return $self;
519             }
520              
521             =head2 $p->close_file
522              
523             Closes up the open file for output by outputting the trailer etc.
524              
525             =cut
526              
527             sub close_file {
528 145     145 1 228 my $self = shift();
529              
530 145         392 my $tdict = PDFDict();
531 145 50       523 $tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'};
532 145 50 33     544 $tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'};
533              
534             # remove all freed objects from the outlist, AND the outlist_cache if not updating
535             # NO! Don't do that thing! In fact, let out_trailer do the opposite!
536              
537 145   66     523 $tdict->{'Size'} = $self->{'Size'} || PDFNum(1);
538 145 100       394 $tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'};
539 145 100       354 if ($self->{' update'}) {
540 5         26 foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) {
  97         169  
541 16 50       36 $tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
542             }
543              
544 5         10 my $fh = $self->{' INFILE'};
545 5         28 $fh->seek($self->{' epos'}, 0);
546             }
547              
548 145         757 $self->out_trailer($tdict, $self->{' update'});
549 145         1373 close($self->{' OUTFILE'});
550 145 50 33     624 if ($^O eq 'MacOS' and not ref($self->{' fname'})) {
551 0         0 MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'});
552             }
553              
554 145         647 return $self;
555             }
556              
557             =head2 ($value, $str) = $p->readval($str, %opts)
558              
559             Reads a PDF value from the current position in the file. If $str is too short
560             then read some more from the current location in the file until the whole object
561             is read. This is a recursive call which may slurp in a whole big stream (unprocessed).
562              
563             Returns the recursive data structure read and also the current $str that has been
564             read from the file.
565              
566             =cut
567              
568             sub readval {
569 842     842 1 11353 my ($self, $str, %opts) = @_;
570 842         1210 my $fh = $self->{' INFILE'};
571 842         1076 my ($result, $value);
572              
573 842 100       1498 my $update = defined($opts{update}) ? $opts{update} : 1;
574 842 100       1758 $str = update($fh, $str) if $update;
575              
576 842         2271 $str =~ s/^$ws_char+//; # Ignore initial white space
577 842         1981 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
578              
579             # Dictionary
580 842 100       8916 if ($str =~ m/^<
    100          
    100          
    100          
    100          
    50          
    100          
    50          
    50          
    0          
581 146         333 $str = substr ($str, 2);
582 146 100       388 $str = update($fh, $str) if $update;
583 146         420 $result = PDFDict();
584              
585 146         362 while ($str !~ m/^>>/) {
586 398         1190 $str =~ s/^$ws_char+//; # Ignore initial white space
587 398         1031 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
588              
589 398 50       1865 if ($str =~ s|^/($reg_char+)||) {
    0          
    0          
590 398         973 my $key = PDF::API2::Basic::PDF::Name::name_to_string($1, $self);
591 398         1380 ($value, $str) = $self->readval($str, %opts);
592 398 50 50     1150 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
593 398         859 $result->{$key} = $value;
594             }
595             }
596             elsif ($str =~ s|^/$ws_char+||) {
597             # fixes a broken key problem of acrobat. -- fredo
598 0         0 ($value, $str) = $self->readval($str, %opts);
599 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
600 0         0 $result->{'null'} = $value;
601             }
602             }
603             elsif ($str =~ s|^//|/|) {
604             # fixes again a broken key problem of illustrator/enfocus. -- fredo
605 0         0 ($value, $str) = $self->readval($str, %opts);
606 0 0 0     0 unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
607 0         0 $result->{'null'} = $value;
608             }
609             }
610             else {
611 0         0 die "Invalid dictionary key";
612             }
613 398 100       939 $str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk
614             }
615 146         462 $str =~ s/^>>//;
616 146 100       366 $str = update($fh, $str) if $update;
617             # streams can't be followed by a lone carriage-return.
618             # fredo: yes they can !!! -- use the MacOS Luke.
619 146 100 66     461 if (($str =~ s/^stream(?:(?:\015\012)|\012|\015)//) and ($result->{'Length'}->val != 0)) { # stream
620 10         32 my $length = $result->{'Length'}->val() + 0;
621 10         31 $result->{' streamsrc'} = $fh;
622 10         55 $result->{' streamloc'} = $fh->tell - length($str);
623              
624 10 50       115 unless ($opts{'nostreams'}) {
625 10 50       31 if ($length > length($str)) {
626 0         0 $value = $str;
627 0         0 $length -= length($str);
628 0         0 read $fh, $str, $length + 11; # slurp the whole stream!
629             }
630             else {
631 10         20 $value = '';
632             }
633 10         43 $value .= substr($str, 0, $length);
634 10         32 $result->{' stream'} = $value;
635 10         20 $result->{' nofilt'} = 1;
636 10 50       42 $str = update($fh, $str, 1) if $update; # tell update we are in-stream and only need an endstream
637 10         45 $str = substr($str, index($str, 'endstream') + 9);
638             }
639             }
640              
641 146 100 100     545 if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val}) {
642 34         117 bless $result, $types{$result->{'Type'}->val};
643             }
644             # gdj: FIXME: if any of the ws chars were crs, then the whole
645             # string might not have been read.
646             }
647              
648             # Indirect Object
649             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R/s) {
650 152         413 my $num = $1 + 0;
651 152         228 $value = $2 + 0;
652 152         1390 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+R//s;
653 152 100       419 unless ($result = $self->test_obj($num, $value)) {
654 119         461 $result = PDF::API2::Basic::PDF::Objind->new();
655 119         324 $result->{' objnum'} = $num;
656 119         227 $result->{' objgen'} = $value;
657 119         247 $self->add_obj($result, $num, $value);
658             }
659 152         340 $result->{' parent'} = $self;
660 152         516 weaken $result->{' parent'};
661              
662             # Removed to address changes being lost when an indirect object is realised twice
663             # $result->{' realised'} = 0;
664              
665             # gdj: FIXME: if any of the ws chars were crs, then the whole
666             # string might not have been read.
667             }
668              
669             # Object
670             elsif ($str =~ m/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj/s) {
671 95         178 my $obj;
672 95         255 my $num = $1 + 0;
673 95         157 $value = $2 + 0;
674 95         986 $str =~ s/^([0-9]+)(?:$ws_char|$re_comment)+([0-9]+)(?:$ws_char|$re_comment)+obj//s;
675 95         440 ($obj, $str) = $self->readval($str, %opts);
676 95 100       254 if ($result = $self->test_obj($num, $value)) {
677 81         243 $result->merge($obj);
678             }
679             else {
680 14         20 $result = $obj;
681 14         56 $self->add_obj($result, $num, $value);
682 14         25 $result->{' realised'} = 1;
683             }
684 95 100       276 $str = update($fh, $str) if $update; # thanks to kundrat@kundrat.sk
685 95         531 $str =~ s/^endobj//;
686             }
687              
688             # Name
689             elsif ($str =~ m|^/($reg_char*)|s) {
690 287         663 $value = $1;
691 287         1308 $str =~ s|^/($reg_char*)||s;
692 287         969 $result = PDF::API2::Basic::PDF::Name->from_pdf($value, $self);
693             }
694              
695             # Literal String
696             elsif ($str =~ m/^\(/) {
697             # We now need to find an unbalanced, unescaped right-paren.
698             # This can't be done with a regex.
699 2         8 my $value = '(';
700 2         8 $str = substr($str, 1);
701              
702 2         4 my $nested_level = 1;
703 2         4 while (1) {
704             # Ignore everything up to the first escaped or parenthesis character
705 2 50       11 if ($str =~ /^([^\\()]+)(.*)/s) {
706 2         20 $value .= $1;
707 2         21 $str = $2;
708             }
709              
710             # Ignore escaped parentheses
711 2 50       19 if ($str =~ /^(\\[()])/) {
    50          
    50          
    0          
712 0         0 $value .= $1;
713 0         0 $str = substr($str, 2);
714             }
715              
716             # Left parenthesis: increase nesting
717             elsif ($str =~ /^\(/) {
718 0         0 $value .= '(';
719 0         0 $str = substr($str, 1);
720 0         0 $nested_level++;
721             }
722              
723             # Right parenthesis: decrease nesting
724             elsif ($str =~ /^\)/) {
725 2         6 $value .= ')';
726 2         6 $str = substr($str, 1);
727 2         3 $nested_level--;
728 2 50       9 last unless $nested_level;
729             }
730              
731             # Other escaped character
732             elsif ($str =~ /^(\\[^()])/) {
733 0         0 $value .= $1;
734 0         0 $str = substr($str, 2);
735             }
736              
737             # If there wasn't an escaped or parenthesis character,
738             # read some more.
739             else {
740             # We don't use update because we don't want to remove
741             # whitespace or comments.
742 0 0       0 $fh->read($str, 255, length($str)) or die 'Unterminated string.';
743             }
744             }
745              
746 2         23 $result = PDF::API2::Basic::PDF::String->from_pdf($value);
747             }
748              
749             # Hex String
750             elsif ($str =~ m/^
751 0         0 $str =~ s/^
752 0         0 $fh->read($str, 255, length($str)) while (0 > index($str, '>'));
753 0         0 ($value, $str) = ($str =~ /^(.*?)>(.*)/s);
754 0         0 $result = PDF::API2::Basic::PDF::String->from_pdf('<' . $value . '>');
755             }
756              
757             # Array
758             elsif ($str =~ m/^\[/) {
759 74         268 $str =~ s/^\[//;
760 74 50       233 $str = update($fh, $str) if $update;
761 74         232 $result = PDFArray();
762 74         199 while ($str !~ m/^\]/) {
763 224         821 $str =~ s/^$ws_char+//; # Ignore initial white space
764 224         712 $str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
765              
766 224         675 ($value, $str) = $self->readval($str, %opts);
767 224         660 $result->add_elements($value);
768 224 50       478 $str = update($fh, $str) if $update; # str might just be exhausted!
769             }
770 74         225 $str =~ s/^\]//;
771             }
772              
773             # Boolean
774             elsif ($str =~ m/^(true|false)($irreg_char|$)/) {
775 0         0 $value = $1;
776 0         0 $str =~ s/^(?:true|false)//;
777 0         0 $result = PDF::API2::Basic::PDF::Bool->from_pdf($value);
778             }
779              
780             # Number
781             elsif ($str =~ m/^([+-.0-9]+)($irreg_char|$)/) {
782 86         228 $value = $1;
783 86         312 $str =~ s/^([+-.0-9]+)//;
784              
785             # If $str only consists of whitespace (or is empty), call update to
786             # see if this is the beginning of an indirect object or reference
787 86 100 100     1491 if ($update and ($str =~ /^$re_whitespace*$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace*$/s)) {
      100        
788 6         44 $str =~ s/^$re_whitespace+/ /s;
789 6         35 $str =~ s/$re_whitespace+$/ /s;
790 6         13 $str = update($fh, $str);
791 6 100       104 if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R|obj)/s) {
792 4         14 return $self->readval("$value $str", %opts);
793             }
794             }
795              
796 82         455 $result = PDF::API2::Basic::PDF::Number->from_pdf($value);
797             }
798              
799             # Null
800             elsif ($str =~ m/^null($irreg_char|$)/) {
801 0         0 $str =~ s/^null//;
802 0         0 $result = PDF::API2::Basic::PDF::Null->new;
803             }
804              
805             else {
806 0         0 die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . ".";
807             }
808              
809 838         3504 $str =~ s/^$ws_char+//s;
810 838         2657 return ($result, $str);
811             }
812              
813              
814             =head2 $ref = $p->read_obj($objind, %opts)
815              
816             Given an indirect object reference, locate it and read the object returning
817             the read in object.
818              
819             =cut
820              
821             sub read_obj {
822 77     77 1 159 my ($self, $objind, %opts) = @_;
823              
824 77   50     279 my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) || return;
825 77 50       247 $objind->merge($res) unless $objind eq $res;
826 77         253 return $objind;
827             }
828              
829              
830             =head2 $ref = $p->read_objnum($num, $gen, %opts)
831              
832             Returns a fully read object of given number and generation in this file
833              
834             =cut
835              
836             sub read_objnum {
837 85     85 1 2450 my ($self, $num, $gen, %opts) = @_;
838 85 50       179 croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num;
839 85 50       176 croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen;
840 85 50       412 croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/;
841 85 50       312 croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/;
842              
843 85   50     219 my $object_location = $self->locate_obj($num, $gen) || return;
844 85         146 my $object;
845              
846             # Compressed object
847 85 100       180 if (ref($object_location)) {
848 4         6 my ($object_stream_num, $object_stream_pos) = @{$object_location};
  4         11  
849              
850 4         33 my $object_stream = $self->read_objnum($object_stream_num, 0, %opts);
851 4 50       16 die 'Cannot find the compressed object stream' unless $object_stream;
852              
853 4 50       24 $object_stream->read_stream() if $object_stream->{' nofilt'};
854              
855             # An object stream starts with pairs of integers containing object numbers and
856             # stream offsets relative to the First key
857 4         8 my $fh;
858             my $pairs;
859 4 50       10 unless ($object_stream->{' streamfile'}) {
860 4         15 $pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val);
861             }
862             else {
863 0         0 CORE::open($fh, '<', $object_stream->{' streamfile'});
864 0         0 read($fh, $pairs, $object_stream->{'First'}->val());
865             }
866 4         32 my @map = split /\s+/, $pairs;
867              
868             # Find the offset of the object in the stream
869 4         10 my $index = $object_stream_pos * 2;
870 4 50       24 die "Objind $num does not exist at index $index" unless $map[$index] == $num;
871 4         11 my $start = $map[$index + 1];
872              
873             # Unless this is the last object in the stream, its length is determined by the
874             # offset of the next object
875 4         9 my $last_object_in_stream = $map[-2];
876 4         8 my $length;
877 4 100       12 if ($last_object_in_stream == $num) {
878 2 50       17 if ($object_stream->{' stream'}) {
879 2         12 $length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start;
880             }
881             else {
882 0         0 $length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start;
883             }
884             }
885             else {
886 2         8 my $next_start = $map[$index + 3];
887 2         6 $length = $next_start - $start;
888             }
889              
890             # Read the object from the stream
891 4         12 my $stream = "$num 0 obj ";
892 4 50       12 unless ($object_stream->{' streamfile'}) {
893 4         17 $stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length);
894             }
895             else {
896 0         0 seek($fh, $object_stream->{'First'}->val() + $start, 0);
897 0         0 read($fh, $stream, $length, length($stream));
898 0         0 close $fh;
899             }
900              
901 4         17 ($object) = $self->readval($stream, %opts, update => 0);
902 4         20 return $object;
903             }
904              
905 81         342 my $current_location = $self->{' INFILE'}->tell;
906 81         554 $self->{' INFILE'}->seek($object_location, 0);
907 81         782 ($object) = $self->readval('', %opts);
908 81         409 $self->{' INFILE'}->seek($current_location, 0);
909 81         1023 return $object;
910             }
911              
912              
913             =head2 $objind = $p->new_obj($obj)
914              
915             Creates a new, free object reference based on free space in the cross reference chain.
916             If nothing free then thinks up a new number. If $obj then turns that object into this
917             new object rather than returning a new object.
918              
919             =cut
920              
921             sub new_obj {
922 1072     1072 1 1993 my ($self, $base) = @_;
923 1072         1540 my $res;
924              
925 1072 50 66     2886 if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) {
  15         59  
926 0         0 $res = shift(@{$self->{' free'}});
  0         0  
927 0 0       0 if (defined $base) {
928 0         0 my ($num, $gen) = @{$self->{' objects'}{$res->uid}};
  0         0  
929 0         0 $self->remove_obj($res);
930 0         0 $self->add_obj($base, $num, $gen);
931 0         0 return $self->out_obj($base);
932             }
933             else {
934 0         0 $self->{' objects'}{$res->uid}[2] = 0;
935 0         0 return $res;
936             }
937             }
938              
939 1072         1624 my $tdict = $self;
940 1072         1435 my $i;
941 1072         2149 while (defined $tdict) {
942 1073 50       3423 $i = $tdict->{' xref'}{defined($i) ? $i : ''}[0];
943 1073   33     2645 while (defined $i and $i != 0) {
944 0         0 my ($ni, $ng) = @{$tdict->{' xref'}{$i}};
  0         0  
945 0 0       0 unless (defined $self->locate_obj($i, $ng)) {
946 0 0       0 if (defined $base) {
947 0         0 $self->add_obj($base, $i, $ng);
948 0         0 return $base;
949             }
950             else {
951 0   0     0 $res = $self->test_obj($i, $ng) || $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, $ng);
952 0         0 $self->out_obj($res);
953 0         0 return $res;
954             }
955             }
956 0         0 $i = $ni;
957             }
958 1073         2206 $tdict = $tdict->{' prev'};
959             }
960              
961 1072         1761 $i = $self->{' maxobj'}++;
962 1072 50       1943 if (defined $base) {
963 1072         2851 $self->add_obj($base, $i, 0);
964 1072         2672 $self->out_obj($base);
965 1072         2204 return $base;
966             }
967             else {
968 0         0 $res = $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, 0);
969 0         0 $self->out_obj($res);
970 0         0 return $res;
971             }
972             }
973              
974              
975             =head2 $p->out_obj($objind)
976              
977             Indicates that the given object reference should appear in the output xref
978             table whether with data or freed.
979              
980             =cut
981              
982             sub out_obj {
983 2676     2676 1 4470 my ($self, $obj) = @_;
984              
985             # This is why we've been keeping the outlist CACHE around; to speed
986             # up this method by orders of magnitude (it saves up from having to
987             # grep the full outlist each time through as we'll just do a lookup
988             # in the hash) (which is super-fast).
989 2676 100       6686 unless (exists $self->{' outlist_cache'}{$obj}) {
990 1087         1509 push @{$self->{' outlist'}}, $obj;
  1087         2296  
991             # weaken $self->{' outlist'}->[-1];
992 1087         3277 $self->{' outlist_cache'}{$obj} = 1;
993             }
994 2676         4624 return $obj;
995             }
996              
997              
998             =head2 $p->free_obj($objind)
999              
1000             Marks an object reference for output as being freed.
1001              
1002             =cut
1003              
1004             sub free_obj {
1005 0     0 1 0 my ($self, $obj) = @_;
1006              
1007 0         0 push @{$self->{' free'}}, $obj;
  0         0  
1008 0         0 $self->{' objects'}{$obj->uid()}[2] = 1;
1009 0         0 $self->out_obj($obj);
1010             }
1011              
1012              
1013             =head2 $p->remove_obj($objind)
1014              
1015             Removes the object from all places where we might remember it
1016              
1017             =cut
1018              
1019             sub remove_obj {
1020 0     0 1 0 my ($self, $objind) = @_;
1021              
1022             # who says it has to be fast
1023 0         0 delete $self->{' objects'}{$objind->uid()};
1024 0         0 delete $self->{' outlist_cache'}{$objind};
1025 0         0 delete $self->{' printed_cache'}{$objind};
1026 0         0 @{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}};
  0         0  
  0         0  
  0         0  
1027 0         0 @{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}};
  0         0  
  0         0  
  0         0  
1028             $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef
1029 0 0       0 if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind;
1030 0         0 return $self;
1031             }
1032              
1033              
1034             =head2 $p->ship_out(@objects)
1035              
1036             Ships the given objects (or all objects for output if @objects is empty) to
1037             the currently open output file (assuming there is one). Freed objects are not
1038             shipped, and once an object is shipped it is switched such that this file
1039             becomes its source and it will not be shipped again unless out_obj is called
1040             again. Notice that a shipped out object can be re-output or even freed, but
1041             that it will not cause the data already output to be changed.
1042              
1043             =cut
1044              
1045             sub ship_out {
1046 154     154 1 300 my ($self, @objs) = @_;
1047              
1048 154 50       376 die "No output file specified" unless defined $self->{' OUTFILE'};
1049 154         257 my $fh = $self->{' OUTFILE'};
1050 154         357 seek($fh, 0, 2); # go to the end of the file
1051              
1052 154 50       383 @objs = @{$self->{' outlist'}} unless scalar @objs > 0;
  154         397  
1053 154         352 foreach my $objind (@objs) {
1054 890 50       2449 next unless $objind->is_obj($self);
1055 890         1512 my $j = -1;
1056 890         1481 for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) {
  890         2014  
1057 890 50       2380 if ($self->{' outlist'}[$i] eq $objind) {
1058 890         1230 $j = $i;
1059 890         1516 last;
1060             }
1061             }
1062 890 50       1791 next if $j < 0;
1063 890         1182 splice(@{$self->{' outlist'}}, $j, 1);
  890         1660  
1064 890         2236 delete $self->{' outlist_cache'}{$objind};
1065 890 50       1207 next if grep { $_ eq $objind } @{$self->{' free'}};
  0         0  
  890         2149  
1066              
1067 890 50       1729 map { $fh->print("\% $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'};
  0         0  
1068 890         2465 $self->{' locs'}{$objind->uid()} = $fh->tell();
1069 890         1543 my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1];
  890         1789  
1070 890         2530 $fh->printf('%d %d obj ', $objnum, $objgen);
1071 890         8845 $objind->outobjdeep($fh, $self);
1072 890         3301 $fh->print(" endobj\n");
1073              
1074             # Note that we've output this obj, not forgetting to update
1075             # the cache of whats printed.
1076 890 50       5771 unless (exists $self->{' printed_cache'}{$objind}) {
1077 890         1234 push @{$self->{' printed'}}, $objind;
  890         1890  
1078 890         2936 $self->{' printed_cache'}{$objind}++;
1079             }
1080             }
1081 154         411 return $self;
1082             }
1083              
1084             =head2 $p->copy($outpdf, \&filter)
1085              
1086             Iterates over every object in the file reading the object, calling filter with the object
1087             and outputting the result. if filter is not defined, then just copies input to output.
1088              
1089             =cut
1090              
1091             sub copy {
1092 0     0 1 0 my ($self, $out, $filter) = @_;
1093 0         0 my ($obj, $minl, $mini, $ming);
1094              
1095 0         0 foreach my $key (grep { not m/^[\s\-]/ } keys %$self) {
  0         0  
1096 0 0       0 $out->{$key} = $self->{$key} unless defined $out->{$key};
1097             }
1098              
1099 0         0 my $tdict = $self;
1100 0         0 while (defined $tdict) {
1101 0         0 foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) {
  0         0  
  0         0  
1102 0         0 my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}};
  0         0  
1103 0 0       0 next unless $nt eq 'n';
1104              
1105 0 0 0     0 if ($nl < $minl or $mini == 0) {
1106 0         0 $mini = $i;
1107 0         0 $ming = $ng;
1108 0         0 $minl = $nl;
1109             }
1110 0 0       0 unless ($obj = $self->test_obj($i, $ng)) {
1111 0         0 $obj = PDF::API2::Basic::PDF::Objind->new();
1112 0         0 $obj->{' objnum'} = $i;
1113 0         0 $obj->{' objgen'} = $ng;
1114 0         0 $self->add_obj($obj, $i, $ng);
1115 0         0 $obj->{' parent'} = $self;
1116 0         0 weaken $obj->{' parent'};
1117 0         0 $obj->{' realised'} = 0;
1118             }
1119 0         0 $obj->realise;
1120 0 0       0 my $res = defined $filter ? &{$filter}($obj) : $obj;
  0         0  
1121 0 0 0     0 $out->new_obj($res) unless (!$res || $res->is_obj($out));
1122             }
1123 0         0 $tdict = $tdict->{' prev'};
1124             }
1125              
1126             # test for linearized and remove it from output
1127 0         0 $obj = $self->test_obj($mini, $ming);
1128 0 0 0     0 if ($obj->isa('PDF::API2::Basic::PDF::Dict') && $obj->{'Linearized'}) {
1129 0         0 $out->free_obj($obj);
1130             }
1131              
1132 0         0 return $self;
1133             }
1134              
1135              
1136             =head1 PRIVATE METHODS & FUNCTIONS
1137              
1138             The following methods and functions are considered private to this class. This
1139             does not mean you cannot use them if you have a need, just that they aren't really
1140             designed for users of this class.
1141              
1142             =head2 $offset = $p->locate_obj($num, $gen)
1143              
1144             Returns a file offset to the object asked for by following the chain of cross
1145             reference tables until it finds the one you want.
1146              
1147             =cut
1148              
1149             sub locate_obj {
1150 85     85 1 171 my ($self, $num, $gen) = @_;
1151              
1152 85         144 my $tdict = $self;
1153 85         184 while (defined $tdict) {
1154 94 100       279 if (ref $tdict->{' xref'}{$num}) {
1155 85         152 my $ref = $tdict->{' xref'}{$num};
1156 85 100       196 return $ref unless scalar(@$ref) == 3;
1157              
1158 81 50       199 if ($ref->[1] == $gen) {
1159 81 50       376 return $ref->[0] if $ref->[2] eq 'n';
1160 0         0 return; # if $ref->[2] eq 'f';
1161             }
1162             }
1163 9         30 $tdict = $tdict->{' prev'};
1164             }
1165 0         0 return;
1166             }
1167              
1168              
1169             =head2 update($fh, $str, $instream)
1170              
1171             Keeps reading $fh for more data to ensure that $str has at least a line full
1172             for C to work on. At this point we also take the opportunity to ignore
1173             comments.
1174              
1175             =cut
1176              
1177             sub update {
1178 1940     1940 1 3140 my ($fh, $str, $instream) = @_;
1179 1940 50       3128 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1180 1940 100       2900 if ($instream) {
1181             # we are inside a (possible binary) stream
1182             # so we fetch data till we see an 'endstream'
1183             # -- fredo/2004-09-03
1184 10   33     47 while ($str !~ m/endstream/ and not $fh->eof()) {
1185 0 0       0 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1186 0         0 $fh->read($str, 314, length($str));
1187             }
1188             }
1189             else {
1190 1930         7438 $str =~ s/^$ws_char*//;
1191 1930   100     13968 while ($str !~ m/$cr/ and not $fh->eof()) {
1192 107 50       1234 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1193 107         448 $fh->read($str, 314, length($str));
1194 107         2392 $str =~ s/^$ws_char*//so;
1195             }
1196 1930         4482 while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23
1197 1 50       3 print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1198 1   33     24 $fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof());
1199 1         19 $str =~ s/^\%[^\015\012]*$ws_char*//so; # fixed for reportlab -- fredo
1200             }
1201             }
1202              
1203 1940         4041 return $str;
1204             }
1205              
1206             =head2 $objind = $p->test_obj($num, $gen)
1207              
1208             Tests the cache to see whether an object reference (which may or may not have
1209             been getobj()ed) has been cached. Returns it if it has.
1210              
1211             =cut
1212              
1213             sub test_obj {
1214 247     247 1 468 my ($self, $num, $gen) = @_;
1215 247         910 return $self->{' objcache'}{$num, $gen};
1216             }
1217              
1218              
1219             =head2 $p->add_obj($objind)
1220              
1221             Adds the given object to the internal object cache.
1222              
1223             =cut
1224              
1225             sub add_obj {
1226 1205     1205 1 2330 my ($self, $obj, $num, $gen) = @_;
1227              
1228 1205         3884 $self->{' objcache'}{$num, $gen} = $obj;
1229 1205         5208 $self->{' objects'}{$obj->uid()} = [$num, $gen];
1230             # weaken $self->{' objcache'}{$num, $gen};
1231 1205         2378 return $obj;
1232             }
1233              
1234              
1235             =head2 $tdict = $p->readxrtr($xpos)
1236              
1237             Recursive function which reads each of the cross-reference and trailer tables
1238             in turn until there are no more.
1239              
1240             Returns a dictionary corresponding to the trailer chain. Each trailer also
1241             includes the corresponding cross-reference table.
1242              
1243             The structure of the xref private element in a trailer dictionary is of an
1244             anonymous hash of cross reference elements by object number. Each element
1245             consists of an array of 3 elements corresponding to the three elements read
1246             in [location, generation number, free or used]. See the PDF specification
1247             for details.
1248              
1249             =cut
1250              
1251             sub _unpack_xref_stream {
1252 78     78   129 my ($self, $width, $data) = @_;
1253              
1254 78 100       157 return unpack('C', $data) if $width == 1;
1255 52 50       123 return unpack('n', $data) if $width == 2;
1256 0 0       0 return unpack('N', "\x00$data") if $width == 3;
1257 0 0       0 return unpack('N', $data) if $width == 4;
1258 0 0       0 return unpack('Q>', $data) if $width == 8;
1259              
1260 0         0 die "Unsupported xref stream entry width: $width";
1261             }
1262              
1263             sub readxrtr {
1264 19     19 1 58 my ($self, $xpos) = @_;
1265 19         43 my ($tdict, $buf, $xmin, $xnum, $xdiff);
1266              
1267 19         55 my $fh = $self->{' INFILE'};
1268 19         394 $fh->seek($xpos, 0);
1269 19         230 $fh->read($buf, 22);
1270 19         244 $buf = update($fh, $buf); # fix for broken JAWS xref calculation.
1271              
1272 19         42 my $xlist = {};
1273              
1274             ## seams that some products calculate wrong prev entries (short)
1275             ## so we seek ahead to find one -- fredo; save for now
1276             #while($buf !~ m/^xref$cr/i && !eof($fh))
1277             #{
1278             # $buf =~ s/^(\s+|\S+|.)//i;
1279             # $buf=update($fh,$buf);
1280             #}
1281              
1282 19 100       284 if ($buf =~ s/^xref$cr//i) {
    50          
1283             # Plain XRef tables.
1284 16         491 while ($buf =~ m/^$ws_char*([0-9]+)$ws_char+([0-9]+)$ws_char*$cr(.*?)$/s) {
1285 21         84 my $old_buf = $buf;
1286 21         52 $xmin = $1;
1287 21         44 $xnum = $2;
1288 21         48 $buf = $3;
1289 21 50       311 unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) {
1290             # See PDF 1.7 section 7.5.4: Cross-Reference Table
1291 0         0 warn q{Malformed xref in PDF file: subsection shall begin with a line containing two numbers separated by a SPACE (20h)};
1292             }
1293 21         49 $xdiff = length($buf);
1294              
1295 21         105 $fh->read($buf, 20 * $xnum - $xdiff + 15, $xdiff);
1296 21   66     649 while ($xnum-- > 0 and $buf =~ s/^0*([0-9]*)$ws_char+0*([0-9]+)$ws_char+([nf])$cr//) {
1297 118 50       574 $xlist->{$xmin} = [$1, $2, $3] unless exists $xlist->{$xmin};
1298 118         886 $xmin++;
1299             }
1300             }
1301              
1302 16 50       89 if ($buf !~ /^\s*trailer\b/i) {
1303 0         0 die "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell - length($buf));
1304             }
1305              
1306 16         55 $buf =~ s/^\s*trailer\b//i;
1307              
1308 16         73 ($tdict, $buf) = $self->readval($buf);
1309             }
1310             elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) {
1311 3         15 my ($xref_obj, $xref_gen) = ($1, $2);
1312              
1313             # XRef streams.
1314 3         14 ($tdict, $buf) = $self->readval($buf);
1315              
1316 3 50       13 unless ($tdict->{' stream'}) {
1317 0         0 die "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}";
1318             }
1319 3         17 $tdict->read_stream(1);
1320              
1321 3         10 my $stream = $tdict->{' stream'};
1322 3         6 my @widths = map { $_->val } @{$tdict->{W}->val};
  9         21  
  3         14  
1323              
1324 3         7 my $start = 0;
1325 3         6 my $last;
1326              
1327             my @index;
1328 3 100       10 if (defined $tdict->{Index}) {
1329 1         2 @index = map { $_->val() } @{$tdict->{Index}->val};
  2         5  
  1         4  
1330             }
1331             else {
1332 2         8 @index = (0, $tdict->{Size}->val);
1333             }
1334              
1335 3         10 while (scalar @index) {
1336 3         7 $start = shift(@index);
1337 3         9 $last = $start + shift(@index) - 1;
1338              
1339 3         12 for my $i ($start...$last) {
1340             # Replaced "for $xmin" because it creates a loop-specific local variable, and we
1341             # need $xmin to be correct for maxobj below.
1342 26         39 $xmin = $i;
1343              
1344 26         30 my @cols;
1345              
1346 26         43 for my $w (@widths) {
1347 78         93 my $data;
1348 78 50       208 $data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w;
1349              
1350 78         153 push @cols, $data;
1351             }
1352              
1353 26 100       52 $cols[0] = 1 unless defined $cols[0];
1354 26 50       85 if ($cols[0] > 2) {
1355 0         0 die "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj";
1356             }
1357              
1358 26 50       57 next if exists $xlist->{$xmin};
1359              
1360 26 50       61 my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535));
    100          
1361 26 100       65 push @objind, ($cols[0] == 0 ? 'f' : 'n') if $cols[0] < 2;
    100          
1362              
1363 26         75 $xlist->{$xmin} = \@objind;
1364             }
1365             }
1366             }
1367             else {
1368 0         0 die "Malformed xref in PDF file $self->{' fname'}";
1369             }
1370              
1371 19         87 $tdict->{' loc'} = $xpos;
1372 19         64 $tdict->{' xref'} = $xlist;
1373 19 100       93 $self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'};
1374             $tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val)
1375 19 100 66     81 if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val != 0);
1376 19 100       65 delete $tdict->{' prev'} unless defined $tdict->{' prev'};
1377 19         53 return $tdict;
1378             }
1379              
1380              
1381             =head2 $p->out_trailer($tdict)
1382              
1383             Outputs the body and trailer for a PDF file by outputting all the objects in
1384             the ' outlist' and then outputting a xref table for those objects and any
1385             freed ones. It then outputs the trailing dictionary and the trailer code.
1386              
1387             =cut
1388              
1389             sub out_trailer {
1390 148     148 1 504 my ($self, $tdict, $update) = @_;
1391 148         309 my $fh = $self->{' OUTFILE'};
1392              
1393 148         243 while (@{$self->{' outlist'}}) {
  302         830  
1394 154         411 $self->ship_out();
1395             }
1396              
1397             # When writing new trailers, most dictionary entries get copied from the
1398             # previous trailer, but entries related to cross-reference streams should
1399             # get removed (and possibly recreated below).
1400 148         860 delete $tdict->{$_} for (# Entries common to streams
1401             qw(Length Filter DecodeParms F FFilter FDecodeParms DL),
1402              
1403             # Entries specific to cross-reference streams
1404             qw(Index W XRefStm));
1405              
1406 148         515 $tdict->{'Size'} = PDFNum($self->{' maxobj'});
1407              
1408 148         482 my $tloc = $fh->tell();
1409 148         709 my @out;
1410              
1411 148 100       240 my @xreflist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } (@{$self->{' printed'} || []}, @{$self->{' free'} || []});
  1334 100       2646  
  148         452  
  148         651  
1412              
1413 148         361 my ($i, $j, $k);
1414 148 100       386 unless ($update) {
1415 140         219 $i = 1;
1416 140         403 for ($j = 0; $j < @xreflist; $j++) {
1417 868         1083 my @inserts;
1418 868         1153 $k = $xreflist[$j];
1419 868         1745 while ($i < $self->{' objects'}{$k->uid}[0]) {
1420 0         0 my ($n) = PDF::API2::Basic::PDF::Objind->new();
1421 0         0 $self->add_obj($n, $i, 0);
1422 0         0 $self->free_obj($n);
1423 0         0 push(@inserts, $n);
1424 0         0 $i++;
1425             }
1426 868         1403 splice(@xreflist, $j, 0, @inserts);
1427 868         1173 $j += @inserts;
1428 868         1687 $i++;
1429             }
1430             }
1431              
1432 148 100       254 my @freelist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } @{$self->{' free'} || []};
  0         0  
  148         493  
1433              
1434 148         261 $j = 0; my $first = -1; $k = 0;
  148         240  
  148         218  
1435 148         461 for ($i = 0; $i <= $#xreflist + 1; $i++) {
1436 1038 100 100     2820 if ($i > $#xreflist || $self->{' objects'}{$xreflist[$i]->uid}[0] != $j + 1) {
1437 160 100       650 push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid}[0] ") . ($i - $first) . "\n";
1438 160 100       430 if ($first == -1) {
1439 148 50       707 push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid}[0] : 0);
1440 148         276 $first = 0;
1441             }
1442 160         444 for ($j = $first; $j < $i; $j++) {
1443 890         1425 my $xref = $xreflist[$j];
1444 890 50 33     1989 if (defined($freelist[$k]) and defined($xref) and "$freelist[$k]" eq "$xref") {
      33        
1445 0         0 $k++;
1446             push @out, pack("A10AA5A4",
1447             sprintf("%010d", (defined $freelist[$k] ?
1448             $self->{' objects'}{$freelist[$k]->uid}[0] : 0)), " ",
1449 0 0       0 sprintf("%05d", $self->{' objects'}{$xref->uid}[1] + 1),
1450             " f \n");
1451             }
1452             else {
1453             push @out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid}), " ",
1454 890         2065 sprintf("%05d", $self->{' objects'}{$xref->uid}[1]),
1455             " n \n");
1456             }
1457             }
1458 160         288 $first = $i;
1459 160 100       584 $j = $self->{' objects'}{$xreflist[$i]->uid}[0] if ($i < scalar @xreflist);
1460             }
1461             else {
1462 878         1912 $j++;
1463             }
1464             }
1465 148 50 33     529 if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') {
1466 0         0 my (@index, @stream);
1467 0         0 for (@out) {
1468 0         0 my @a = split;
1469 0 0       0 @a == 2 ? push @index, @a : push @stream, \@a;
1470             }
1471 0         0 my $i = $self->{' maxobj'}++;
1472 0         0 $self->add_obj($tdict, $i, 0);
1473 0         0 $self->out_obj($tdict );
1474              
1475 0         0 push @index, $i, 1;
1476 0         0 push @stream, [$tloc, 0, 'n'];
1477              
1478 0 0       0 my $len = $tloc > 0xFFFF ? 4 : 2; # don't expect files > 4 Gb
1479 0 0       0 my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC'; # don't expect gennum > 255, it's absurd.
1480             # Adobe doesn't use them anymore anyway
1481 0         0 my $stream = '';
1482 0         0 my @prev = (0) x ($len + 2);
1483 0         0 for (@stream) {
1484 0 0 0     0 $_->[1] = 0 if $_->[2] eq 'f' and $_->[1] == 65535;
1485 0 0       0 my @line = unpack 'C*', pack $tpl, $_->[2] eq 'n' ? 1 : 0, @{$_}[0..1];
  0         0  
1486              
1487             $stream .= pack 'C*', 2, # prepend filtering method, "PNG Up"
1488 0         0 map {($line[$_] - $prev[$_] + 256) % 256 } 0 .. $#line;
  0         0  
1489 0         0 @prev = @line;
1490             }
1491 0         0 $tdict->{'Size'} = PDFNum($i + 1);
1492 0         0 $tdict->{'Index'} = PDFArray(map PDFNum( $_ ), @index);
1493 0         0 $tdict->{'W'} = PDFArray(map PDFNum( $_ ), 1, $len, 1);
1494 0         0 $tdict->{'Filter'} = PDFName('FlateDecode');
1495              
1496 0         0 $tdict->{'DecodeParms'} = PDFDict();
1497 0         0 $tdict->{'DecodeParms'}->val->{'Predictor'} = PDFNum(12);
1498 0         0 $tdict->{'DecodeParms'}->val->{'Columns'} = PDFNum($len + 2);
1499              
1500 0         0 $stream = PDF::API2::Basic::PDF::Filter::FlateDecode->new->outfilt($stream, 1);
1501 0         0 $tdict->{' stream'} = $stream;
1502 0         0 $tdict->{' nofilt'} = 1;
1503 0         0 delete $tdict->{'Length'};
1504 0         0 $self->ship_out();
1505             }
1506             else {
1507 148         579 $fh->print("xref\n", @out, "trailer\n");
1508 148         1630 $tdict->outobjdeep($fh, $self);
1509 148         376 $fh->print("\n");
1510             }
1511 148         1085 $fh->print("startxref\n$tloc\n%%EOF\n");
1512             }
1513              
1514              
1515             =head2 PDF::API2::Basic::PDF::File->_new
1516              
1517             Creates a very empty PDF file object (used by new and open)
1518              
1519             =cut
1520              
1521             sub _new {
1522 180     180   363 my $class = shift();
1523 180         341 my $self = {};
1524              
1525 180         338 bless $self, $class;
1526 180         631 $self->{' outlist'} = [];
1527 180         389 $self->{' outlist_cache'} = {}; # A cache of whats in the 'outlist'
1528 180         351 $self->{' maxobj'} = 1;
1529 180         375 $self->{' objcache'} = {};
1530 180         363 $self->{' objects'} = {};
1531              
1532 180         358 return $self;
1533             }
1534              
1535             1;
1536              
1537             =head1 AUTHOR
1538              
1539             Martin Hosken Martin_Hosken@sil.org
1540              
1541             Copyright Martin Hosken 1999 and onwards
1542              
1543             No warranty or expression of effectiveness, least of all regarding anyone's
1544             safety, is implied in this software or documentation.