File Coverage

blib/lib/PDF/API2/Basic/PDF/File.pm

Criterion	Covered	Total	%
statement	542	717	75.5
branch	187	322	58.0
condition	41	90	45.5
subroutine	45	48	93.7
pod	27	27	100.0
total	842	1204	69.9

line	stmt	bran	cond	sub	pod	time	code
1							# Code in the PDF::API2::Basic::PDF namespace was originally copied from the
2							# Text::PDF distribution.
3							#
4							# Copyright Martin Hosken
5							#
6							# Martin Hosken's code may be used under the terms of the MIT license.
7							# Subsequent versions of the code have the same license as PDF::API2.
8
9							package PDF::API2::Basic::PDF::File;
10
11	39			39		69274	use strict;
	39					108
	39					2695
12
13							our $VERSION = '2.044'; # VERSION
14
15							=head1 NAME
16
17							PDF::API2::Basic::PDF::File - Low-level PDF file access
18
19							=head1 SYNOPSIS
20
21							$p = PDF::API2::Basic::PDF::File->open("filename.pdf", 1);
22							$p->new_obj($obj_ref);
23							$p->free_obj($obj_ref);
24							$p->append_file;
25							$p->close_file;
26							$p->release; # IMPORTANT!
27
28							=head1 DESCRIPTION
29
30							This class keeps track of the directory aspects of a PDF file. There are two
31							parts to the directory: the main directory object which is the parent to all
32							other objects and a chain of cross-reference tables and corresponding trailer
33							dictionaries starting with the main directory object.
34
35							=head1 INSTANCE VARIABLES
36
37							Within this class hierarchy, rather than making everything visible via methods,
38							which would be a lot of work, there are various instance variables which are
39							accessible via associative array referencing. To distinguish instance variables
40							from content variables (which may come from the PDF content itself), each such
41							variable will start with a space.
42
43							Variables which do not start with a space directly reflect elements in a PDF
44							dictionary. In the case of a PDF::API2::Basic::PDF::File, the elements reflect those in the
45							trailer dictionary.
46
47							Since some variables are not designed for class users to access, variables are
48							marked in the documentation with (R) to indicate that such an entry should only
49							be used as read-only information. (P) indicates that the information is private
50							and not designed for user use at all, but is included in the documentation for
51							completeness and to ensure that nobody else tries to use it.
52
53							=over
54
55							=item newroot
56
57							This variable allows the user to create a new root entry to occur in the trailer
58							dictionary which is output when the file is written or appended. If you wish to
59							over-ride the root element in the dictionary you have, use this entry to indicate
60							that without losing the current Root entry. Notice that newroot should point to
61							a PDF level object and not just to a dictionary which does not have object status.
62
63							=item INFILE (R)
64
65							Contains the filehandle used to read this information into this PDF directory. Is
66							an IO object.
67
68							=item fname (R)
69
70							This is the filename which is reflected by INFILE, or the original IO object passed
71							in.
72
73							=item update (R)
74
75							This indicates that the read file has been opened for update and that at some
76							point, $p->appendfile() can be called to update the file with the changes that
77							have been made to the memory representation.
78
79							=item maxobj (R)
80
81							Contains the first usable object number above any that have already appeared
82							in the file so far.
83
84							=item outlist (P)
85
86							This is a list of Objind which are to be output when the next appendfile or outfile
87							occurs.
88
89							=item firstfree (P)
90
91							Contains the first free object in the free object list. Free objects are removed
92							from the front of the list and added to the end.
93
94							=item lastfree (P)
95
96							Contains the last free object in the free list. It may be the same as the firstfree
97							if there is only one free object.
98
99							=item objcache (P)
100
101							All objects are held in the cache to ensure that a system only has one occurrence of
102							each object. In effect, the objind class acts as a container type class to hold the
103							PDF object structure and it would be unfortunate if there were two identical
104							place-holders floating around a system.
105
106							=item epos (P)
107
108							The end location of the read-file.
109
110							=back
111
112							Each trailer dictionary contains a number of private instance variables which
113							hold the chain together.
114
115							=over
116
117							=item loc (P)
118
119							Contains the location of the start of the cross-reference table preceding the
120							trailer.
121
122							=item xref (P)
123
124							Contains an anonymous array of each cross-reference table entry.
125
126							=item prev (P)
127
128							A reference to the previous table. Note this differs from the Prev entry which
129							is in PDF which contains the location of the previous cross-reference table.
130
131							=back
132
133							=head1 METHODS
134
135							=cut
136
137	39			39		273	use Scalar::Util qw(blessed weaken);
	39					97
	39					2588
138
139	39			39		252	use vars qw($cr $irreg_char $reg_char $ws_char $delim_char %types);
	39					90
	39					6263
140
141							$ws_char = '[ \t\r\n\f\0]';
142							$delim_char = '[][<>{}()/%]';
143							$reg_char = '[^][<>{}()/% \t\r\n\f\0]';
144							$irreg_char = '[][<>{}()/% \t\r\n\f\0]';
145							$cr = '\s*(?:\015\|\012\|(?:\015\012))';
146
147							my $re_comment = qr/(?:\%[^\r\n]*)/;
148							my $re_whitespace = qr/(?:[ \t\r\n\f\0]\|$re_comment)/;
149
150							%types = (
151							'Page' => 'PDF::API2::Basic::PDF::Page',
152							'Pages' => 'PDF::API2::Basic::PDF::Pages',
153							);
154
155							my $readDebug = 0;
156
157	39			39		305	use Carp;
	39					107
	39					2477
158	39			39		822	use IO::File;
	39					8742
	39					5487
159
160							# Now for the basic PDF types
161	39			39		745	use PDF::API2::Basic::PDF::Utils;
	39					108
	39					2974
162
163	39			39		282	use PDF::API2::Basic::PDF::Array;
	39					101
	39					1109
164	39			39		245	use PDF::API2::Basic::PDF::Bool;
	39					88
	39					982
165	39			39		244	use PDF::API2::Basic::PDF::Dict;
	39					103
	39					953
166	39			39		231	use PDF::API2::Basic::PDF::Name;
	39					97
	39					1000
167	39			39		295	use PDF::API2::Basic::PDF::Number;
	39					138
	39					1023
168	39			39		213	use PDF::API2::Basic::PDF::Objind;
	39					93
	39					1175
169	39			39		235	use PDF::API2::Basic::PDF::String;
	39					92
	39					944
170	39			39		17421	use PDF::API2::Basic::PDF::Page;
	39					122
	39					1320
171	39			39		272	use PDF::API2::Basic::PDF::Pages;
	39					85
	39					782
172	39			39		198	use PDF::API2::Basic::PDF::Null;
	39					95
	39					931
173	39			39		198	use POSIX qw(ceil floor);
	39					165
	39					252
174
175	39			39		3254	no warnings qw[ deprecated recursion uninitialized ];
	39					92
	39					43240
176
177
178							=head2 PDF::API2::Basic::PDF::File->new
179
180							Creates a new, empty file object which can act as the host to other PDF objects.
181							Since there is no file associated with this object, it is assumed that the
182							object is created in readiness for creating a new PDF file.
183
184							=cut
185
186							sub new {
187	164			164	1	437	my ($class, $root) = @_;
188	164					545	my $self = $class->_new();
189
190	164	50				504	unless ($root) {
191	164					570	$root = PDFDict();
192	164					527	$root->{'Type'} = PDFName('Catalog');
193							}
194	164					602	$self->new_obj($root);
195	164					388	$self->{'Root'} = $root;
196	164					593	return $self;
197							}
198
199
200							=head2 $p = PDF::API2::Basic::PDF::File->open($filename, $update)
201
202							Opens the file and reads all the trailers and cross reference tables to build
203							a complete directory of objects.
204
205							$update specifies whether this file is being opened for updating and editing,
206							or simply to be read.
207
208							$filename may be an IO object
209
210							=cut
211
212							sub open {
213	16			16	1	59	my ($class, $filename, $update) = @_;
214	16					41	my ($fh, $buffer);
215
216	16					63	my $self = $class->_new();
217	16	100				87	if (ref $filename) {
218	8					26	$self->{' INFILE'} = $filename;
219	8	50				25	if ($update) {
220	8					21	$self->{' update'} = 1;
221	8					29	$self->{' OUTFILE'} = $filename;
222							}
223	8					18	$fh = $filename;
224							}
225							else {
226	8	50				111	die "File '$filename' does not exist" unless -f $filename;
227	8	50				121	die "File '$filename' is not readable" unless -r $filename;
228	8	50				49	if ($update) {
229	8	50				109	die "File '$filename' is not writable" unless -w $filename;
230							}
231	8		50			121	$fh = IO::File->new(($update ? '+' : '') . "<$filename")
232							\|\| die "Error opening '$filename': $!";
233	8					900	$self->{' INFILE'} = $fh;
234	8	50				33	if ($update) {
235	8					26	$self->{' update'} = 1;
236	8					33	$self->{' OUTFILE'} = $fh;
237	8					26	$self->{' fname'} = $filename;
238							}
239							}
240	16					109	binmode $fh, ':raw';
241	16					176	$fh->seek(0, 0); # go to start of file
242	16					266	$fh->read($buffer, 255);
243	16	50				1040	unless ($buffer =~ /^\%PDF\-([12]\.\d+)\s*$cr/m) {
244	0					0	croak "$filename does not appear to be a valid PDF";
245							}
246	16					104	$self->{' version'} = $1;
247
248	16					92	$fh->seek(0, 2); # go to end of file
249	16					276	my $end = $fh->tell();
250	16					126	$self->{' epos'} = $end;
251	16					66	foreach my $offset (1..64) {
252	32					185	$fh->seek($end - 16 * $offset, 0);
253	32					363	$fh->read($buffer, 16 * $offset);
254	32	100				987	last if $buffer =~ m/startxref($cr\|\s)\d+($cr\|\s)\%\%eof.*?/i;
255							}
256	16	50				465	unless ($buffer =~ m/startxref[^\d]+([0-9]+)($cr\|\s)\%\%eof.?/i) {
257	0					0	die "Malformed PDF file $filename";
258							}
259	16					74	my $xpos = $1;
260	16					53	$self->{' xref_position'} = $xpos;
261
262	16					94	my $tdict = $self->readxrtr($xpos, $self);
263	16					89	foreach my $key (keys %$tdict) {
264	121					274	$self->{$key} = $tdict->{$key};
265							}
266	16					132	return $self;
267							}
268
269							=head2 $p->version($version)
270
271							Gets/sets the PDF version (e.g. 1.4)
272
273							=cut
274
275							sub version {
276	9			9	1	14	my $self = shift();
277
278	9	100				21	if (@_) {
279	2					5	my $version = shift();
280	2	50				22	croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
281	2					6	$self->header_version($version);
282	2	50				9	if ($version >= 1.4) {
283	2					5	$self->trailer_version($version);
284							}
285							else {
286	0					0	delete $self->{'Root'}->{'Version'};
287	0					0	$self->out_obj($self->{'Root'});
288							}
289	2					4	return $version;
290							}
291
292	7					15	my $header_version = $self->header_version();
293	7					14	my $trailer_version = $self->trailer_version();
294	7	100				35	return $trailer_version if $trailer_version > $header_version;
295	3					15	return $header_version;
296							}
297
298							=head2 $version = $p->header_version($version)
299
300							Gets/sets the PDF version stored in the file header.
301
302							=cut
303
304							sub header_version {
305	14			14	1	31	my $self = shift();
306
307	14	100				29	if (@_) {
308	5					10	my $version = shift();
309	5	50				28	croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
310	5					12	$self->{' version'} = $version;
311							}
312
313	14					33	return $self->{' version'};
314							}
315
316							=head2 $version = $p->trailer_version($version)
317
318							Gets/sets the PDF version stored in the document catalog.
319
320							=cut
321
322							sub trailer_version {
323	12			12	1	27	my $self = shift();
324
325	12	100				25	if (@_) {
326	4					7	my $version = shift();
327	4	50				15	croak "Invalid version $version" unless $version =~ /^([12]\.[0-9]+)$/;
328	4					10	$self->{'Root'}->{'Version'} = PDFName($version);
329	4					15	$self->out_obj($self->{'Root'});
330	4					8	return $version;
331							}
332
333	8	50				20	return unless $self->{'Root'}->{'Version'};
334	8					33	$self->{'Root'}->{'Version'}->realise();
335	8					23	return $self->{'Root'}->{'Version'}->val();
336							}
337
338							=head2 $prev_version = $p->require_version($version)
339
340							Ensures that the PDF version is at least C<$version>.
341
342							=cut
343
344							sub require_version {
345	3			3	1	712	my ($self, $min_version) = @_;
346	3					8	my $current_version = $self->version();
347	3	100				12	$self->version($min_version) if $current_version < $min_version;
348	3					8	return $current_version;
349							}
350
351							=head2 $p->release()
352
353							Releases ALL of the memory used by the PDF document and all of its
354							component objects. After calling this method, do B expect to
355							have anything left in the C object (so if
356							you need to save, be sure to do it before calling this method).
357
358							B, that it is important that you call this method on any
359							C object when you wish to destruct it and
360							free up its memory. Internally, PDF files have an enormous number of
361							cross-references and this causes circular references within the
362							internal data structures. Calling 'C' forces a brute-force
363							cleanup of the data structures, freeing up all of the memory. Once
364							you've called this method, though, don't expect to be able to do
365							anything else with the C object; it'll
366							have B internal state whatsoever.
367
368							=cut
369
370							# Maintainer's Question: Couldn't this be handled by a DESTROY method
371							# instead of requiring an explicit call to release()?
372							sub release {
373	615			615	1	966	my $self = shift();
374
375	615	50				1284	return $self unless ref($self);
376	615					1349	my @tofree = values %$self;
377
378	615					1427	foreach my $key (keys %$self) {
379	2271					2993	$self->{$key} = undef;
380	2271					3624	delete $self->{$key};
381							}
382
383							# PDFs with highly-interconnected page trees or outlines can hit Perl's
384							# recursion limit pretty easily, so disable the warning for this specific
385							# loop.
386	39			39		338	no warnings 'recursion';
	39					122
	39					296566
387
388	615					2323	while (my $item = shift @tofree) {
389	7103	100	100			24799	if (blessed($item) and $item->can('release')) {
		100
		100
390	2115					4413	$item->release(1);
391							}
392							elsif (ref($item) eq 'ARRAY') {
393	1552					4394	push @tofree, @$item;
394							}
395							elsif (ref($item) eq 'HASH') {
396	877					2265	push @tofree, values %$item;
397	877					2182	foreach my $key (keys %$item) {
398	3822					4962	$item->{$key} = undef;
399	3822					6757	delete $item->{$key};
400							}
401							}
402							else {
403	2559					6142	$item = undef;
404							}
405							}
406							}
407
408							=head2 $p->append_file()
409
410							Appends the objects for output to the read file and then appends the appropriate table.
411
412							=cut
413
414							sub append_file {
415	3			3	1	7	my $self = shift();
416	3	50				13	return unless $self->{' update'};
417
418	3					7	my $fh = $self->{' INFILE'};
419
420	3					11	my $tdict = PDFDict();
421	3					10	$tdict->{'Prev'} = PDFNum($self->{' loc'});
422	3					8	$tdict->{'Info'} = $self->{'Info'};
423	3	50				11	if (defined $self->{' newroot'}) {
424	0					0	$tdict->{'Root'} = $self->{' newroot'};
425							}
426							else {
427	3					9	$tdict->{'Root'} = $self->{'Root'};
428							}
429	3					8	$tdict->{'Size'} = $self->{'Size'};
430
431	3					18	foreach my $key (grep { $_ !~ m/^\s/ } keys %$self) {
	54					118
432	9	50				27	$tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
433							}
434
435	3					22	$fh->seek($self->{' epos'}, 0);
436	3					32	$self->out_trailer($tdict, $self->{' update'});
437	3					43	close $self->{' OUTFILE'};
438							}
439
440
441							=head2 $p->out_file($fname)
442
443							Writes a PDF file to a file of the given filename based on the current list of
444							objects to be output. It creates the trailer dictionary based on information
445							in $self.
446
447							$fname may be an IO object;
448
449							=cut
450
451							sub out_file {
452	140			140	1	362	my ($self, $fname) = @_;
453
454	140					477	$self->create_file($fname);
455	140					462	$self->close_file();
456	140					350	return $self;
457							}
458
459
460							=head2 $p->create_file($fname)
461
462							Creates a new output file (no check is made of an existing open file) of
463							the given filename or IO object. Note, make sure that $p->{' version'} is set
464							correctly before calling this function.
465
466							=cut
467
468							sub create_file {
469	140			140	1	304	my ($self, $filename) = @_;
470	140					223	my $fh;
471
472	140					318	$self->{' fname'} = $filename;
473	140	50				360	if (ref $filename) {
474	140					235	$fh = $filename;
475							}
476							else {
477	0		0			0	$fh = IO::File->new(">$filename") \|\| die "Unable to open $filename for writing";
478	0					0	binmode($fh,':raw');
479							}
480
481	140					333	$self->{' OUTFILE'} = $fh;
482	140		50			848	$fh->print('%PDF-' . ($self->{' version'} // '1.4') . "\n");
483	140					1228	$fh->print("%\xC6\xCD\xCD\xB5\n"); # and some binary stuff in a comment
484	140					789	return $self;
485							}
486
487
488							=head2 $p->clone_file($fname)
489
490							Creates a copy of the input file at the specified filename and sets it as the
491							output file for future writes. A file handle may be passed instead of a
492							filename.
493
494							=cut
495
496							sub clone_file {
497	4			4	1	14	my ($self, $filename) = @_;
498	4					11	my $fh;
499
500	4					16	$self->{' fname'} = $filename;
501	4	50				22	if (ref $filename) {
502	4					10	$fh = $filename;
503							}
504							else {
505	0		0			0	$fh = IO::File->new(">$filename") \|\| die "Unable to open $filename for writing";
506	0					0	binmode($fh,':raw');
507							}
508
509	4					9	$self->{' OUTFILE'} = $fh;
510
511	4					11	my $in = $self->{' INFILE'};
512	4					20	$in->seek(0, 0);
513	4					62	my $data;
514	4					20	while (not $in->eof()) {
515	4					93	$in->read($data, 1024 * 1024);
516	4					136	$fh->print($data);
517							}
518	4					95	return $self;
519							}
520
521							=head2 $p->close_file
522
523							Closes up the open file for output by outputting the trailer etc.
524
525							=cut
526
527							sub close_file {
528	145			145	1	262	my $self = shift();
529
530	145					424	my $tdict = PDFDict();
531	145	50				616	$tdict->{'Info'} = $self->{'Info'} if defined $self->{'Info'};
532	145	50	33			607	$tdict->{'Root'} = (defined $self->{' newroot'} and $self->{' newroot'} ne '') ? $self->{' newroot'} : $self->{'Root'};
533
534							# remove all freed objects from the outlist, AND the outlist_cache if not updating
535							# NO! Don't do that thing! In fact, let out_trailer do the opposite!
536
537	145		66			538	$tdict->{'Size'} = $self->{'Size'} \|\| PDFNum(1);
538	145	100				441	$tdict->{'Prev'} = PDFNum($self->{' loc'}) if $self->{' loc'};
539	145	100				433	if ($self->{' update'}) {
540	5					32	foreach my $key (grep { $_ !~ m/^[\s\-]/ } keys %$self) {
	97					219
541	16	50				40	$tdict->{$key} = $self->{$key} unless defined $tdict->{$key};
542							}
543
544	5					16	my $fh = $self->{' INFILE'};
545	5					23	$fh->seek($self->{' epos'}, 0);
546							}
547
548	145					849	$self->out_trailer($tdict, $self->{' update'});
549	145					1497	close($self->{' OUTFILE'});
550	145	50	33			668	if ($^O eq 'MacOS' and not ref($self->{' fname'})) {
551	0					0	MacPerl::SetFileInfo('CARO', 'TEXT', $self->{' fname'});
552							}
553
554	145					711	return $self;
555							}
556
557							=head2 ($value, $str) = $p->readval($str, %opts)
558
559							Reads a PDF value from the current position in the file. If $str is too short
560							then read some more from the current location in the file until the whole object
561							is read. This is a recursive call which may slurp in a whole big stream (unprocessed).
562
563							Returns the recursive data structure read and also the current $str that has been
564							read from the file.
565
566							=cut
567
568							sub readval {
569	842			842	1	14532	my ($self, $str, %opts) = @_;
570	842					1392	my $fh = $self->{' INFILE'};
571	842					1166	my ($result, $value);
572
573	842	100				1673	my $update = defined($opts{update}) ? $opts{update} : 1;
574	842	100				1886	$str = update($fh, $str) if $update;
575
576	842					2537	$str =~ s/^$ws_char+//; # Ignore initial white space
577	842					2200	$str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
578
579							# Dictionary
580	842	100				9951	if ($str =~ m/^<
		100
		100
		100
		100
		50
		100
		50
		50
		0
581	146					400	$str = substr ($str, 2);
582	146	100				390	$str = update($fh, $str) if $update;
583	146					478	$result = PDFDict();
584
585	146					415	while ($str !~ m/^>>/) {
586	398					1321	$str =~ s/^$ws_char+//; # Ignore initial white space
587	398					1140	$str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
588
589	398	50				2136	if ($str =~ s\|^/($reg_char+)\|\|) {
		0
		0
590	398					1116	my $key = PDF::API2::Basic::PDF::Name::name_to_string($1, $self);
591	398					1400	($value, $str) = $self->readval($str, %opts);
592	398	50	50			1274	unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
593	398					1027	$result->{$key} = $value;
594							}
595							}
596							elsif ($str =~ s\|^/$ws_char+\|\|) {
597							# fixes a broken key problem of acrobat. -- fredo
598	0					0	($value, $str) = $self->readval($str, %opts);
599	0	0	0			0	unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
600	0					0	$result->{'null'} = $value;
601							}
602							}
603							elsif ($str =~ s\|^//\|/\|) {
604							# fixes again a broken key problem of illustrator/enfocus. -- fredo
605	0					0	($value, $str) = $self->readval($str, %opts);
606	0	0	0			0	unless ((ref($value) // '') eq 'PDF::API2::Basic::PDF::Null') {
607	0					0	$result->{'null'} = $value;
608							}
609							}
610							else {
611	0					0	die "Invalid dictionary key";
612							}
613	398	100				1058	$str = update($fh, $str) if $update; # thanks gareth.jones@stud.man.ac.uk
614							}
615	146					485	$str =~ s/^>>//;
616	146	100				400	$str = update($fh, $str) if $update;
617							# streams can't be followed by a lone carriage-return.
618							# fredo: yes they can !!! -- use the MacOS Luke.
619	146	100	66			482	if (($str =~ s/^stream(?:(?:\015\012)\|\012\|\015)//) and ($result->{'Length'}->val != 0)) { # stream
620	10					38	my $length = $result->{'Length'}->val;
621	10					30	$result->{' streamsrc'} = $fh;
622	10					40	$result->{' streamloc'} = $fh->tell - length($str);
623
624	10	50				127	unless ($opts{'nostreams'}) {
625	10	50				38	if ($length > length($str)) {
626	0					0	$value = $str;
627	0					0	$length -= length($str);
628	0					0	read $fh, $str, $length + 11; # slurp the whole stream!
629							}
630							else {
631	10					21	$value = '';
632							}
633	10					38	$value .= substr($str, 0, $length);
634	10					28	$result->{' stream'} = $value;
635	10					37	$result->{' nofilt'} = 1;
636	10	50				36	$str = update($fh, $str, 1) if $update; # tell update we are in-stream and only need an endstream
637	10					52	$str = substr($str, index($str, 'endstream') + 9);
638							}
639							}
640
641	146	100	100			580	if (defined $result->{'Type'} and defined $types{$result->{'Type'}->val}) {
642	34					121	bless $result, $types{$result->{'Type'}->val};
643							}
644							# gdj: FIXME: if any of the ws chars were crs, then the whole
645							# string might not have been read.
646							}
647
648							# Indirect Object
649							elsif ($str =~ m/^([0-9]+)(?:$ws_char\|$re_comment)+([0-9]+)(?:$ws_char\|$re_comment)+R/s) {
650	152					401	my $num = $1;
651	152					272	$value = $2;
652	152					1434	$str =~ s/^([0-9]+)(?:$ws_char\|$re_comment)+([0-9]+)(?:$ws_char\|$re_comment)+R//s;
653	152	100				453	unless ($result = $self->test_obj($num, $value)) {
654	119					476	$result = PDF::API2::Basic::PDF::Objind->new();
655	119					381	$result->{' objnum'} = $num;
656	119					285	$result->{' objgen'} = $value;
657	119					266	$self->add_obj($result, $num, $value);
658							}
659	152					324	$result->{' parent'} = $self;
660	152					491	weaken $result->{' parent'};
661
662							# Removed to address changes being lost when an indirect object is realised twice
663							# $result->{' realised'} = 0;
664
665							# gdj: FIXME: if any of the ws chars were crs, then the whole
666							# string might not have been read.
667							}
668
669							# Object
670							elsif ($str =~ m/^([0-9]+)(?:$ws_char\|$re_comment)+([0-9]+)(?:$ws_char\|$re_comment)+obj/s) {
671	95					187	my $obj;
672	95					264	my $num = $1;
673	95					190	$value = $2;
674	95					1065	$str =~ s/^([0-9]+)(?:$ws_char\|$re_comment)+([0-9]+)(?:$ws_char\|$re_comment)+obj//s;
675	95					397	($obj, $str) = $self->readval($str, %opts);
676	95	100				269	if ($result = $self->test_obj($num, $value)) {
677	81					271	$result->merge($obj);
678							}
679							else {
680	14					25	$result = $obj;
681	14					35	$self->add_obj($result, $num, $value);
682	14					25	$result->{' realised'} = 1;
683							}
684	95	100				309	$str = update($fh, $str) if $update; # thanks to kundrat@kundrat.sk
685	95					630	$str =~ s/^endobj//;
686							}
687
688							# Name
689							elsif ($str =~ m\|^/($reg_char*)\|s) {
690	287					740	$value = $1;
691	287					1440	$str =~ s\|^/($reg_char*)\|\|s;
692	287					1049	$result = PDF::API2::Basic::PDF::Name->from_pdf($value, $self);
693							}
694
695							# Literal String
696							elsif ($str =~ m/^\(/) {
697							# We now need to find an unbalanced, unescaped right-paren.
698							# This can't be done with a regex.
699	2					10	my $value = '(';
700	2					7	$str = substr($str, 1);
701
702	2					6	my $nested_level = 1;
703	2					6	while (1) {
704							# Ignore everything up to the first escaped or parenthesis character
705	2	50				11	if ($str =~ /^([^\\()]+)(.*)/s) {
706	2					8	$value .= $1;
707	2					6	$str = $2;
708							}
709
710							# Ignore escaped parentheses
711	2	50				20	if ($str =~ /^(\\[()])/) {
		50
		50
		0
712	0					0	$value .= $1;
713	0					0	$str = substr($str, 2);
714							}
715
716							# Left parenthesis: increase nesting
717							elsif ($str =~ /^\(/) {
718	0					0	$value .= '(';
719	0					0	$str = substr($str, 1);
720	0					0	$nested_level++;
721							}
722
723							# Right parenthesis: decrease nesting
724							elsif ($str =~ /^\)/) {
725	2					6	$value .= ')';
726	2					6	$str = substr($str, 1);
727	2					4	$nested_level--;
728	2	50				7	last unless $nested_level;
729							}
730
731							# Other escaped character
732							elsif ($str =~ /^(\\[^()])/) {
733	0					0	$value .= $1;
734	0					0	$str = substr($str, 2);
735							}
736
737							# If there wasn't an escaped or parenthesis character,
738							# read some more.
739							else {
740							# We don't use update because we don't want to remove
741							# whitespace or comments.
742	0	0				0	$fh->read($str, 255, length($str)) or die 'Unterminated string.';
743							}
744							}
745
746	2					9	$result = PDF::API2::Basic::PDF::String->from_pdf($value);
747							}
748
749							# Hex String
750							elsif ($str =~ m/^
751	0					0	$str =~ s/^
752	0					0	$fh->read($str, 255, length($str)) while (0 > index($str, '>'));
753	0					0	($value, $str) = ($str =~ /^(.?)>(.)/s);
754	0					0	$result = PDF::API2::Basic::PDF::String->from_pdf('<' . $value . '>');
755							}
756
757							# Array
758							elsif ($str =~ m/^\[/) {
759	74					267	$str =~ s/^\[//;
760	74	50				247	$str = update($fh, $str) if $update;
761	74					263	$result = PDFArray();
762	74					250	while ($str !~ m/^\]/) {
763	224					845	$str =~ s/^$ws_char+//; # Ignore initial white space
764	224					715	$str =~ s/^\%[^\015\012]*$ws_char+//; # Ignore comments
765
766	224					682	($value, $str) = $self->readval($str, %opts);
767	224					719	$result->add_elements($value);
768	224	50				551	$str = update($fh, $str) if $update; # str might just be exhausted!
769							}
770	74					287	$str =~ s/^\]//;
771							}
772
773							# Boolean
774							elsif ($str =~ m/^(true\|false)($irreg_char\|$)/) {
775	0					0	$value = $1;
776	0					0	$str =~ s/^(?:true\|false)//;
777	0					0	$result = PDF::API2::Basic::PDF::Bool->from_pdf($value);
778							}
779
780							# Number
781							elsif ($str =~ m/^([+-.0-9]+)($irreg_char\|$)/) {
782	86					270	$value = $1;
783	86					350	$str =~ s/^([+-.0-9]+)//;
784
785							# If $str only consists of whitespace (or is empty), call update to
786							# see if this is the beginning of an indirect object or reference
787	86	100	100			1772	if ($update and ($str =~ /^$re_whitespace$/s or $str =~ /^$re_whitespace+[0-9]+$re_whitespace$/s)) {
			100
788	6					61	$str =~ s/^$re_whitespace+/ /s;
789	6					47	$str =~ s/$re_whitespace+$/ /s;
790	6					17	$str = update($fh, $str);
791	6	100				137	if ($str =~ m/^$re_whitespace*([0-9]+)$re_whitespace+(?:R\|obj)/s) {
792	4					18	return $self->readval("$value $str", %opts);
793							}
794							}
795
796	82					520	$result = PDF::API2::Basic::PDF::Number->from_pdf($value);
797							}
798
799							# Null
800							elsif ($str =~ m/^null($irreg_char\|$)/) {
801	0					0	$str =~ s/^null//;
802	0					0	$result = PDF::API2::Basic::PDF::Null->new;
803							}
804
805							else {
806	0					0	die "Can't parse `$str' near " . ($fh->tell()) . " length " . length($str) . ".";
807							}
808
809	838					3857	$str =~ s/^$ws_char+//s;
810	838					2841	return ($result, $str);
811							}
812
813
814							=head2 $ref = $p->read_obj($objind, %opts)
815
816							Given an indirect object reference, locate it and read the object returning
817							the read in object.
818
819							=cut
820
821							sub read_obj {
822	77			77	1	174	my ($self, $objind, %opts) = @_;
823
824	77		50			297	my $res = $self->read_objnum($objind->{' objnum'}, $objind->{' objgen'}, %opts) \|\| return;
825	77	50				267	$objind->merge($res) unless $objind eq $res;
826	77					321	return $objind;
827							}
828
829
830							=head2 $ref = $p->read_objnum($num, $gen, %opts)
831
832							Returns a fully read object of given number and generation in this file
833
834							=cut
835
836							sub read_objnum {
837	85			85	1	2439	my ($self, $num, $gen, %opts) = @_;
838	85	50				194	croak 'Undefined object number in call to read_objnum($num, $gen)' unless defined $num;
839	85	50				176	croak 'Undefined object generation in call to read_objnum($num, $gen)' unless defined $gen;
840	85	50				428	croak "Invalid object number '$num' in call to read_objnum" unless $num =~ /^[0-9]+$/;
841	85	50				293	croak "Invalid object generation '$gen' in call to read_objnum" unless $gen =~ /^[0-9]+$/;
842
843	85		50			224	my $object_location = $self->locate_obj($num, $gen) \|\| return;
844	85					151	my $object;
845
846							# Compressed object
847	85	100				199	if (ref($object_location)) {
848	4					9	my ($object_stream_num, $object_stream_pos) = @{$object_location};
	4					9
849
850	4					32	my $object_stream = $self->read_objnum($object_stream_num, 0, %opts);
851	4	50				17	die 'Cannot find the compressed object stream' unless $object_stream;
852
853	4	50				25	$object_stream->read_stream() if $object_stream->{' nofilt'};
854
855							# An object stream starts with pairs of integers containing object numbers and
856							# stream offsets relative to the First key
857	4					8	my $fh;
858							my $pairs;
859	4	50				14	unless ($object_stream->{' streamfile'}) {
860	4					15	$pairs = substr($object_stream->{' stream'}, 0, $object_stream->{'First'}->val);
861							}
862							else {
863	0					0	CORE::open($fh, '<', $object_stream->{' streamfile'});
864	0					0	read($fh, $pairs, $object_stream->{'First'}->val());
865							}
866	4					31	my @map = split /\s+/, $pairs;
867
868							# Find the offset of the object in the stream
869	4					11	my $index = $object_stream_pos * 2;
870	4	50				14	die "Objind $num does not exist at index $index" unless $map[$index] == $num;
871	4					10	my $start = $map[$index + 1];
872
873							# Unless this is the last object in the stream, its length is determined by the
874							# offset of the next object
875	4					9	my $last_object_in_stream = $map[-2];
876	4					6	my $length;
877	4	100				15	if ($last_object_in_stream == $num) {
878	2	50				7	if ($object_stream->{' stream'}) {
879	2					20	$length = length($object_stream->{' stream'}) - $object_stream->{'First'}->val() - $start;
880							}
881							else {
882	0					0	$length = (-s $object_stream->{' streamfile'}) - $object_stream->{'First'}->val() - $start;
883							}
884							}
885							else {
886	2					7	my $next_start = $map[$index + 3];
887	2					6	$length = $next_start - $start;
888							}
889
890							# Read the object from the stream
891	4					14	my $stream = "$num 0 obj ";
892	4	50				15	unless ($object_stream->{' streamfile'}) {
893	4					12	$stream .= substr($object_stream->{' stream'}, $object_stream->{'First'}->val() + $start, $length);
894							}
895							else {
896	0					0	seek($fh, $object_stream->{'First'}->val() + $start, 0);
897	0					0	read($fh, $stream, $length, length($stream));
898	0					0	close $fh;
899							}
900
901	4					16	($object) = $self->readval($stream, %opts, update => 0);
902	4					24	return $object;
903							}
904
905	81					287	my $current_location = $self->{' INFILE'}->tell;
906	81					586	$self->{' INFILE'}->seek($object_location, 0);
907	81					816	($object) = $self->readval('', %opts);
908	81					394	$self->{' INFILE'}->seek($current_location, 0);
909	81					1083	return $object;
910							}
911
912
913							=head2 $objind = $p->new_obj($obj)
914
915							Creates a new, free object reference based on free space in the cross reference chain.
916							If nothing free then thinks up a new number. If $obj then turns that object into this
917							new object rather than returning a new object.
918
919							=cut
920
921							sub new_obj {
922	1072			1072	1	2094	my ($self, $base) = @_;
923	1072					1603	my $res;
924
925	1072	50	66			2952	if (defined $self->{' free'} and scalar @{$self->{' free'}} > 0) {
	15					63
926	0					0	$res = shift(@{$self->{' free'}});
	0					0
927	0	0				0	if (defined $base) {
928	0					0	my ($num, $gen) = @{$self->{' objects'}{$res->uid}};
	0					0
929	0					0	$self->remove_obj($res);
930	0					0	$self->add_obj($base, $num, $gen);
931	0					0	return $self->out_obj($base);
932							}
933							else {
934	0					0	$self->{' objects'}{$res->uid}[2] = 0;
935	0					0	return $res;
936							}
937							}
938
939	1072					1722	my $tdict = $self;
940	1072					1502	my $i;
941	1072					2346	while (defined $tdict) {
942	1073	50				3241	$i = $tdict->{' xref'}{defined($i) ? $i : ''}[0];
943	1073		33			2798	while (defined $i and $i != 0) {
944	0					0	my ($ni, $ng) = @{$tdict->{' xref'}{$i}};
	0					0
945	0	0				0	unless (defined $self->locate_obj($i, $ng)) {
946	0	0				0	if (defined $base) {
947	0					0	$self->add_obj($base, $i, $ng);
948	0					0	return $base;
949							}
950							else {
951	0		0			0	$res = $self->test_obj($i, $ng) \|\| $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, $ng);
952	0					0	$self->out_obj($res);
953	0					0	return $res;
954							}
955							}
956	0					0	$i = $ni;
957							}
958	1073					2298	$tdict = $tdict->{' prev'};
959							}
960
961	1072					1876	$i = $self->{' maxobj'}++;
962	1072	50				2015	if (defined $base) {
963	1072					2977	$self->add_obj($base, $i, 0);
964	1072					2788	$self->out_obj($base);
965	1072					2290	return $base;
966							}
967							else {
968	0					0	$res = $self->add_obj(PDF::API2::Basic::PDF::Objind->new(), $i, 0);
969	0					0	$self->out_obj($res);
970	0					0	return $res;
971							}
972							}
973
974
975							=head2 $p->out_obj($objind)
976
977							Indicates that the given object reference should appear in the output xref
978							table whether with data or freed.
979
980							=cut
981
982							sub out_obj {
983	2676			2676	1	4534	my ($self, $obj) = @_;
984
985							# This is why we've been keeping the outlist CACHE around; to speed
986							# up this method by orders of magnitude (it saves up from having to
987							# grep the full outlist each time through as we'll just do a lookup
988							# in the hash) (which is super-fast).
989	2676	100				7098	unless (exists $self->{' outlist_cache'}{$obj}) {
990	1087					1627	push @{$self->{' outlist'}}, $obj;
	1087					2436
991							# weaken $self->{' outlist'}->[-1];
992	1087					3482	$self->{' outlist_cache'}{$obj} = 1;
993							}
994	2676					4879	return $obj;
995							}
996
997
998							=head2 $p->free_obj($objind)
999
1000							Marks an object reference for output as being freed.
1001
1002							=cut
1003
1004							sub free_obj {
1005	0			0	1	0	my ($self, $obj) = @_;
1006
1007	0					0	push @{$self->{' free'}}, $obj;
	0					0
1008	0					0	$self->{' objects'}{$obj->uid()}[2] = 1;
1009	0					0	$self->out_obj($obj);
1010							}
1011
1012
1013							=head2 $p->remove_obj($objind)
1014
1015							Removes the object from all places where we might remember it
1016
1017							=cut
1018
1019							sub remove_obj {
1020	0			0	1	0	my ($self, $objind) = @_;
1021
1022							# who says it has to be fast
1023	0					0	delete $self->{' objects'}{$objind->uid()};
1024	0					0	delete $self->{' outlist_cache'}{$objind};
1025	0					0	delete $self->{' printed_cache'}{$objind};
1026	0					0	@{$self->{' outlist'}} = grep { $_ ne $objind } @{$self->{' outlist'}};
	0					0
	0					0
	0					0
1027	0					0	@{$self->{' printed'}} = grep { $_ ne $objind } @{$self->{' printed'}};
	0					0
	0					0
	0					0
1028							$self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} = undef
1029	0	0				0	if $self->{' objcache'}{$objind->{' objnum'}, $objind->{' objgen'}} eq $objind;
1030	0					0	return $self;
1031							}
1032
1033
1034							=head2 $p->ship_out(@objects)
1035
1036							Ships the given objects (or all objects for output if @objects is empty) to
1037							the currently open output file (assuming there is one). Freed objects are not
1038							shipped, and once an object is shipped it is switched such that this file
1039							becomes its source and it will not be shipped again unless out_obj is called
1040							again. Notice that a shipped out object can be re-output or even freed, but
1041							that it will not cause the data already output to be changed.
1042
1043							=cut
1044
1045							sub ship_out {
1046	154			154	1	321	my ($self, @objs) = @_;
1047
1048	154	50				402	die "No output file specified" unless defined $self->{' OUTFILE'};
1049	154					295	my $fh = $self->{' OUTFILE'};
1050	154					396	seek($fh, 0, 2); # go to the end of the file
1051
1052	154	50				434	@objs = @{$self->{' outlist'}} unless scalar @objs > 0;
	154					437
1053	154					386	foreach my $objind (@objs) {
1054	890	50				2481	next unless $objind->is_obj($self);
1055	890					1572	my $j = -1;
1056	890					1477	for (my $i = 0; $i < scalar @{$self->{' outlist'}}; $i++) {
	890					2058
1057	890	50				2542	if ($self->{' outlist'}[$i] eq $objind) {
1058	890					1258	$j = $i;
1059	890					1605	last;
1060							}
1061							}
1062	890	50				1776	next if $j < 0;
1063	890					1217	splice(@{$self->{' outlist'}}, $j, 1);
	890					1715
1064	890					2358	delete $self->{' outlist_cache'}{$objind};
1065	890	50				1216	next if grep { $_ eq $objind } @{$self->{' free'}};
	0					0
	890					2279
1066
1067	890	50				1912	map { $fh->print("\% $_ \n") } split(/$cr/, $objind->{' comments'}) if $objind->{' comments'};
	0					0
1068	890					2501	$self->{' locs'}{$objind->uid()} = $fh->tell();
1069	890					1612	my ($objnum, $objgen) = @{$self->{' objects'}{$objind->uid()}}[0..1];
	890					1856
1070	890					2596	$fh->printf('%d %d obj ', $objnum, $objgen);
1071	890					10410	$objind->outobjdeep($fh, $self);
1072	890					3785	$fh->print(" endobj\n");
1073
1074							# Note that we've output this obj, not forgetting to update
1075							# the cache of whats printed.
1076	890	50				6016	unless (exists $self->{' printed_cache'}{$objind}) {
1077	890					1323	push @{$self->{' printed'}}, $objind;
	890					1977
1078	890					3070	$self->{' printed_cache'}{$objind}++;
1079							}
1080							}
1081	154					456	return $self;
1082							}
1083
1084							=head2 $p->copy($outpdf, \&filter)
1085
1086							Iterates over every object in the file reading the object, calling filter with the object
1087							and outputting the result. if filter is not defined, then just copies input to output.
1088
1089							=cut
1090
1091							sub copy {
1092	0			0	1	0	my ($self, $out, $filter) = @_;
1093	0					0	my ($obj, $minl, $mini, $ming);
1094
1095	0					0	foreach my $key (grep { not m/^[\s\-]/ } keys %$self) {
	0					0
1096	0	0				0	$out->{$key} = $self->{$key} unless defined $out->{$key};
1097							}
1098
1099	0					0	my $tdict = $self;
1100	0					0	while (defined $tdict) {
1101	0					0	foreach my $i (sort {$a <=> $b} keys %{$tdict->{' xref'}}) {
	0					0
	0					0
1102	0					0	my ($nl, $ng, $nt) = @{$tdict->{' xref'}{$i}};
	0					0
1103	0	0				0	next unless $nt eq 'n';
1104
1105	0	0	0			0	if ($nl < $minl or $mini == 0) {
1106	0					0	$mini = $i;
1107	0					0	$ming = $ng;
1108	0					0	$minl = $nl;
1109							}
1110	0	0				0	unless ($obj = $self->test_obj($i, $ng)) {
1111	0					0	$obj = PDF::API2::Basic::PDF::Objind->new();
1112	0					0	$obj->{' objnum'} = $i;
1113	0					0	$obj->{' objgen'} = $ng;
1114	0					0	$self->add_obj($obj, $i, $ng);
1115	0					0	$obj->{' parent'} = $self;
1116	0					0	weaken $obj->{' parent'};
1117	0					0	$obj->{' realised'} = 0;
1118							}
1119	0					0	$obj->realise;
1120	0	0				0	my $res = defined $filter ? &{$filter}($obj) : $obj;
	0					0
1121	0	0	0			0	$out->new_obj($res) unless (!$res \|\| $res->is_obj($out));
1122							}
1123	0					0	$tdict = $tdict->{' prev'};
1124							}
1125
1126							# test for linearized and remove it from output
1127	0					0	$obj = $self->test_obj($mini, $ming);
1128	0	0	0			0	if ($obj->isa('PDF::API2::Basic::PDF::Dict') && $obj->{'Linearized'}) {
1129	0					0	$out->free_obj($obj);
1130							}
1131
1132	0					0	return $self;
1133							}
1134
1135
1136							=head1 PRIVATE METHODS & FUNCTIONS
1137
1138							The following methods and functions are considered private to this class. This
1139							does not mean you cannot use them if you have a need, just that they aren't really
1140							designed for users of this class.
1141
1142							=head2 $offset = $p->locate_obj($num, $gen)
1143
1144							Returns a file offset to the object asked for by following the chain of cross
1145							reference tables until it finds the one you want.
1146
1147							=cut
1148
1149							sub locate_obj {
1150	85			85	1	191	my ($self, $num, $gen) = @_;
1151
1152	85					119	my $tdict = $self;
1153	85					199	while (defined $tdict) {
1154	94	100				293	if (ref $tdict->{' xref'}{$num}) {
1155	85					158	my $ref = $tdict->{' xref'}{$num};
1156	85	100				212	return $ref unless scalar(@$ref) == 3;
1157
1158	81	50				243	if ($ref->[1] == $gen) {
1159	81	50				374	return $ref->[0] if $ref->[2] eq 'n';
1160	0					0	return; # if $ref->[2] eq 'f';
1161							}
1162							}
1163	9					20	$tdict = $tdict->{' prev'};
1164							}
1165	0					0	return;
1166							}
1167
1168
1169							=head2 update($fh, $str, $instream)
1170
1171							Keeps reading $fh for more data to ensure that $str has at least a line full
1172							for C to work on. At this point we also take the opportunity to ignore
1173							comments.
1174
1175							=cut
1176
1177							sub update {
1178	1940			1940	1	3485	my ($fh, $str, $instream) = @_;
1179	1940	50				3456	print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1180	1940	100				3124	if ($instream) {
1181							# we are inside a (possible binary) stream
1182							# so we fetch data till we see an 'endstream'
1183							# -- fredo/2004-09-03
1184	10		33			50	while ($str !~ m/endstream/ and not $fh->eof()) {
1185	0	0				0	print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1186	0					0	$fh->read($str, 314, length($str));
1187							}
1188							}
1189							else {
1190	1930					8341	$str =~ s/^$ws_char*//;
1191	1930		100			16030	while ($str !~ m/$cr/ and not $fh->eof()) {
1192	107	50				1304	print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1193	107					480	$fh->read($str, 314, length($str));
1194	107					2540	$str =~ s/^$ws_char*//so;
1195							}
1196	1930					4983	while ($str =~ m/^\%/) { # restructured by fredo/2003-03-23
1197	1	50				6	print STDERR 'fpos=' . tell($fh) . ' strlen=' . length($str) . "\n" if $readDebug;
1198	1		33			42	$fh->read($str, 314, length($str)) while ($str !~ m/$cr/ and not $fh->eof());
1199	1					27	$str =~ s/^\%[^\015\012]$ws_char//so; # fixed for reportlab -- fredo
1200							}
1201							}
1202
1203	1940					4564	return $str;
1204							}
1205
1206							=head2 $objind = $p->test_obj($num, $gen)
1207
1208							Tests the cache to see whether an object reference (which may or may not have
1209							been getobj()ed) has been cached. Returns it if it has.
1210
1211							=cut
1212
1213							sub test_obj {
1214	247			247	1	524	my ($self, $num, $gen) = @_;
1215	247					956	return $self->{' objcache'}{$num, $gen};
1216							}
1217
1218
1219							=head2 $p->add_obj($objind)
1220
1221							Adds the given object to the internal object cache.
1222
1223							=cut
1224
1225							sub add_obj {
1226	1205			1205	1	2476	my ($self, $obj, $num, $gen) = @_;
1227
1228	1205					4118	$self->{' objcache'}{$num, $gen} = $obj;
1229	1205					5410	$self->{' objects'}{$obj->uid()} = [$num, $gen];
1230							# weaken $self->{' objcache'}{$num, $gen};
1231	1205					2730	return $obj;
1232							}
1233
1234
1235							=head2 $tdict = $p->readxrtr($xpos)
1236
1237							Recursive function which reads each of the cross-reference and trailer tables
1238							in turn until there are no more.
1239
1240							Returns a dictionary corresponding to the trailer chain. Each trailer also
1241							includes the corresponding cross-reference table.
1242
1243							The structure of the xref private element in a trailer dictionary is of an
1244							anonymous hash of cross reference elements by object number. Each element
1245							consists of an array of 3 elements corresponding to the three elements read
1246							in [location, generation number, free or used]. See the PDF specification
1247							for details.
1248
1249							=cut
1250
1251							sub _unpack_xref_stream {
1252	78			78		129	my ($self, $width, $data) = @_;
1253
1254	78	100				157	return unpack('C', $data) if $width == 1;
1255	52	50				113	return unpack('n', $data) if $width == 2;
1256	0	0				0	return unpack('N', "\x00$data") if $width == 3;
1257	0	0				0	return unpack('N', $data) if $width == 4;
1258	0	0				0	return unpack('Q>', $data) if $width == 8;
1259
1260	0					0	die "Unsupported xref stream entry width: $width";
1261							}
1262
1263							sub readxrtr {
1264	19			19	1	69	my ($self, $xpos) = @_;
1265	19					67	my ($tdict, $buf, $xmin, $xnum, $xdiff);
1266
1267	19					50	my $fh = $self->{' INFILE'};
1268	19					89	$fh->seek($xpos, 0);
1269	19					264	$fh->read($buf, 22);
1270	19					248	$buf = update($fh, $buf); # fix for broken JAWS xref calculation.
1271
1272	19					70	my $xlist = {};
1273
1274							## seams that some products calculate wrong prev entries (short)
1275							## so we seek ahead to find one -- fredo; save for now
1276							#while($buf !~ m/^xref$cr/i && !eof($fh))
1277							#{
1278							# $buf =~ s/^(\s+\|\S+\|.)//i;
1279							# $buf=update($fh,$buf);
1280							#}
1281
1282	19	100				324	if ($buf =~ s/^xref$cr//i) {
		50
1283							# Plain XRef tables.
1284	16					546	while ($buf =~ m/^$ws_char([0-9]+)$ws_char+([0-9]+)$ws_char$cr(.*?)$/s) {
1285	21					68	my $old_buf = $buf;
1286	21					59	$xmin = $1;
1287	21					57	$xnum = $2;
1288	21					52	$buf = $3;
1289	21	50				368	unless ($old_buf =~ /^[0-9]+ [0-9]+$cr/) {
1290							# See PDF 1.7 section 7.5.4: Cross-Reference Table
1291	0					0	warn q{Malformed xref in PDF file: subsection shall begin with a line containing two numbers separated by a SPACE (20h)};
1292							}
1293	21					62	$xdiff = length($buf);
1294
1295	21					125	$fh->read($buf, 20 * $xnum - $xdiff + 15, $xdiff);
1296	21		66			736	while ($xnum-- > 0 and $buf =~ s/^0([0-9])$ws_char+0*([0-9]+)$ws_char+([nf])$cr//) {
1297	118	50				680	$xlist->{$xmin} = [$1, $2, $3] unless exists $xlist->{$xmin};
1298	118					941	$xmin++;
1299							}
1300							}
1301
1302	16	50				110	if ($buf !~ /^\s*trailer\b/i) {
1303	0					0	die "Malformed trailer in PDF file $self->{' fname'} at " . ($fh->tell - length($buf));
1304							}
1305
1306	16					70	$buf =~ s/^\s*trailer\b//i;
1307
1308	16					81	($tdict, $buf) = $self->readval($buf);
1309							}
1310							elsif ($buf =~ m/^(\d+)\s+(\d+)\s+obj/i) {
1311	3					18	my ($xref_obj, $xref_gen) = ($1, $2);
1312
1313							# XRef streams.
1314	3					15	($tdict, $buf) = $self->readval($buf);
1315
1316	3	50				13	unless ($tdict->{' stream'}) {
1317	0					0	die "Malformed XRefStm at $xref_obj $xref_gen obj in PDF file $self->{' fname'}";
1318							}
1319	3					19	$tdict->read_stream(1);
1320
1321	3					8	my $stream = $tdict->{' stream'};
1322	3					5	my @widths = map { $_->val } @{$tdict->{W}->val};
	9					22
	3					13
1323
1324	3					7	my $start = 0;
1325	3					6	my $last;
1326
1327							my @index;
1328	3	100				10	if (defined $tdict->{Index}) {
1329	1					2	@index = map { $_->val() } @{$tdict->{Index}->val};
	2					6
	1					3
1330							}
1331							else {
1332	2					9	@index = (0, $tdict->{Size}->val);
1333							}
1334
1335	3					11	while (scalar @index) {
1336	3					6	$start = shift(@index);
1337	3					9	$last = $start + shift(@index) - 1;
1338
1339	3					11	for my $i ($start...$last) {
1340							# Replaced "for $xmin" because it creates a loop-specific local variable, and we
1341							# need $xmin to be correct for maxobj below.
1342	26					37	$xmin = $i;
1343
1344	26					30	my @cols;
1345
1346	26					43	for my $w (@widths) {
1347	78					100	my $data;
1348	78	50				202	$data = $self->_unpack_xref_stream($w, substr($stream, 0, $w, '')) if $w;
1349
1350	78					153	push @cols, $data;
1351							}
1352
1353	26	100				51	$cols[0] = 1 unless defined $cols[0];
1354	26	50				48	if ($cols[0] > 2) {
1355	0					0	die "Invalid XRefStm entry type ($cols[0]) at $xref_obj $xref_gen obj";
1356							}
1357
1358	26	50				50	next if exists $xlist->{$xmin};
1359
1360	26	50				60	my @objind = ($cols[1], defined($cols[2]) ? $cols[2] : ($xmin ? 0 : 65535));
		100
1361	26	100				60	push @objind, ($cols[0] == 0 ? 'f' : 'n') if $cols[0] < 2;
		100
1362
1363	26					79	$xlist->{$xmin} = \@objind;
1364							}
1365							}
1366							}
1367							else {
1368	0					0	die "Malformed xref in PDF file $self->{' fname'}";
1369							}
1370
1371	19					68	$tdict->{' loc'} = $xpos;
1372	19					68	$tdict->{' xref'} = $xlist;
1373	19	100				115	$self->{' maxobj'} = $xmin + 1 if $xmin + 1 > $self->{' maxobj'};
1374							$tdict->{' prev'} = $self->readxrtr($tdict->{'Prev'}->val)
1375	19	100	66			92	if (defined $tdict->{'Prev'} and $tdict->{'Prev'}->val != 0);
1376	19	100				80	delete $tdict->{' prev'} unless defined $tdict->{' prev'};
1377	19					67	return $tdict;
1378							}
1379
1380
1381							=head2 $p->out_trailer($tdict)
1382
1383							Outputs the body and trailer for a PDF file by outputting all the objects in
1384							the ' outlist' and then outputting a xref table for those objects and any
1385							freed ones. It then outputs the trailing dictionary and the trailer code.
1386
1387							=cut
1388
1389							sub out_trailer {
1390	148			148	1	553	my ($self, $tdict, $update) = @_;
1391	148					308	my $fh = $self->{' OUTFILE'};
1392
1393	148					247	while (@{$self->{' outlist'}}) {
	302					829
1394	154					457	$self->ship_out();
1395							}
1396
1397							# When writing new trailers, most dictionary entries get copied from the
1398							# previous trailer, but entries related to cross-reference streams should
1399							# get removed (and possibly recreated below).
1400	148					939	delete $tdict->{$_} for (# Entries common to streams
1401							qw(Length Filter DecodeParms F FFilter FDecodeParms DL),
1402
1403							# Entries specific to cross-reference streams
1404							qw(Index W XRefStm));
1405
1406	148					514	$tdict->{'Size'} = PDFNum($self->{' maxobj'});
1407
1408	148					456	my $tloc = $fh->tell();
1409	148					855	my @out;
1410
1411	148	100				284	my @xreflist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } (@{$self->{' printed'} \|\| []}, @{$self->{' free'} \|\| []});
	1334	100				2745
	148					509
	148					676
1412
1413	148					391	my ($i, $j, $k);
1414	148	100				393	unless ($update) {
1415	140					248	$i = 1;
1416	140					437	for ($j = 0; $j < @xreflist; $j++) {
1417	868					1151	my @inserts;
1418	868					1205	$k = $xreflist[$j];
1419	868					1813	while ($i < $self->{' objects'}{$k->uid}[0]) {
1420	0					0	my ($n) = PDF::API2::Basic::PDF::Objind->new();
1421	0					0	$self->add_obj($n, $i, 0);
1422	0					0	$self->free_obj($n);
1423	0					0	push(@inserts, $n);
1424	0					0	$i++;
1425							}
1426	868					1504	splice(@xreflist, $j, 0, @inserts);
1427	868					1285	$j += @inserts;
1428	868					1779	$i++;
1429							}
1430							}
1431
1432	148	100				332	my @freelist = sort { $self->{' objects'}{$a->uid}[0] <=> $self->{' objects'}{$b->uid}[0] } @{$self->{' free'} \|\| []};
	0					0
	148					553
1433
1434	148					272	$j = 0; my $first = -1; $k = 0;
	148					261
	148					302
1435	148					491	for ($i = 0; $i <= $#xreflist + 1; $i++) {
1436	1038	100	100			3049	if ($i > $#xreflist \|\| $self->{' objects'}{$xreflist[$i]->uid}[0] != $j + 1) {
1437	160	100				744	push @out, ($first == -1 ? "0 " : "$self->{' objects'}{$xreflist[$first]->uid}[0] ") . ($i - $first) . "\n";
1438	160	100				447	if ($first == -1) {
1439	148	50				777	push @out, sprintf("%010d 65535 f \n", defined $freelist[$k] ? $self->{' objects'}{$freelist[$k]->uid}[0] : 0);
1440	148					314	$first = 0;
1441							}
1442	160					485	for ($j = $first; $j < $i; $j++) {
1443	890					1432	my $xref = $xreflist[$j];
1444	890	50	33			2060	if (defined($freelist[$k]) and defined($xref) and "$freelist[$k]" eq "$xref") {
			33
1445	0					0	$k++;
1446							push @out, pack("A10AA5A4",
1447							sprintf("%010d", (defined $freelist[$k] ?
1448							$self->{' objects'}{$freelist[$k]->uid}[0] : 0)), " ",
1449	0	0				0	sprintf("%05d", $self->{' objects'}{$xref->uid}[1] + 1),
1450							" f \n");
1451							}
1452							else {
1453							push @out, pack("A10AA5A4", sprintf("%010d", $self->{' locs'}{$xref->uid}), " ",
1454	890					2154	sprintf("%05d", $self->{' objects'}{$xref->uid}[1]),
1455							" n \n");
1456							}
1457							}
1458	160					352	$first = $i;
1459	160	100				645	$j = $self->{' objects'}{$xreflist[$i]->uid}[0] if ($i < scalar @xreflist);
1460							}
1461							else {
1462	878					1877	$j++;
1463							}
1464							}
1465	148	50	33			554	if (exists $tdict->{'Type'} and $tdict->{'Type'}->val() eq 'XRef') {
1466	0					0	my (@index, @stream);
1467	0					0	for (@out) {
1468	0					0	my @a = split;
1469	0	0				0	@a == 2 ? push @index, @a : push @stream, \@a;
1470							}
1471	0					0	my $i = $self->{' maxobj'}++;
1472	0					0	$self->add_obj($tdict, $i, 0);
1473	0					0	$self->out_obj($tdict );
1474
1475	0					0	push @index, $i, 1;
1476	0					0	push @stream, [$tloc, 0, 'n'];
1477
1478	0	0				0	my $len = $tloc > 0xFFFF ? 4 : 2; # don't expect files > 4 Gb
1479	0	0				0	my $tpl = $tloc > 0xFFFF ? 'CNC' : 'CnC'; # don't expect gennum > 255, it's absurd.
1480							# Adobe doesn't use them anymore anyway
1481	0					0	my $stream = '';
1482	0					0	my @prev = (0) x ($len + 2);
1483	0					0	for (@stream) {
1484	0	0	0			0	$_->[1] = 0 if $_->[2] eq 'f' and $_->[1] == 65535;
1485	0	0				0	my @line = unpack 'C*', pack $tpl, $_->[2] eq 'n' ? 1 : 0, @{$_}[0..1];
	0					0
1486
1487							$stream .= pack 'C*', 2, # prepend filtering method, "PNG Up"
1488	0					0	map {($line[$_] - $prev[$_] + 256) % 256 } 0 .. $#line;
	0					0
1489	0					0	@prev = @line;
1490							}
1491	0					0	$tdict->{'Size'} = PDFNum($i + 1);
1492	0					0	$tdict->{'Index'} = PDFArray(map PDFNum( $_ ), @index);
1493	0					0	$tdict->{'W'} = PDFArray(map PDFNum( $_ ), 1, $len, 1);
1494	0					0	$tdict->{'Filter'} = PDFName('FlateDecode');
1495
1496	0					0	$tdict->{'DecodeParms'} = PDFDict();
1497	0					0	$tdict->{'DecodeParms'}->val->{'Predictor'} = PDFNum(12);
1498	0					0	$tdict->{'DecodeParms'}->val->{'Columns'} = PDFNum($len + 2);
1499
1500	0					0	$stream = PDF::API2::Basic::PDF::Filter::FlateDecode->new->outfilt($stream, 1);
1501	0					0	$tdict->{' stream'} = $stream;
1502	0					0	$tdict->{' nofilt'} = 1;
1503	0					0	delete $tdict->{'Length'};
1504	0					0	$self->ship_out();
1505							}
1506							else {
1507	148					609	$fh->print("xref\n", @out, "trailer\n");
1508	148					1601	$tdict->outobjdeep($fh, $self);
1509	148					487	$fh->print("\n");
1510							}
1511	148					1218	$fh->print("startxref\n$tloc\n%%EOF\n");
1512							}
1513
1514
1515							=head2 PDF::API2::Basic::PDF::File->_new
1516
1517							Creates a very empty PDF file object (used by new and open)
1518
1519							=cut
1520
1521							sub _new {
1522	180			180		418	my $class = shift();
1523	180					366	my $self = {};
1524
1525	180					402	bless $self, $class;
1526	180					640	$self->{' outlist'} = [];
1527	180					442	$self->{' outlist_cache'} = {}; # A cache of whats in the 'outlist'
1528	180					385	$self->{' maxobj'} = 1;
1529	180					398	$self->{' objcache'} = {};
1530	180					384	$self->{' objects'} = {};
1531
1532	180					397	return $self;
1533							}
1534
1535							1;
1536
1537							=head1 AUTHOR
1538
1539							Martin Hosken Martin_Hosken@sil.org
1540
1541							Copyright Martin Hosken 1999 and onwards
1542
1543							No warranty or expression of effectiveness, least of all regarding anyone's
1544							safety, is implied in this software or documentation.