File Coverage

blib/lib/Palm/Doc.pm

Criterion	Covered	Total	%
statement	146	152	96.0
branch	39	58	67.2
condition	20	30	66.6
subroutine	16	16	100.0
pod	4	5	80.0
total	225	261	86.2

line	stmt	bran	cond	sub	pod	time	code
1							# Palm::Doc.pm
2							#
3							# Palm::PDB helper for handling Palm Doc databases
4							#
5							# Copyright (C) 2004 Christophe Beauregard
6							#
7							# $Id: Doc.pm,v 1.19 2005/05/12 01:36:49 cpb Exp $
8
9	1			1		48971	use strict;
	1					4
	1					49
10
11							package Palm::Doc;
12
13	1			1		1335	use Palm::PDB;
	1					6177
	1					46
14	1			1		946	use Palm::Raw();
	1					610
	1					24
15	1			1		7	use vars qw( $VERSION @ISA );
	1					2
	1					115
16
17							$VERSION = do { my @r = (q$Revision: 1.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
18
19							@ISA = qw( Palm::Raw );
20
21	1			1		6	use constant DOC_UNCOMPRESSED => scalar 1;
	1					2
	1					66
22	1			1		5	use constant DOC_COMPRESSED => scalar 2;
	1					2
	1					38
23	1			1		5	use constant DOC_RECSIZE => scalar 4096;
	1					10
	1					1895
24
25							=head1 NAME
26
27							Palm::Doc - Handler for Palm Doc books
28
29							=head1 SYNOPSIS
30
31							use Palm::Doc;
32
33							=head1 DESCRIPTION
34
35							Helper for reading and writing Palm Doc books. The interface is based on
36							L since it just makes sense. However, because of the nature
37							of these databases, record-level processing is just a Bad Idea. Use
38							the C and C calls rather than do direct access of the
39							C<@records> array.
40
41							=head1 EXAMPLES
42
43							Convert a text file to a .pdb:
44
45							use Palm::Doc;
46							my $doc = new Palm::Doc;
47							$doc->textfile( $ARGV[0] );
48							$doc->Write( $ARGV[0] . ".pdb" );
49
50							Convert an HTML file to a .prc:
51
52							use HTML::TreeBuilder;
53							use HTML::FormatText;
54							use Palm::Doc;
55
56							my $tree = HTML::TreeBuilder->new_from_file( $ARGV[0] );
57							my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 80 );
58							my $doc = new Palm::Doc;
59							$doc->{attributes}{resource} = 1;
60							$doc->text( $formatter->format( $tree ) );
61							$doc->Write( $ARGV[0] . ".prc" );
62
63							=cut
64							#'
65
66							sub import
67							{
68	1			1		18	&Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
69	1					22	&Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
70	1					14	&Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
71	1					14	&Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
72							}
73
74							=head2 new
75
76							$doc = new Palm::Doc;
77
78							Create a new Doc object. By default, it's not a resource database. Setting
79							C<$self->{attributes}{resource}> to C<1> before any manipulations will
80							cause it to become a resource database.
81
82							=cut
83
84							sub new
85							{
86	1			1	1	16	my $class = shift;
87	1					14	my $self = $class->SUPER::new(@_);
88
89	1					101	$self->{'creator'} = 'REAd';
90	1					3	$self->{'type'} = 'TEXt';
91
92	1					3	$self->{attributes}{resource} = 0;
93
94	1					3	$self->{appinfo} = undef;
95	1					2	$self->{sort} = undef;
96	1					3	$self->{records} = [];
97
98	1					3	return $self;
99							}
100
101							# determine if the given (raw) record is a Doc header record and fill in the
102							# record with appropriate fields if it is.
103							sub _parse_headerrec($) {
104	3			3		4	my $record = shift;
105	3	50				9	return undef unless exists $record->{'data'};
106
107							# Doc header is minimum of 16 bytes
108	3	50				7	return undef if length $record->{'data'} < 16;
109
110	3					19	my ($version,$spare,$ulen, $records, $recsize, $position)
111							= unpack( 'n n N n n N', $record->{'data'} );
112
113							# the header is followed by a list of record sizes. We don't use
114							# this since we can guess the sizes pretty easily by looking at
115							# the actual records.
116
117							# According to the spec, $version is either 1 (uncompressed)
118							# or 2 (compress), while spare is always zero. AportisDoc supposedly sets
119							# spare to something else, so screw AportisDoc.
120	3	50	33			18	return undef if $version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED;
121	3	50				7	return undef if $spare != 0;
122
123	3					4	$record->{'version'} = $version;
124	3					4	$record->{'length'} = $ulen;
125	3					5	$record->{'records'} = $records;
126	3					5	$record->{'recsize'} = $recsize;
127	3					6	$record->{'position'} = $position;
128
129	3					10	return $record;
130							}
131
132							sub _compress_record($$) {
133	2			2		3	my ($version,$in) = @_;
134	2	50				6	return $in if $version == DOC_UNCOMPRESSED;
135
136	2					3	my $out = '';
137
138	2					3	my $lin = length $in;
139	2					2	my $i = 0;
140	2					6	while( $i < $lin ) {
141							# See http://patb.dyndns.org/Programming/PilotDoc.htm for the code type
142							# taxonomy.
143
144							# Try type B compression first.
145							# If the next 3 to 10 bytes are already in the compressed buffer, we can
146							# encode them into a 2 byte sequence. Don't bother too close to the ends,
147							# however... Makes the boundary conditions simpler.
148	570	100	100			1903	if( $i > 10 and $lin - $i > 10 ) {
149	528					480	my $chunk = '';
150	528					458	my $match = -1;
151
152							# the preamble is what'll be in the decoders output buffer.
153	528					656	my $preamble = substr( $in, 0, $i );
154	528					882	for( my $j = 10; $j >= 3; $j -- ) {
155	4224					4106	$chunk = substr( $in, $i, $j ); # grab next $j characters
156	4224					5563	$match = rindex( $preamble, $chunk ); # in the output?
157
158							# type B code has a 2047 byte sliding window, so matches have to be
159							# within that range to be useful
160	4224	100	66			7051	last if $match >= 0 and ($i - $match) <= 2047;
161	4216					6431	$match = -1;
162							}
163
164	528					485	my $n = length $chunk;
165	528	50	66			1074	if( $match >= 0 and $n <= 10 and $n >= 3 ) {
			66
166	8					10	my $m = $i - $match;
167
168							# first 2 bits are 10, next 11 are offset, next 3 are length-3
169	8					24	$out .= pack( "n", 0x8000 + (($m<<3)&0x3ff8) + ($n-3) );
170
171	8					9	$i += $n;
172
173	8					18	next;
174							}
175							}
176
177	562					662	my $ch = substr( $in, $i ++, 1 );
178	562					513	my $och = ord($ch);
179
180							# Try type C compression.
181	562	100	100			1922	if( $i+1 < $lin and $ch eq ' ' ) {
182	368					431	my $nch = substr( $in, $i, 1 );
183	368					352	my $onch = ord($nch);
184
185	368	50	66			964	if( $onch >= 0x40 and $onch < 0x80 ) {
186							# space plus ASCII character compression
187	0					0	$out .= chr($onch ^ 0x80);
188	0					0	$i ++;
189
190	0					0	next;
191							}
192							}
193
194	562	100	66			2437	if( $och == 0 or ($och >= 9 and $och < 0x80) ) {
			33
195							# pass through
196	372					681	$out .= $ch;
197							} else {
198							# type A code. This is essentially an 'escape' like '\\' in strings.
199							# For efficiency, it's best to encode as long a sequence as
200							# possible with one copy. This might seem like it would cause us to miss
201							# out on a type B sequence, but in actuality keeping long binary strings
202							# together improves the likelyhood of a later type B sequence than
203							# interspersing them with x01's.
204
205	190					317	my $next = substr($in,$i - 1);
206	190	50				553	if( $next =~ /([\x01-\x08\x80-\xff]{1,8})/o ) {
207	190					253	my $binseq = $1;
208	190					210	$out .= chr(length $binseq);
209	190					171	$out .= $binseq;
210	190					394	$i += length( $binseq ) - 1; # first char, $ch, is already counted
211							}
212							}
213							}
214
215	2					14	return $out;
216							}
217
218							# algorithm taken from makedoc7.cpp with reference to
219							# http://patb.dyndns.org/Programming/PilotDoc.htm and
220							# http://www.pyrite.org/doc_format.html
221							sub _decompress_record($$) {
222	1			1		2	my ($version,$in) = @_;
223	1	50				3	return $in if $version == DOC_UNCOMPRESSED;
224
225	1					1	my $out = '';
226
227	1					2	my $lin = length $in;
228	1					1	my $i = 0;
229	1					4	while( $i < $lin ) {
230	285					287	my $ch = substr( $in, $i ++, 1 );
231	285					219	my $och = ord($ch);
232
233	285	100	66			828	if( $och >= 1 and $och <= 8 ) {
		100
		50
234							# copy this many bytes... basically a way to 'escape' data
235	95					88	$out .= substr( $in, $i, $och );
236	95					144	$i += $och;
237							} elsif( $och < 0x80 ) {
238							# pass through 0, 9-0x7f
239	186					278	$out .= $ch;
240							} elsif( $och >= 0xc0 ) {
241							# 0xc0-0xff are 'space' plus ASCII char
242	0					0	$out .= ' ';
243	0					0	$out .= chr($och ^ 0x80);
244							} else {
245							# 0x80-0xbf is sequence from already decompressed buffer
246	4					6	my $nch = substr( $in, $i ++, 1 );
247	4					4	$och = ($och << 8) + ord($nch);
248	4					5	my $m = ($och & 0x3fff) >> 3;
249	4					4	my $n = ($och & 0x7) + 3;
250
251							# This isn't very perl-like, but a simple
252							# substr($out,$lo-$m,$n) doesn't work.
253	4					6	my $lo = length $out;
254	4					9	for( my $j = 0; $j < $n; $j ++, $lo ++ ) {
255	12	50				17	die "bad Doc compression" unless ($lo-$m) >= 0;
256	12					33	$out .= substr( $out, $lo-$m, 1 );
257							}
258							}
259							}
260
261	1					5	return $out;
262							}
263
264							sub Write {
265	2			2	1	19	my $self = shift;
266
267	2					6	my $prc = $self->{attributes}{resource};
268	2	100				8	my $recs = $prc ? $self->{'resources'} : $self->{'records'};
269	2					4	my $header = $recs->[0];
270	2	50				8	unless( defined _parse_headerrec($header) ) {
271	0					0	die "@_: Doesn't appear to be a correct book...";
272							}
273
274	2					25	$self->SUPER::Write(@_);
275							}
276
277							=head2 text
278
279							$text = $doc->text;
280
281							Return the contents of the Doc database.
282
283							$text = $doc->text( @text );
284
285							Set the contents of the Doc book to the specified arguments. All the list arguments
286							will simply be concatenated together.
287
288							=cut
289
290							sub text {
291	3			3	1	1565	my $self = shift;
292
293	3					4	my $body = '';
294	3					7	my $prc = $self->{attributes}{resource};
295
296	3	100				11	if( @_ > 0 ) {
		50
297	2					5	$body = join( '', @_ );
298
299	2					2	my $version = DOC_COMPRESSED;
300
301	2					5	$self->{'records'} = [];
302	2					8	$self->{'resources'} = [];
303
304							# first record is the header
305	2	100				26	my $header = $prc ? $self->append_Resource() : $self->append_Record();
306	2					53	$header->{'version'} = $version;
307	2					3	$header->{'length'} = 0;
308	2					3	$header->{'records'} = 0;
309	2					3	$header->{'recsize'} = DOC_RECSIZE;
310
311							# break the document into record-sized chunks
312	2					7	for( my $i = 0; $i < length($body); $i += DOC_RECSIZE ) {
313	2	100				7	my $record = $prc ? $self->append_Resource : $self->append_Record;
314	2					28	my $chunk = substr($body,$i,DOC_RECSIZE);
315	2					9	$record->{'data'} = _compress_record( $version, $chunk );
316
317	2					6	$header->{'records'} ++;
318	2					10	$header->{'length'} += length $body;
319							}
320
321	2	50				7	$header->{'recsize'} = $header->{'length'}
322							if $header->{'length'} < DOC_RECSIZE;
323
324							# pack up the header
325	2					16	$header->{'data'} = pack( 'n xx N n n N',
326							$header->{'version'}, $header->{'length'},
327							$header->{'records'}, $header->{'recsize'}, 0 );
328
329							} elsif( defined wantarray ) {
330
331	1	50				4	my $recs = $prc ? $self->{'resources'} : $self->{'records'};
332
333	1					2	my $header = $recs->[0];
334	1	50				3	if( defined _parse_headerrec($header) ) {
335							# a proper Doc file should be fine, but if it's not Doc
336							# compression like some Mobi docs seem to be we want to
337							# bail early. Otherwise we end up with a huge stream of
338							# substr() errors and we _still_ don't get any content.
339	1					1	eval {
340	1	50		1	0	6	sub min { return ($_[0]<$_[1]) ? $_[0] : $_[1] }
341	1					5	my $maxi = min($#$recs, $header->{'records'});
342	1					4	for( my $i = 1; $i <= $maxi; $i ++ ) {
343	1					3	$body .= _decompress_record( $header->{'version'},
344							$recs->[$i]->{'data'} );
345							}
346							};
347	1	50				4	return undef if $@;
348							}
349							}
350
351	3					12	return $body;
352							}
353
354							=head2 textfile
355
356							$doc->textfile( "README.txt" );
357
358							Set the contents of the Doc to the contents of the file and sets the name of the PDB to
359							the specified filename.
360
361							=cut
362
363							sub textfile($$) {
364	2			2	1	1118	my ($self, $filename) = @_;
365
366	2	50				62	open IN, "< $filename" or return undef;
367	2					5	binmode IN;
368	2					45	$self->text( '', );
369	2					58	close IN;
370
371	2					12	$self->{'name'} = $filename;
372							}
373
374							1;
375							__END__