File Coverage

blib/lib/Palm/Doc.pm
Criterion Covered Total %
statement 146 152 96.0
branch 39 58 67.2
condition 20 30 66.6
subroutine 16 16 100.0
pod 4 5 80.0
total 225 261 86.2


line stmt bran cond sub pod time code
1             # Palm::Doc.pm
2             #
3             # Palm::PDB helper for handling Palm Doc databases
4             #
5             # Copyright (C) 2004 Christophe Beauregard
6             #
7             # $Id: Doc.pm,v 1.19 2005/05/12 01:36:49 cpb Exp $
8              
9 1     1   48971 use strict;
  1         4  
  1         49  
10              
11             package Palm::Doc;
12              
13 1     1   1335 use Palm::PDB;
  1         6177  
  1         46  
14 1     1   946 use Palm::Raw();
  1         610  
  1         24  
15 1     1   7 use vars qw( $VERSION @ISA );
  1         2  
  1         115  
16              
17             $VERSION = do { my @r = (q$Revision: 1.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
18              
19             @ISA = qw( Palm::Raw );
20              
21 1     1   6 use constant DOC_UNCOMPRESSED => scalar 1;
  1         2  
  1         66  
22 1     1   5 use constant DOC_COMPRESSED => scalar 2;
  1         2  
  1         38  
23 1     1   5 use constant DOC_RECSIZE => scalar 4096;
  1         10  
  1         1895  
24              
25             =head1 NAME
26              
27             Palm::Doc - Handler for Palm Doc books
28              
29             =head1 SYNOPSIS
30              
31             use Palm::Doc;
32              
33             =head1 DESCRIPTION
34              
35             Helper for reading and writing Palm Doc books. The interface is based on
36             L since it just makes sense. However, because of the nature
37             of these databases, record-level processing is just a Bad Idea. Use
38             the C and C calls rather than do direct access of the
39             C<@records> array.
40              
41             =head1 EXAMPLES
42              
43             Convert a text file to a .pdb:
44              
45             use Palm::Doc;
46             my $doc = new Palm::Doc;
47             $doc->textfile( $ARGV[0] );
48             $doc->Write( $ARGV[0] . ".pdb" );
49              
50             Convert an HTML file to a .prc:
51              
52             use HTML::TreeBuilder;
53             use HTML::FormatText;
54             use Palm::Doc;
55              
56             my $tree = HTML::TreeBuilder->new_from_file( $ARGV[0] );
57             my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 80 );
58             my $doc = new Palm::Doc;
59             $doc->{attributes}{resource} = 1;
60             $doc->text( $formatter->format( $tree ) );
61             $doc->Write( $ARGV[0] . ".prc" );
62              
63             =cut
64             #'
65              
66             sub import
67             {
68 1     1   18 &Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
69 1         22 &Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
70 1         14 &Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
71 1         14 &Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
72             }
73              
74             =head2 new
75              
76             $doc = new Palm::Doc;
77              
78             Create a new Doc object. By default, it's not a resource database. Setting
79             C<$self->{attributes}{resource}> to C<1> before any manipulations will
80             cause it to become a resource database.
81              
82             =cut
83              
84             sub new
85             {
86 1     1 1 16 my $class = shift;
87 1         14 my $self = $class->SUPER::new(@_);
88              
89 1         101 $self->{'creator'} = 'REAd';
90 1         3 $self->{'type'} = 'TEXt';
91              
92 1         3 $self->{attributes}{resource} = 0;
93              
94 1         3 $self->{appinfo} = undef;
95 1         2 $self->{sort} = undef;
96 1         3 $self->{records} = [];
97              
98 1         3 return $self;
99             }
100              
101             # determine if the given (raw) record is a Doc header record and fill in the
102             # record with appropriate fields if it is.
103             sub _parse_headerrec($) {
104 3     3   4 my $record = shift;
105 3 50       9 return undef unless exists $record->{'data'};
106              
107             # Doc header is minimum of 16 bytes
108 3 50       7 return undef if length $record->{'data'} < 16;
109              
110 3         19 my ($version,$spare,$ulen, $records, $recsize, $position)
111             = unpack( 'n n N n n N', $record->{'data'} );
112              
113             # the header is followed by a list of record sizes. We don't use
114             # this since we can guess the sizes pretty easily by looking at
115             # the actual records.
116              
117             # According to the spec, $version is either 1 (uncompressed)
118             # or 2 (compress), while spare is always zero. AportisDoc supposedly sets
119             # spare to something else, so screw AportisDoc.
120 3 50 33     18 return undef if $version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED;
121 3 50       7 return undef if $spare != 0;
122              
123 3         4 $record->{'version'} = $version;
124 3         4 $record->{'length'} = $ulen;
125 3         5 $record->{'records'} = $records;
126 3         5 $record->{'recsize'} = $recsize;
127 3         6 $record->{'position'} = $position;
128              
129 3         10 return $record;
130             }
131              
132             sub _compress_record($$) {
133 2     2   3 my ($version,$in) = @_;
134 2 50       6 return $in if $version == DOC_UNCOMPRESSED;
135              
136 2         3 my $out = '';
137              
138 2         3 my $lin = length $in;
139 2         2 my $i = 0;
140 2         6 while( $i < $lin ) {
141             # See http://patb.dyndns.org/Programming/PilotDoc.htm for the code type
142             # taxonomy.
143              
144             # Try type B compression first.
145             # If the next 3 to 10 bytes are already in the compressed buffer, we can
146             # encode them into a 2 byte sequence. Don't bother too close to the ends,
147             # however... Makes the boundary conditions simpler.
148 570 100 100     1903 if( $i > 10 and $lin - $i > 10 ) {
149 528         480 my $chunk = '';
150 528         458 my $match = -1;
151              
152             # the preamble is what'll be in the decoders output buffer.
153 528         656 my $preamble = substr( $in, 0, $i );
154 528         882 for( my $j = 10; $j >= 3; $j -- ) {
155 4224         4106 $chunk = substr( $in, $i, $j ); # grab next $j characters
156 4224         5563 $match = rindex( $preamble, $chunk ); # in the output?
157              
158             # type B code has a 2047 byte sliding window, so matches have to be
159             # within that range to be useful
160 4224 100 66     7051 last if $match >= 0 and ($i - $match) <= 2047;
161 4216         6431 $match = -1;
162             }
163              
164 528         485 my $n = length $chunk;
165 528 50 66     1074 if( $match >= 0 and $n <= 10 and $n >= 3 ) {
      66        
166 8         10 my $m = $i - $match;
167              
168             # first 2 bits are 10, next 11 are offset, next 3 are length-3
169 8         24 $out .= pack( "n", 0x8000 + (($m<<3)&0x3ff8) + ($n-3) );
170              
171 8         9 $i += $n;
172              
173 8         18 next;
174             }
175             }
176            
177 562         662 my $ch = substr( $in, $i ++, 1 );
178 562         513 my $och = ord($ch);
179              
180             # Try type C compression.
181 562 100 100     1922 if( $i+1 < $lin and $ch eq ' ' ) {
182 368         431 my $nch = substr( $in, $i, 1 );
183 368         352 my $onch = ord($nch);
184              
185 368 50 66     964 if( $onch >= 0x40 and $onch < 0x80 ) {
186             # space plus ASCII character compression
187 0         0 $out .= chr($onch ^ 0x80);
188 0         0 $i ++;
189              
190 0         0 next;
191             }
192             }
193              
194 562 100 66     2437 if( $och == 0 or ($och >= 9 and $och < 0x80) ) {
      33        
195             # pass through
196 372         681 $out .= $ch;
197             } else {
198             # type A code. This is essentially an 'escape' like '\\' in strings.
199             # For efficiency, it's best to encode as long a sequence as
200             # possible with one copy. This might seem like it would cause us to miss
201             # out on a type B sequence, but in actuality keeping long binary strings
202             # together improves the likelyhood of a later type B sequence than
203             # interspersing them with x01's.
204              
205 190         317 my $next = substr($in,$i - 1);
206 190 50       553 if( $next =~ /([\x01-\x08\x80-\xff]{1,8})/o ) {
207 190         253 my $binseq = $1;
208 190         210 $out .= chr(length $binseq);
209 190         171 $out .= $binseq;
210 190         394 $i += length( $binseq ) - 1; # first char, $ch, is already counted
211             }
212             }
213             }
214              
215 2         14 return $out;
216             }
217              
218             # algorithm taken from makedoc7.cpp with reference to
219             # http://patb.dyndns.org/Programming/PilotDoc.htm and
220             # http://www.pyrite.org/doc_format.html
221             sub _decompress_record($$) {
222 1     1   2 my ($version,$in) = @_;
223 1 50       3 return $in if $version == DOC_UNCOMPRESSED;
224              
225 1         1 my $out = '';
226              
227 1         2 my $lin = length $in;
228 1         1 my $i = 0;
229 1         4 while( $i < $lin ) {
230 285         287 my $ch = substr( $in, $i ++, 1 );
231 285         219 my $och = ord($ch);
232              
233 285 100 66     828 if( $och >= 1 and $och <= 8 ) {
    100          
    50          
234             # copy this many bytes... basically a way to 'escape' data
235 95         88 $out .= substr( $in, $i, $och );
236 95         144 $i += $och;
237             } elsif( $och < 0x80 ) {
238             # pass through 0, 9-0x7f
239 186         278 $out .= $ch;
240             } elsif( $och >= 0xc0 ) {
241             # 0xc0-0xff are 'space' plus ASCII char
242 0         0 $out .= ' ';
243 0         0 $out .= chr($och ^ 0x80);
244             } else {
245             # 0x80-0xbf is sequence from already decompressed buffer
246 4         6 my $nch = substr( $in, $i ++, 1 );
247 4         4 $och = ($och << 8) + ord($nch);
248 4         5 my $m = ($och & 0x3fff) >> 3;
249 4         4 my $n = ($och & 0x7) + 3;
250              
251             # This isn't very perl-like, but a simple
252             # substr($out,$lo-$m,$n) doesn't work.
253 4         6 my $lo = length $out;
254 4         9 for( my $j = 0; $j < $n; $j ++, $lo ++ ) {
255 12 50       17 die "bad Doc compression" unless ($lo-$m) >= 0;
256 12         33 $out .= substr( $out, $lo-$m, 1 );
257             }
258             }
259             }
260              
261 1         5 return $out;
262             }
263              
264             sub Write {
265 2     2 1 19 my $self = shift;
266              
267 2         6 my $prc = $self->{attributes}{resource};
268 2 100       8 my $recs = $prc ? $self->{'resources'} : $self->{'records'};
269 2         4 my $header = $recs->[0];
270 2 50       8 unless( defined _parse_headerrec($header) ) {
271 0         0 die "@_: Doesn't appear to be a correct book...";
272             }
273              
274 2         25 $self->SUPER::Write(@_);
275             }
276              
277             =head2 text
278              
279             $text = $doc->text;
280              
281             Return the contents of the Doc database.
282              
283             $text = $doc->text( @text );
284              
285             Set the contents of the Doc book to the specified arguments. All the list arguments
286             will simply be concatenated together.
287              
288             =cut
289              
290             sub text {
291 3     3 1 1565 my $self = shift;
292              
293 3         4 my $body = '';
294 3         7 my $prc = $self->{attributes}{resource};
295              
296 3 100       11 if( @_ > 0 ) {
    50          
297 2         5 $body = join( '', @_ );
298              
299 2         2 my $version = DOC_COMPRESSED;
300              
301 2         5 $self->{'records'} = [];
302 2         8 $self->{'resources'} = [];
303              
304             # first record is the header
305 2 100       26 my $header = $prc ? $self->append_Resource() : $self->append_Record();
306 2         53 $header->{'version'} = $version;
307 2         3 $header->{'length'} = 0;
308 2         3 $header->{'records'} = 0;
309 2         3 $header->{'recsize'} = DOC_RECSIZE;
310              
311             # break the document into record-sized chunks
312 2         7 for( my $i = 0; $i < length($body); $i += DOC_RECSIZE ) {
313 2 100       7 my $record = $prc ? $self->append_Resource : $self->append_Record;
314 2         28 my $chunk = substr($body,$i,DOC_RECSIZE);
315 2         9 $record->{'data'} = _compress_record( $version, $chunk );
316              
317 2         6 $header->{'records'} ++;
318 2         10 $header->{'length'} += length $body;
319             }
320              
321 2 50       7 $header->{'recsize'} = $header->{'length'}
322             if $header->{'length'} < DOC_RECSIZE;
323              
324             # pack up the header
325 2         16 $header->{'data'} = pack( 'n xx N n n N',
326             $header->{'version'}, $header->{'length'},
327             $header->{'records'}, $header->{'recsize'}, 0 );
328              
329             } elsif( defined wantarray ) {
330              
331 1 50       4 my $recs = $prc ? $self->{'resources'} : $self->{'records'};
332              
333 1         2 my $header = $recs->[0];
334 1 50       3 if( defined _parse_headerrec($header) ) {
335             # a proper Doc file should be fine, but if it's not Doc
336             # compression like some Mobi docs seem to be we want to
337             # bail early. Otherwise we end up with a huge stream of
338             # substr() errors and we _still_ don't get any content.
339 1         1 eval {
340 1 50   1 0 6 sub min { return ($_[0]<$_[1]) ? $_[0] : $_[1] }
341 1         5 my $maxi = min($#$recs, $header->{'records'});
342 1         4 for( my $i = 1; $i <= $maxi; $i ++ ) {
343 1         3 $body .= _decompress_record( $header->{'version'},
344             $recs->[$i]->{'data'} );
345             }
346             };
347 1 50       4 return undef if $@;
348             }
349             }
350              
351 3         12 return $body;
352             }
353              
354             =head2 textfile
355              
356             $doc->textfile( "README.txt" );
357              
358             Set the contents of the Doc to the contents of the file and sets the name of the PDB to
359             the specified filename.
360              
361             =cut
362              
363             sub textfile($$) {
364 2     2 1 1118 my ($self, $filename) = @_;
365              
366 2 50       62 open IN, "< $filename" or return undef;
367 2         5 binmode IN;
368 2         45 $self->text( '', );
369 2         58 close IN;
370              
371 2         12 $self->{'name'} = $filename;
372             }
373              
374             1;
375             __END__