File Coverage

blib/lib/EBook/MOBI/MobiPerl/Palm/Doc.pm
Criterion Covered Total %
statement 91 162 56.1
branch 19 60 31.6
condition 19 30 63.3
subroutine 12 16 75.0
pod 4 5 80.0
total 145 273 53.1


line stmt bran cond sub pod time code
1             # Palm::Doc.pm
2             #
3             # Palm::PDB helper for handling Palm Doc databases
4             #
5             # Copyright (C) 2004 Christophe Beauregard
6             #
7             # $Id: Doc.pm,v 1.19 2005/05/12 01:36:49 cpb Exp $
8              
9 9     9   44 use strict;
  9         15  
  9         381  
10              
11             package EBook::MOBI::MobiPerl::Palm::Doc;
12              
13 9     9   44 use EBook::MOBI::MobiPerl::Palm::PDB;
  9         15  
  9         204  
14 9     9   4898 use EBook::MOBI::MobiPerl::Palm::Raw();
  9         21  
  9         197  
15 9     9   58 use vars qw( $VERSION @ISA );
  9         14  
  9         776  
16              
17             $VERSION = do { my @r = (q$Revision: 1.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
18              
19             @ISA = qw( EBook::MOBI::MobiPerl::Palm::Raw );
20              
21 9     9   42 use constant DOC_UNCOMPRESSED => scalar 1;
  9         14  
  9         608  
22 9     9   38 use constant DOC_COMPRESSED => scalar 2;
  9         17  
  9         405  
23 9     9   38 use constant DOC_RECSIZE => scalar 4096;
  9         15  
  9         14201  
24              
25             =head1 NAME
26              
27             Palm::Doc - Handler for Palm Doc books
28              
29             =head1 SYNOPSIS
30              
31             use Palm::Doc;
32              
33             =head1 DESCRIPTION
34              
35             Helper for reading and writing Palm Doc books. The interface is based on
36             L since it just makes sense. However, because of the nature
37             of these databases, record-level processing is just a Bad Idea. Use
38             the C and C calls rather than do direct access of the
39             C<@records> array.
40              
41             =head1 EXAMPLES
42              
43             Convert a text file to a .pdb:
44              
45             use Palm::Doc;
46             my $doc = new Palm::Doc;
47             $doc->textfile( $ARGV[0] );
48             $doc->Write( $ARGV[0] . ".pdb" );
49              
50             Convert an HTML file to a .prc:
51              
52             use HTML::TreeBuilder;
53             use HTML::FormatText;
54             use Palm::Doc;
55              
56             my $tree = HTML::TreeBuilder->new_from_file( $ARGV[0] );
57             my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 80 );
58             my $doc = new Palm::Doc;
59             $doc->{attributes}{resource} = 1;
60             $doc->text( $formatter->format( $tree ) );
61             $doc->Write( $ARGV[0] . ".prc" );
62              
63             =cut
64             #'
65              
66             sub import
67             {
68 9     9   47 &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
69 9         43 &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
70 9         36 &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
71 9         37 &EBook::MOBI::MobiPerl::Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
72             }
73              
74             =head2 new
75              
76             $doc = new Palm::Doc;
77              
78             Create a new Doc object. By default, it's not a resource database. Setting
79             C<$self->{attributes}{resource}> to C<1> before any manipulations will
80             cause it to become a resource database.
81              
82             =cut
83              
84             sub new
85             {
86 3     3 1 6 my $class = shift;
87 3         36 my $self = $class->SUPER::new(@_);
88              
89 3         17 $self->{'creator'} = 'REAd';
90 3         6 $self->{'type'} = 'TEXt';
91              
92 3         8 $self->{attributes}{resource} = 0;
93              
94 3         8 $self->{appinfo} = undef;
95 3         6 $self->{sort} = undef;
96 3         8 $self->{records} = [];
97              
98 3         9 return $self;
99             }
100              
101             # determine if the given (raw) record is a Doc header record and fill in the
102             # record with appropriate fields if it is.
103             sub _parse_headerrec($) {
104 3     3   5 my $record = shift;
105 3 50       10 return undef unless exists $record->{'data'};
106              
107             # Doc header is minimum of 16 bytes
108 3 50       9 return undef if length $record->{'data'} < 16;
109              
110             my ($version,$spare,$ulen, $records, $recsize, $position)
111 3         18 = unpack( 'n n N n n N', $record->{'data'} );
112              
113             # BEGIN: BORIS DAEPPEN JULY 2012
114             # ***********************
115             # This code just prints stuff... which is what I don't like
116             # One should not print to stderr just for fun!
117             # so this is why I put this out of work...
118              
119             #my $h = sprintf ("%x", $version);
120             #print STDERR "Version: $version - $h - ";
121             #if ($version == DOC_COMPRESSED) {
122             #print STDERR " DOC_COMPRESSED\n";
123             #}
124             #if ($version == DOC_UNCOMPRESSED) {
125             #print STDERR " DOC_UNCOMPRESSED\n";
126             #}
127             #if ($version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED) {
128             #print STDERR " probably HUFFDIC_COMPRESSED - CANNOT BE DECOMPRESSED!!!\n";
129             #}
130             # ***********************
131             # END: BORIS DAEPPEN JULY 2012
132              
133             # the header is followed by a list of record sizes. We don't use
134             # this since we can guess the sizes pretty easily by looking at
135             # the actual records.
136              
137             # According to the spec, $version is either 1 (uncompressed)
138             # or 2 (compress), while spare is always zero. AportisDoc supposedly sets
139             # spare to something else, so screw AportisDoc.
140 3 50 33     18 return undef if $version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED;
141 3 50       8 return undef if $spare != 0;
142              
143 3         4 $record->{'version'} = $version;
144 3         5 $record->{'length'} = $ulen;
145 3         5 $record->{'records'} = $records;
146 3         4 $record->{'recsize'} = $recsize;
147 3         6 $record->{'position'} = $position;
148              
149 3         10 return $record;
150             }
151              
152             sub _compress_record($$) {
153 3     3   11 my ($version,$in) = @_;
154 3 50       12 return $in if $version == DOC_UNCOMPRESSED;
155              
156 3         5 my $out = '';
157              
158 3         5 my $lin = length $in;
159 3         5 my $i = 0;
160 3         9 while( $i < $lin ) {
161             # See http://patb.dyndns.org/Programming/PilotDoc.htm for the code type
162             # taxonomy.
163              
164             # Try type B compression first.
165             # If the next 3 to 10 bytes are already in the compressed buffer, we can
166             # encode them into a 2 byte sequence. Don't bother too close to the ends,
167             # however... Makes the boundary conditions simpler.
168 171 100 100     545 if( $i > 10 and $lin - $i > 10 ) {
169 116         133 my $chunk = '';
170 116         117 my $match = -1;
171              
172             # the preamble is what'll be in the decoders output buffer.
173 116         160 my $preamble = substr( $in, 0, $i );
174 116         220 for( my $j = 10; $j >= 3; $j -- ) {
175 905         1097 $chunk = substr( $in, $i, $j ); # grab next $j characters
176 905         963 $match = rindex( $preamble, $chunk ); # in the output?
177              
178             # type B code has a 2047 byte sliding window, so matches have to be
179             # within that range to be useful
180 905 100 66     1663 last if $match >= 0 and ($i - $match) <= 2047;
181 893         1690 $match = -1;
182             }
183              
184 116         135 my $n = length $chunk;
185 116 50 66     338 if( $match >= 0 and $n <= 10 and $n >= 3 ) {
      66        
186 12         14 my $m = $i - $match;
187              
188             # first 2 bits are 10, next 11 are offset, next 3 are length-3
189 12         87 $out .= pack( "n", 0x8000 + (($m<<3)&0x3ff8) + ($n-3) );
190              
191 12         13 $i += $n;
192              
193 12         30 next;
194             }
195             }
196            
197 159         216 my $ch = substr( $in, $i ++, 1 );
198 159         163 my $och = ord($ch);
199              
200             # Try type C compression.
201 159 100 100     559 if( $i+1 < $lin and $ch eq ' ' ) {
202 13         18 my $nch = substr( $in, $i, 1 );
203 13         17 my $onch = ord($nch);
204              
205 13 100 100     50 if( $onch >= 0x40 and $onch < 0x80 ) {
206             # space plus ASCII character compression
207 10         19 $out .= chr($onch ^ 0x80);
208 10         12 $i ++;
209              
210 10         24 next;
211             }
212             }
213              
214 149 100 66     636 if( $och == 0 or ($och >= 9 and $och < 0x80) ) {
      33        
215             # pass through
216 147         320 $out .= $ch;
217             } else {
218             # type A code. This is essentially an 'escape' like '\\' in strings.
219             # For efficiency, it's best to encode as long a sequence as
220             # possible with one copy. This might seem like it would cause us to miss
221             # out on a type B sequence, but in actuality keeping long binary strings
222             # together improves the likelyhood of a later type B sequence than
223             # interspersing them with x01's.
224              
225 2         4 my $next = substr($in,$i - 1);
226 2 50       12 if( $next =~ /([\x01-\x08\x80-\xff]{1,8})/o ) {
227 2         5 my $binseq = $1;
228 2         4 $out .= chr(length $binseq);
229 2         4 $out .= $binseq;
230 2         6 $i += length( $binseq ) - 1; # first char, $ch, is already counted
231             }
232             }
233             }
234              
235 3         14 return $out;
236             }
237              
238             # algorithm taken from makedoc7.cpp with reference to
239             # http://patb.dyndns.org/Programming/PilotDoc.htm and
240             # http://www.pyrite.org/doc_format.html
241             sub _decompress_record($$) {
242 0     0   0 my ($version,$in) = @_;
243 0 0       0 return $in if $version == DOC_UNCOMPRESSED;
244              
245 0         0 my $out = '';
246              
247 0         0 my $lin = length $in;
248 0         0 my $i = 0;
249 0         0 while( $i < $lin ) {
250 0         0 my $ch = substr( $in, $i ++, 1 );
251 0         0 my $och = ord($ch);
252              
253 0 0 0     0 if( $och >= 1 and $och <= 8 ) {
    0          
    0          
254             # copy this many bytes... basically a way to 'escape' data
255 0         0 $out .= substr( $in, $i, $och );
256 0         0 $i += $och;
257             } elsif( $och < 0x80 ) {
258             # pass through 0, 9-0x7f
259 0         0 $out .= $ch;
260             } elsif( $och >= 0xc0 ) {
261             # 0xc0-0xff are 'space' plus ASCII char
262 0         0 $out .= ' ';
263 0         0 $out .= chr($och ^ 0x80);
264             } else {
265             # 0x80-0xbf is sequence from already decompressed buffer
266 0         0 my $nch = substr( $in, $i ++, 1 );
267 0         0 $och = ($och << 8) + ord($nch);
268 0         0 my $m = ($och & 0x3fff) >> 3;
269 0         0 my $n = ($och & 0x7) + 3;
270              
271             # This isn't very perl-like, but a simple
272             # substr($out,$lo-$m,$n) doesn't work.
273 0         0 my $lo = length $out;
274 0         0 for( my $j = 0; $j < $n; $j ++, $lo ++ ) {
275 0 0       0 die "bad Doc compression" unless ($lo-$m) >= 0;
276 0         0 $out .= substr( $out, $lo-$m, 1 );
277             }
278             }
279             }
280              
281 0         0 return $out;
282             }
283              
284             sub Write {
285 3     3 1 7 my $self = shift;
286              
287 3         6 my $prc = $self->{attributes}{resource};
288 3 50       10 my $recs = $prc ? $self->{'resources'} : $self->{'records'};
289 3         5 my $header = $recs->[0];
290 3 50       31 unless( defined _parse_headerrec($header) ) {
291 0         0 die "@_: Doesn't appear to be a correct book...";
292             }
293              
294 3         20 $self->SUPER::Write(@_);
295             }
296              
297             =head2 text
298              
299             $text = $doc->text;
300              
301             Return the contents of the Doc database.
302              
303             $text = $doc->text( @text );
304              
305             Set the contents of the Doc book to the specified arguments. All the list arguments
306             will simply be concatenated together.
307              
308             =cut
309              
310             sub text {
311 0     0 1   my $self = shift;
312              
313 0           my $body = '';
314 0           my $prc = $self->{attributes}{resource};
315              
316 0 0         if( @_ > 0 ) {
    0          
317 0           $body = join( '', @_ );
318              
319 0           my $version = DOC_COMPRESSED;
320              
321 0           $self->{'records'} = [];
322 0           $self->{'resources'} = [];
323              
324             # first record is the header
325 0 0         my $header = $prc ? $self->append_Resource() : $self->append_Record();
326 0           $header->{'version'} = $version;
327 0           $header->{'length'} = 0;
328 0           $header->{'records'} = 0;
329 0           $header->{'recsize'} = DOC_RECSIZE;
330              
331             # break the document into record-sized chunks
332 0           for( my $i = 0; $i < length($body); $i += DOC_RECSIZE ) {
333 0 0         my $record = $prc ? $self->append_Resource : $self->append_Record;
334 0           my $chunk = substr($body,$i,DOC_RECSIZE);
335 0           $record->{'data'} = _compress_record( $version, $chunk );
336              
337 0           $header->{'records'} ++;
338 0           $header->{'length'} += length $body;
339             }
340              
341             $header->{'recsize'} = $header->{'length'}
342 0 0         if $header->{'length'} < DOC_RECSIZE;
343              
344             # pack up the header
345             $header->{'data'} = pack( 'n xx N n n N',
346             $header->{'version'}, $header->{'length'},
347 0           $header->{'records'}, $header->{'recsize'}, 0 );
348              
349             } elsif( defined wantarray ) {
350              
351 0 0         my $recs = $prc ? $self->{'resources'} : $self->{'records'};
352              
353 0           my $header = $recs->[0];
354 0 0         if( defined _parse_headerrec($header) ) {
355             # a proper Doc file should be fine, but if it's not Doc
356             # compression like some Mobi docs seem to be we want to
357             # bail early. Otherwise we end up with a huge stream of
358             # substr() errors and we _still_ don't get any content.
359 0           eval {
360 0 0   0 0   sub min { return ($_[0]<$_[1]) ? $_[0] : $_[1] }
361 0           my $maxi = min($#$recs, $header->{'records'});
362 0           for( my $i = 1; $i <= $maxi; $i ++ ) {
363 0           my $data = $recs->[$i]->{'data'};
364 0           my $len = length($data);
365 0           my $overlap = "";
366 0 0         if ($self->{multibyteoverlap}) {
367 0           my $c = chop $data;
368 0           print STDERR "I:$i - $len - ", int($c), "\n";
369 0           my $n = $c & 7;
370 0           foreach (0..$n-1) {
371 0           $overlap = (chop $data) . $overlap;
372             }
373             }
374            
375 0           $body .= _decompress_record( $header->{'version'},
376             $data );
377 0           $body .= $overlap;
378             }
379             };
380 0 0         return undef if $@;
381             }
382             }
383              
384 0           return $body;
385             }
386              
387             =head2 textfile
388              
389             $doc->textfile( "README.txt" );
390              
391             Set the contents of the Doc to the contents of the file and sets the name of the PDB to
392             the specified filename.
393              
394             =cut
395              
396             sub textfile($$) {
397 0     0 1   my ($self, $filename) = @_;
398              
399 0 0         open IN, "< $filename" or return undef;
400 0           binmode IN;
401 0           $self->text( '', );
402 0           close IN;
403              
404 0           $self->{'name'} = $filename;
405             }
406              
407             1;
408             __END__