File Coverage

blib/lib/Text/Extract/Word.pm
Criterion Covered Total %
statement 233 254 91.7
branch 32 48 66.6
condition 1 3 33.3
subroutine 29 31 93.5
pod 8 8 100.0
total 303 344 88.0


line stmt bran cond sub pod time code
1             package Text::Extract::Word;
2            
3 4     4   106837 use strict;
  4         12  
  4         168  
4 4     4   22 use warnings;
  4         9  
  4         191  
5            
6             our $VERSION = 0.02;
7            
8 4     4   20 use base qw(Exporter);
  4         14  
  4         677  
9            
10             our @EXPORT_OK = qw(get_all_text);
11            
12             #use Smart::Comments;
13            
14 4     4   23 use Carp;
  4         6  
  4         387  
15 4     4   10400 use Encode;
  4         60512  
  4         588  
16 4     4   3864 use POSIX;
  4         35835  
  4         28  
17 4     4   26931 use OLE::Storage_Lite;
  4         118927  
  4         186  
18 4     4   41 use IO::File;
  4         7  
  4         1084  
19 4     4   22 use Scalar::Util;
  4         7  
  4         7677  
20            
21             sub new {
22 10     10 1 1214 my ($this, @options) = @_;
23 10   33     75 my $class = ref($this) || $this;
24            
25 10         21 my $self = { };
26 10         24 bless $self, $class;
27 10         34 _initialize($self, @options);
28 10         45 return $self;
29             }
30            
31             sub _initialize {
32 10     10   23 my ($self, @options) = @_;
33 10         19 my $value = shift(@options);
34 10 50       34 if (@options) {
35 0         0 carp("Ignored additional parameters to constructor");
36             }
37 10 50       475 if (Scalar::Util::openhandle($value)) {
    50          
38 0         0 $self->{_fh} = $value;
39             } elsif (-e $value) {
40 10         91 my $oIo = IO::File->new();
41 10 50       418 $oIo->open($value, "<") or croak("Can't open $value: $!");
42 10         767 binmode($oIo);
43 10         48 $self->{_fh} = $oIo;
44             } else {
45 0         0 croak("Invalid parameter to constructor: $value should be a file handle or file name");
46             }
47 10         32 _extract_stream($self);
48             }
49            
50             sub _compare_ranges {
51 0     0   0 my ($range1, $range2) = @_;
52 0         0 return ($range1->[0] <=> $range2->[0]);
53             }
54            
55             sub _extract_stream {
56 10     10   23 my ($self) = @_;
57            
58 10         21 my $fh = $self->{_fh};
59 10         96 my $ofs = OLE::Storage_Lite->new($fh);
60 10         117 my $name = encode("UCS-2LE", "WordDocument");
61 10         19170 my @pps = $ofs->getPpsSearch([$name], 1, 1);
62 10 50       23250 croak("This does not seem to be a Word document") unless (@pps);
63            
64             # OK, at this stage, we have the word stream. Now we need to start reading from it.
65 10         33 my $data = $pps[0]->{Data};
66 10         34 $self->{_data} = $data;
67            
68 10         35 my $magic = unpack("v", substr($data, 0x0000, 2));
69 10 50       66 croak(sprintf("This does not seem to be a Word document, but it is pretending to be one: %x", $magic)) unless ($magic == 0xa5ec);
70            
71 10         26 my $flags = unpack("v", substr($data, 0x000A, 2));
72 10 50       96 my $table = ($flags & 0x0200) ? "1Table" : "0Table";
73 10         41 $table = encode("UCS-2LE", $table);
74            
75 10         289 @pps = $ofs->getPpsSearch([$table], 1, 1);
76 10 50       20899 confess("Internal error: could not locate table stream") unless (@pps);
77            
78 10         28 $table = $pps[0]->{Data};
79 10         25 $self->{_table} = $table;
80            
81 10         27 my $fcMin = unpack("V", substr($data, 0x0018, 4));
82 10         21 my $ccpText = unpack("V", substr($data, 0x004c, 4));
83 10         23 my $ccpFtn = unpack("V", substr($data, 0x0050, 4));
84 10         22 my $ccpHdd = unpack("V", substr($data, 0x0054, 4));
85 10         27 my $ccpAtn = unpack("V", substr($data, 0x005c, 4));
86            
87 10         20 $self->{_fcMin} = $fcMin;
88 10         22 $self->{_ccpText} = $ccpText;
89 10         16 $self->{_ccpFtn} = $ccpFtn;
90 10         19 $self->{_ccpHdd} = $ccpHdd;
91 10         27 $self->{_ccpAtn} = $ccpAtn;
92            
93 10         22 my $charPLC = unpack("V", substr($data, 0x00fa, 4));
94 10         23 my $charPlcSize = unpack("V", substr($data, 0x00fe, 4));
95 10         21 my $parPLC = unpack("V", substr($data, 0x0102, 4));
96 10         20 my $parPlcSize = unpack("V", substr($data, 0x0106, 4));
97            
98             # get the location of the piece table
99 10         23 my $complexOffset = unpack("V", substr($data, 0x01a2, 4));
100            
101             ### fcMin: $fcMin
102             ### ccpText: $ccpText
103             ### ccpFtn: $ccpFtn
104             ### ccpHdd: $ccpHdd
105             ### ccpAtn: $ccpAtn
106             ### end: $ccpText + $ccpFtn + $ccpHdd + $ccpAtn
107            
108             # Read character positioning data positions
109 10         32 my $fcPlcfBteChpx = unpack("V", substr($data, 0x0fa, 4));
110 10         20 my $lcbPlcfBteChpx = unpack("V", substr($data, 0x0fe, 4));
111 10         23 $self->{_fcPlcfBteChpx} = $fcPlcfBteChpx;
112 10         22 $self->{_lcbPlcfBteChpx} = $lcbPlcfBteChpx;
113            
114 10         35 _get_bookmarks($self);
115            
116 10         36 my @pieces = _find_text(\$table, $complexOffset);
117 10         29 @pieces = sort { $a->{start} <=> $b->{start} } @pieces;
  12         25  
118            
119 10         34 _get_text(\$data, \@pieces);
120            
121 10         109 $self->{_pieces} = \@pieces;
122             }
123            
124             sub _get_bookmarks {
125 10     10   15 my ($self) = @_;
126            
127             # Now to look for bookmark information
128 10         65 my $fcSttbfBkmk = unpack("V", substr($self->{_data}, 0x0142, 4));
129 10         24 my $lcbSttbfBkmk = unpack("V", substr($self->{_data}, 0x0146, 4));
130 10         23 my $fcPlcfBkf = unpack("V", substr($self->{_data}, 0x014a, 4));
131 10         23 my $lcbPlcfBkf = unpack("V", substr($self->{_data}, 0x014e, 4));
132 10         21 my $fcPlcfBkl = unpack("V", substr($self->{_data}, 0x0152, 4));
133 10         22 my $lcbPlcfBkl = unpack("V", substr($self->{_data}, 0x0156, 4));
134             ### fcSttbfBkmk: $fcSttbfBkmk
135             ### lcbSttbfBkmk: $lcbSttbfBkmk
136             ### fcPlcfBkf: $fcPlcfBkf
137             ### lcbPlcfBkf: $lcbPlcfBkf
138             ### fcPlcfBkl: $fcPlcfBkl
139             ### lcbPlcfBkl: $lcbPlcfBkl
140            
141 10 100       32 return if ($lcbSttbfBkmk == 0);
142            
143             # Read the bookmark name block
144 2         10 my $sttbfBkmk = substr($self->{_table}, $fcSttbfBkmk, $lcbSttbfBkmk);
145 2         5 my $plcfBkf = substr($self->{_table}, $fcPlcfBkf, $lcbPlcfBkf);
146 2         5 my $plcfBkl = substr($self->{_table}, $fcPlcfBkl, $lcbPlcfBkl);
147            
148             # Now we can read the bookmark names
149            
150 2         13 my $fcExtend = unpack("v", substr($sttbfBkmk, 0, 2));
151 2         3 my $cData = unpack("v", substr($sttbfBkmk, 2, 2));
152 2         4 my $cbExtra = unpack("v", substr($sttbfBkmk, 4, 2));
153 2 50       6 confess("Internal error: unexpected single-byte bookmark data") unless ($fcExtend == 0xffff);
154            
155 2         2 my $offset = 6;
156 2         3 my $index = 0;
157 2         5 my %bookmarks = ();
158 2         6 while($offset < $lcbSttbfBkmk) {
159 70         111 my $length = unpack("v", substr($sttbfBkmk, $offset, 2));
160 70         66 $length = $length * 2;
161 70         106 my $string = substr($sttbfBkmk, $offset + 2, $length);
162 70         132 my $cpStart = unpack("V", substr($plcfBkf, $index * 4, 4));
163 70         97 my $cpEnd = unpack("V", substr($plcfBkl, $index * 4, 4));
164 70         149 $string = Encode::decode("UCS-2LE", $string);
165             ### field name: $string
166             ### position: $cpStart
167             ### position: $cpEnd
168 70         1321 $bookmarks{$string} = {start => $cpStart, end => $cpEnd};
169 70         234 $offset += $length + 2;
170 70         143 $index++;
171             }
172            
173 2         8 $self->{_bookmarks} = \%bookmarks;
174             }
175            
176             sub _get_piece {
177 17     17   34 my ($dataref, $piece) = @_;
178            
179 17         35 my $pstart = $piece->{start};
180 17         24 my $ptotLength = $piece->{totLength};
181 17         44 my $pfilePos = $piece->{filePos};
182 17         21 my $punicode = $piece->{unicode};
183            
184 17         24 my $pend = $pstart + $ptotLength;
185 17         23 my $textStart = $pfilePos;
186 17         23 my $textEnd = $textStart + ($pend - $pstart);
187            
188 17 100       40 if ($punicode) {
189             ### Adding ucs2 text...
190             ### Start: $textStart
191             ### End: $textEnd
192             ### Length: $textEnd - $textStart
193             ### Bytes: $ptotLength
194 8         20 $piece->{text} = _add_unicode_text($textStart, $textEnd, $dataref);
195 8         15 return;
196             } else {
197             ### Adding iso8869 text...
198             ### Start: $textStart
199             ### End: $textEnd
200             ### Length: $textEnd - $textStart
201             ### Bytes: $ptotLength
202 9         31 $piece->{text} = _add_text($textStart, $textEnd, $dataref);
203 9         21 return;
204             }
205             }
206            
207             sub _get_text {
208 10     10   18 my ($dataref, $piecesref) = @_;
209            
210 10         24 my @pieces = @$piecesref;
211 10         19 my @result = ();
212 10         17 my $index = 1;
213 10         12 my $position = 0;
214            
215 10         22 foreach my $piece (@pieces) {
216            
217             ### piece: $index++
218             ### position: $position
219 17         37 $piece->{position} = $position;
220            
221 17         51 _get_piece($dataref, $piece);
222 17         27 my $segment = $piece->{text};
223 17         32 push @result, $segment;
224 17         267 my $length = length($segment);
225 17         31 $piece->{length} = $length;
226 17         47 $piece->{endPosition} = $position + $length;
227            
228 17         58 $position += $length;
229             }
230            
231             ### End position: $position
232 10         25 return;
233             }
234            
235             sub _add_unicode_text {
236 8     8   12 my ($textStart, $textEnd, $dataref) = @_;
237            
238 8         23 my $string = substr($$dataref, $textStart, 2*($textEnd - $textStart));
239            
240 8         28 my $perl_string = Encode::decode("UCS-2LE", $string);
241 8         224 return $perl_string;
242             }
243            
244             sub _add_text {
245 9     9   15 my ($textStart, $textEnd, $dataref) = @_;
246            
247 9         107 my $string = substr($$dataref, $textStart, $textEnd - $textStart);
248            
249 9         35 my $perl_string = Encode::decode("iso-8859-1", $string);
250            
251             # See the conversion table for FcCompressed structures. Note that these
252             # should not affect positions, as these are characters now, not bytes
253 4     4   4261 $perl_string =~ tr[\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9f][\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{0178}];
  4         39  
  4         53  
  9         2427  
254            
255 9         38 return $perl_string;
256             }
257            
258             sub _get_chunks {
259 0     0   0 my ($start, $length, $piecesref) = @_;
260 0         0 my @result = ();
261 0         0 my $end = $start + $length;
262            
263 0         0 foreach my $piece (@$piecesref) {
264 0         0 my ($pstart, $ptotLength, $pfilePos, $punicode) = @$piece;
265 0         0 my $pend = $pstart + $ptotLength;
266 0 0       0 if ($pstart < $end) {
267 0 0       0 if ($start < $pend) {
268 0         0 push @result, $piece;
269             }
270             } else {
271 0         0 last;
272             }
273             }
274            
275 0         0 return @result;
276             }
277            
278             sub _find_text {
279 10     10   18 my ($tableref, $pos) = @_;
280            
281 10         16 my @pieces = ();
282            
283 10         41 while(unpack("C", substr($$tableref, $pos, 1)) == 1) {
284 0         0 $pos++;
285 0         0 my $skip = unpack("v", substr($$tableref, $pos, 2));
286             # print STDERR sprintf("Skipping %d\n", $skip);
287 0         0 $pos += 2 + $skip;
288             }
289            
290 10 50       35 if (unpack("C", substr($$tableref, $pos, 1)) != 2) {
291 0         0 confess("Internal error: ccorrupted Word file");
292             } else {
293 10         28 my $pieceTableSize = unpack("V", substr($$tableref, ++$pos, 4));
294             # print STDERR sprintf("pieceTableSize: %d\n", $pieceTableSize);
295            
296 10         15 $pos += 4;
297 10         21 my $pieces = ($pieceTableSize - 4) / 12;
298             # print STDERR sprintf("pieces: %d\n", $pieces);
299 10         15 my $start = 0;
300            
301 10         38 for (my $x = 0; $x < $pieces; $x++) {
302 17         53 my $filePos = unpack("V", substr($$tableref, $pos + (($pieces + 1) * 4) + ($x * 8) + 2, 4));
303 17         23 my $unicode = 0;
304 17 100       36 if (($filePos & 0x40000000) == 0) {
305 8         12 $unicode = 1;
306             } else {
307 9         12 $unicode = 0;
308 9         19 $filePos &= ~(0x40000000); #gives me FC in doc stream
309 9         14 $filePos /= 2;
310             }
311             # print STDERR sprintf("filePos: %x\n", $filePos);
312 17         36 my $lStart = unpack("V", substr($$tableref, $pos + ($x * 4), 4));
313 17         36 my $lEnd = unpack("V", substr($$tableref, $pos + (($x + 1) * 4), 4));
314 17         25 my $totLength = $lEnd - $lStart;
315            
316             # print STDERR "lStart: $lStart; lEnd: $lEnd\n";
317            
318             # print STDERR ("Piece: " . (1 + $x) . ", start=" . $start
319             # . ", len=" . $totLength . ", phys=" . $filePos
320             # . ", uni=" .$unicode . "\n");
321            
322             # TextPiece piece = new TextPiece(start, totLength, filePos, unicode);
323             # start = start + totLength;
324             # text.add(piece);
325            
326 17         74 push @pieces, {start => $start,
327             totLength => $totLength,
328             filePos => $filePos,
329             unicode => $unicode};
330 17 100       68 $start = $start + (($unicode) ? $totLength/2 : $totLength);
331             }
332             }
333 10         32 return @pieces;
334             }
335            
336             sub _get_piece_index {
337 172     172   174 my ($self, $position) = @_;
338 172 50       317 confess("Internal error: invalid position") if (! defined($position));
339 172         297 my $index = 0;
340 172         175 foreach my $piece (@{$self->{_pieces}}) {
  172         326  
341 805 100       1400 return $index if ($position <= $piece->{endPosition});
342 633         565 $index++;
343             }
344             }
345            
346             sub _get_text_range {
347 86     86   100 my ($self, $start, $end) = @_;
348            
349 86         110 my $pieces = $self->{_pieces};
350 86         151 my $start_piece = _get_piece_index($self, $start);
351 86         143 my $end_piece = _get_piece_index($self, $end);
352 86         123 my @result = ();
353 86         184 for(my $i = $start_piece; $i <= $end_piece; $i++) {
354 101         116 my $piece = $pieces->[$i];
355 101 100       196 my $xstart = ($i == $start_piece) ? $start - $piece->{position} : 0;
356 101 100       150 my $xend = ($i == $end_piece) ? $end - $piece->{position} : $piece->{endPosition};
357 101         683 push @result, substr($piece->{text}, $xstart, $xend - $xstart);
358             }
359            
360 86         500 return join("", @result);
361             }
362            
363             sub get_bookmarks {
364 10     10 1 1816 my ($self, $filter) = @_;
365 10         21 my $bookmarks = $self->{_bookmarks};
366 10         93 my @bookmark_names = sort keys %$bookmarks;
367 10         25 foreach my $name (@bookmark_names) {
368 72         127 my $bookmark = $bookmarks->{$name};
369 72 100       140 next if (exists($bookmark->{value}));
370 70         79 my $start = $bookmark->{start};
371 70         74 my $end = $bookmark->{end};
372 70         134 my $value = _get_text_range($self, $start - 1, $end);
373 70 100       174 if (substr($value, 0, 1) ne chr(19)) {
374 1         4 $value = substr($value, 1);
375             }
376 70         164 $bookmark->{value} = $value;
377             ### name: $name
378             ### value: $value
379             }
380            
381 10         25 return { map { ($_ => _filter($bookmarks->{$_}->{value}, $filter) ) } @bookmark_names };
  72         193  
382             }
383            
384             sub get_body {
385 3     3 1 14 my ($self, $filter) = @_;
386 3         8 my $start = 0;
387 3         13 return _filter(_get_text_range($self, $start, $start + $self->{_ccpText}), $filter);
388             }
389            
390             sub get_footnotes {
391 2     2 1 5 my ($self, $filter) = @_;
392 2         4 my $start = $self->{_ccpText};
393 2         6 return _filter(_get_text_range($self, $start, $start + $self->{_ccpFtn}), $filter);
394             }
395            
396             sub get_headers {
397 2     2 1 8 my ($self, $filter) = @_;
398 2         5 my $start = $self->{_ccpText} + $self->{_ccpFtn};
399 2         7 return _filter(_get_text_range($self, $start, $start + $self->{_ccpHdd}), $filter);
400             }
401            
402             sub get_annotations {
403 2     2 1 4 my ($self, $filter) = @_;
404 2         4 my $start = $self->{_ccpText} + $self->{_ccpFtn} + $self->{_ccpHdd};
405 2         10 return _filter(_get_text_range($self, $start, $start + $self->{_ccpAtn}), $filter);
406             }
407            
408             sub get_text {
409 1     1 1 2 my ($self, $filter) = @_;
410 1         3 return $self->get_body($filter) .
411             $self->get_footnotes($filter) .
412             $self->get_headers($filter) .
413             $self->get_annotations($filter);
414             }
415            
416             sub _filter {
417 81     81   107 my ($text, $filter) = @_;
418 81 100       125 if (! defined($filter)) {
    50          
419 79         486 $text =~ tr/\x02\x05\x08//d;
420 79         505 $text =~ tr/\x{2018}\x{2019}\x{201c}\x{201d}\x{0007}\x{000d}\x{2002}\x{2003}\x{2012}\x{2013}\x{2014}/''""\t\n \-\-\-/;
421 79         546 $text =~ s{\cS(?:[^\cT]*\cT)([^\cU]*)\cU}{$1}g;
422 79         319 $text =~ s{\cS(?:[^\cU]*\cU)}{}g;
423 79         343 $text =~ s{[\cJ\cM]}{\n}g;
424             } elsif ($filter eq ':raw') {
425             # Do nothing
426             } else {
427 0         0 croak("Invalid filter type: $filter");
428             }
429 81         256 return $text;
430             }
431            
432             sub get_all_text {
433 7     7 1 13199 my ($file) = @_;
434            
435 7         54 my $instance = __PACKAGE__->new($file);
436            
437 7         28 $instance->get_bookmarks();
438 7         38 return _get_text_range($instance, 0, $instance->{_ccpText} +
439             $instance->{_ccpFtn} +
440             $instance->{_ccpHdd} +
441             $instance->{_ccpAtn});
442             }
443            
444             1;
445            
446             =head1 NAME
447            
448             Text::Extract::Word - Extract text from Word files
449            
450             =head1 SYNOPSIS
451            
452             # object-based interface
453             use Text::Extract::Word;
454             my $file = Text::Extract::Word->new("test1.doc");
455             my $text = $file->get_text();
456             my $body = $file->get_body();
457             my $footnotes = $file->get_footnotes();
458             my $headers = $file->get_headers();
459             my $annotations = $file->get_annotations();
460             my $bookmarks = $file->get_bookmarks();
461            
462             # specify :raw if you don't want the text cleaned
463             my $raw = $file->get_text(':raw');
464            
465             # legacy interface
466             use Text::Extract::Word qw(get_all_text);
467             my $text = get_all_text("test1.doc");
468            
469             =head1 DESCRIPTION
470            
471             This simple module allows the textual contents to be extracted from a Word file.
472             The code was ported from Java code, originally part of the Apache POE project, but
473             extensive code changes were made internally.
474            
475             =head1 OBJECT-BASED INTERFACE
476            
477             =head2 Text::Extract::Word->new($input);
478            
479             Passed either a file name or an open file handle, this constructor returns an
480             instance that can be used to query the file contents.
481            
482             =head1 METHODS
483            
484             All the query methods accept an optional filter argument that can take the value
485             ':raw' -- if this is passed the original Word file contents will be returned without
486             any attempt to clean the text.
487            
488             The default filter attempts to remove Word internal characters used to identify
489             fields (including field instructions), and translate common Unicode 'fancy' quotes
490             into more conventional ISO-8859-1 equivalents, for ease of processing. Table cell
491             markers are also translated into tabs, and paragraph marks into Perl newlines.
492            
493             =head2 get_body([$filter])
494            
495             Returns the text for the main body of the Word document. This excludes headers,
496             footers, and annotations.
497            
498             =head2 get_headers([$filter])
499            
500             Returns the header and footer texts for the Word document, as a single scalar
501             string.
502            
503             =head2 get_footnotes([$filter])
504            
505             Returns the footnote and endnode texts for the Word document, as a single scalar
506             string.
507            
508             =head2 get_annotations([$filter])
509            
510             Returns the annotation texts for the Word document, as a single scalar
511             string.
512            
513             =head2 get_text([$filter])
514            
515             Returns the concatenated text from the body, headers, footnotes, and annotations
516             of the the Word document, as a single scalar string.
517            
518             =head2 get_bookmarks([$filter])
519            
520             Returns the bookmark texts for the Word document, as a hash reference. The keys
521             in the hash are the bookmark names (Word requires that these are unique) and
522             the values are the filtered bookmark texts.
523            
524             This method can be used to get Word form text data out of a Word file. All text fields
525             in a Word form will normally be labelled as bookmarks, and will be returned by this
526             method. Non-textual form fields (including drop-downs) will not be returned, as these
527             are not labelled as bookmarks.
528            
529             =head1 FUNCTIONS
530            
531             =head2 get_all_text($filename)
532            
533             The only function exportable by this module, when called on a file name, returns the
534             raw text contents of the Word file. The contents are returned as UTF-8 encoded text.
535             This is unfiltered, for compatibility with previous versions of the module.
536            
537             =head1 TODO
538            
539             =over 4
540            
541             =item *
542            
543             handle non-textual form fields
544            
545             =back
546            
547             =head1 BUGS
548            
549             =over 4
550            
551             =item *
552            
553             support for legacy Word - the module does not extract text from Word version 6 or earlier
554            
555             =back
556            
557             =head1 SEE ALSO
558            
559             L also has a script C (Let's Have a Look at Word) which extracts
560             text from Word files. This is simply a much smaller module with lighter dependencies,
561             using L for its storage management.
562            
563             =head1 AUTHOR
564            
565             Stuart Watt, stuart@morungos.com
566            
567             =head1 COPYRIGHT
568            
569             Copyright (c) 2010 Stuart Watt. All rights reserved.
570            
571             =cut
572