File Coverage

blib/lib/Text/Extract/Word.pm

Criterion	Covered	Total	%
statement	233	254	91.7
branch	32	48	66.6
condition	1	3	33.3
subroutine	29	31	93.5
pod	8	8	100.0
total	303	344	88.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Extract::Word;
2
3	4			4		106837	use strict;
	4					12
	4					168
4	4			4		22	use warnings;
	4					9
	4					191
5
6							our $VERSION = 0.02;
7
8	4			4		20	use base qw(Exporter);
	4					14
	4					677
9
10							our @EXPORT_OK = qw(get_all_text);
11
12							#use Smart::Comments;
13
14	4			4		23	use Carp;
	4					6
	4					387
15	4			4		10400	use Encode;
	4					60512
	4					588
16	4			4		3864	use POSIX;
	4					35835
	4					28
17	4			4		26931	use OLE::Storage_Lite;
	4					118927
	4					186
18	4			4		41	use IO::File;
	4					7
	4					1084
19	4			4		22	use Scalar::Util;
	4					7
	4					7677
20
21							sub new {
22	10			10	1	1214	my ($this, @options) = @_;
23	10		33			75	my $class = ref($this) \|\| $this;
24
25	10					21	my $self = { };
26	10					24	bless $self, $class;
27	10					34	_initialize($self, @options);
28	10					45	return $self;
29							}
30
31							sub _initialize {
32	10			10		23	my ($self, @options) = @_;
33	10					19	my $value = shift(@options);
34	10	50				34	if (@options) {
35	0					0	carp("Ignored additional parameters to constructor");
36							}
37	10	50				475	if (Scalar::Util::openhandle($value)) {
		50
38	0					0	$self->{_fh} = $value;
39							} elsif (-e $value) {
40	10					91	my $oIo = IO::File->new();
41	10	50				418	$oIo->open($value, "<") or croak("Can't open $value: $!");
42	10					767	binmode($oIo);
43	10					48	$self->{_fh} = $oIo;
44							} else {
45	0					0	croak("Invalid parameter to constructor: $value should be a file handle or file name");
46							}
47	10					32	_extract_stream($self);
48							}
49
50							sub _compare_ranges {
51	0			0		0	my ($range1, $range2) = @_;
52	0					0	return ($range1->[0] <=> $range2->[0]);
53							}
54
55							sub _extract_stream {
56	10			10		23	my ($self) = @_;
57
58	10					21	my $fh = $self->{_fh};
59	10					96	my $ofs = OLE::Storage_Lite->new($fh);
60	10					117	my $name = encode("UCS-2LE", "WordDocument");
61	10					19170	my @pps = $ofs->getPpsSearch([$name], 1, 1);
62	10	50				23250	croak("This does not seem to be a Word document") unless (@pps);
63
64							# OK, at this stage, we have the word stream. Now we need to start reading from it.
65	10					33	my $data = $pps[0]->{Data};
66	10					34	$self->{_data} = $data;
67
68	10					35	my $magic = unpack("v", substr($data, 0x0000, 2));
69	10	50				66	croak(sprintf("This does not seem to be a Word document, but it is pretending to be one: %x", $magic)) unless ($magic == 0xa5ec);
70
71	10					26	my $flags = unpack("v", substr($data, 0x000A, 2));
72	10	50				96	my $table = ($flags & 0x0200) ? "1Table" : "0Table";
73	10					41	$table = encode("UCS-2LE", $table);
74
75	10					289	@pps = $ofs->getPpsSearch([$table], 1, 1);
76	10	50				20899	confess("Internal error: could not locate table stream") unless (@pps);
77
78	10					28	$table = $pps[0]->{Data};
79	10					25	$self->{_table} = $table;
80
81	10					27	my $fcMin = unpack("V", substr($data, 0x0018, 4));
82	10					21	my $ccpText = unpack("V", substr($data, 0x004c, 4));
83	10					23	my $ccpFtn = unpack("V", substr($data, 0x0050, 4));
84	10					22	my $ccpHdd = unpack("V", substr($data, 0x0054, 4));
85	10					27	my $ccpAtn = unpack("V", substr($data, 0x005c, 4));
86
87	10					20	$self->{_fcMin} = $fcMin;
88	10					22	$self->{_ccpText} = $ccpText;
89	10					16	$self->{_ccpFtn} = $ccpFtn;
90	10					19	$self->{_ccpHdd} = $ccpHdd;
91	10					27	$self->{_ccpAtn} = $ccpAtn;
92
93	10					22	my $charPLC = unpack("V", substr($data, 0x00fa, 4));
94	10					23	my $charPlcSize = unpack("V", substr($data, 0x00fe, 4));
95	10					21	my $parPLC = unpack("V", substr($data, 0x0102, 4));
96	10					20	my $parPlcSize = unpack("V", substr($data, 0x0106, 4));
97
98							# get the location of the piece table
99	10					23	my $complexOffset = unpack("V", substr($data, 0x01a2, 4));
100
101							### fcMin: $fcMin
102							### ccpText: $ccpText
103							### ccpFtn: $ccpFtn
104							### ccpHdd: $ccpHdd
105							### ccpAtn: $ccpAtn
106							### end: $ccpText + $ccpFtn + $ccpHdd + $ccpAtn
107
108							# Read character positioning data positions
109	10					32	my $fcPlcfBteChpx = unpack("V", substr($data, 0x0fa, 4));
110	10					20	my $lcbPlcfBteChpx = unpack("V", substr($data, 0x0fe, 4));
111	10					23	$self->{_fcPlcfBteChpx} = $fcPlcfBteChpx;
112	10					22	$self->{_lcbPlcfBteChpx} = $lcbPlcfBteChpx;
113
114	10					35	_get_bookmarks($self);
115
116	10					36	my @pieces = _find_text(\$table, $complexOffset);
117	10					29	@pieces = sort { $a->{start} <=> $b->{start} } @pieces;
	12					25
118
119	10					34	_get_text(\$data, \@pieces);
120
121	10					109	$self->{_pieces} = \@pieces;
122							}
123
124							sub _get_bookmarks {
125	10			10		15	my ($self) = @_;
126
127							# Now to look for bookmark information
128	10					65	my $fcSttbfBkmk = unpack("V", substr($self->{_data}, 0x0142, 4));
129	10					24	my $lcbSttbfBkmk = unpack("V", substr($self->{_data}, 0x0146, 4));
130	10					23	my $fcPlcfBkf = unpack("V", substr($self->{_data}, 0x014a, 4));
131	10					23	my $lcbPlcfBkf = unpack("V", substr($self->{_data}, 0x014e, 4));
132	10					21	my $fcPlcfBkl = unpack("V", substr($self->{_data}, 0x0152, 4));
133	10					22	my $lcbPlcfBkl = unpack("V", substr($self->{_data}, 0x0156, 4));
134							### fcSttbfBkmk: $fcSttbfBkmk
135							### lcbSttbfBkmk: $lcbSttbfBkmk
136							### fcPlcfBkf: $fcPlcfBkf
137							### lcbPlcfBkf: $lcbPlcfBkf
138							### fcPlcfBkl: $fcPlcfBkl
139							### lcbPlcfBkl: $lcbPlcfBkl
140
141	10	100				32	return if ($lcbSttbfBkmk == 0);
142
143							# Read the bookmark name block
144	2					10	my $sttbfBkmk = substr($self->{_table}, $fcSttbfBkmk, $lcbSttbfBkmk);
145	2					5	my $plcfBkf = substr($self->{_table}, $fcPlcfBkf, $lcbPlcfBkf);
146	2					5	my $plcfBkl = substr($self->{_table}, $fcPlcfBkl, $lcbPlcfBkl);
147
148							# Now we can read the bookmark names
149
150	2					13	my $fcExtend = unpack("v", substr($sttbfBkmk, 0, 2));
151	2					3	my $cData = unpack("v", substr($sttbfBkmk, 2, 2));
152	2					4	my $cbExtra = unpack("v", substr($sttbfBkmk, 4, 2));
153	2	50				6	confess("Internal error: unexpected single-byte bookmark data") unless ($fcExtend == 0xffff);
154
155	2					2	my $offset = 6;
156	2					3	my $index = 0;
157	2					5	my %bookmarks = ();
158	2					6	while($offset < $lcbSttbfBkmk) {
159	70					111	my $length = unpack("v", substr($sttbfBkmk, $offset, 2));
160	70					66	$length = $length * 2;
161	70					106	my $string = substr($sttbfBkmk, $offset + 2, $length);
162	70					132	my $cpStart = unpack("V", substr($plcfBkf, $index * 4, 4));
163	70					97	my $cpEnd = unpack("V", substr($plcfBkl, $index * 4, 4));
164	70					149	$string = Encode::decode("UCS-2LE", $string);
165							### field name: $string
166							### position: $cpStart
167							### position: $cpEnd
168	70					1321	$bookmarks{$string} = {start => $cpStart, end => $cpEnd};
169	70					234	$offset += $length + 2;
170	70					143	$index++;
171							}
172
173	2					8	$self->{_bookmarks} = \%bookmarks;
174							}
175
176							sub _get_piece {
177	17			17		34	my ($dataref, $piece) = @_;
178
179	17					35	my $pstart = $piece->{start};
180	17					24	my $ptotLength = $piece->{totLength};
181	17					44	my $pfilePos = $piece->{filePos};
182	17					21	my $punicode = $piece->{unicode};
183
184	17					24	my $pend = $pstart + $ptotLength;
185	17					23	my $textStart = $pfilePos;
186	17					23	my $textEnd = $textStart + ($pend - $pstart);
187
188	17	100				40	if ($punicode) {
189							### Adding ucs2 text...
190							### Start: $textStart
191							### End: $textEnd
192							### Length: $textEnd - $textStart
193							### Bytes: $ptotLength
194	8					20	$piece->{text} = _add_unicode_text($textStart, $textEnd, $dataref);
195	8					15	return;
196							} else {
197							### Adding iso8869 text...
198							### Start: $textStart
199							### End: $textEnd
200							### Length: $textEnd - $textStart
201							### Bytes: $ptotLength
202	9					31	$piece->{text} = _add_text($textStart, $textEnd, $dataref);
203	9					21	return;
204							}
205							}
206
207							sub _get_text {
208	10			10		18	my ($dataref, $piecesref) = @_;
209
210	10					24	my @pieces = @$piecesref;
211	10					19	my @result = ();
212	10					17	my $index = 1;
213	10					12	my $position = 0;
214
215	10					22	foreach my $piece (@pieces) {
216
217							### piece: $index++
218							### position: $position
219	17					37	$piece->{position} = $position;
220
221	17					51	_get_piece($dataref, $piece);
222	17					27	my $segment = $piece->{text};
223	17					32	push @result, $segment;
224	17					267	my $length = length($segment);
225	17					31	$piece->{length} = $length;
226	17					47	$piece->{endPosition} = $position + $length;
227
228	17					58	$position += $length;
229							}
230
231							### End position: $position
232	10					25	return;
233							}
234
235							sub _add_unicode_text {
236	8			8		12	my ($textStart, $textEnd, $dataref) = @_;
237
238	8					23	my $string = substr($$dataref, $textStart, 2*($textEnd - $textStart));
239
240	8					28	my $perl_string = Encode::decode("UCS-2LE", $string);
241	8					224	return $perl_string;
242							}
243
244							sub _add_text {
245	9			9		15	my ($textStart, $textEnd, $dataref) = @_;
246
247	9					107	my $string = substr($$dataref, $textStart, $textEnd - $textStart);
248
249	9					35	my $perl_string = Encode::decode("iso-8859-1", $string);
250
251							# See the conversion table for FcCompressed structures. Note that these
252							# should not affect positions, as these are characters now, not bytes
253	4			4		4261	$perl_string =~ tr[\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9f][\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{0178}];
	4					39
	4					53
	9					2427
254
255	9					38	return $perl_string;
256							}
257
258							sub _get_chunks {
259	0			0		0	my ($start, $length, $piecesref) = @_;
260	0					0	my @result = ();
261	0					0	my $end = $start + $length;
262
263	0					0	foreach my $piece (@$piecesref) {
264	0					0	my ($pstart, $ptotLength, $pfilePos, $punicode) = @$piece;
265	0					0	my $pend = $pstart + $ptotLength;
266	0	0				0	if ($pstart < $end) {
267	0	0				0	if ($start < $pend) {
268	0					0	push @result, $piece;
269							}
270							} else {
271	0					0	last;
272							}
273							}
274
275	0					0	return @result;
276							}
277
278							sub _find_text {
279	10			10		18	my ($tableref, $pos) = @_;
280
281	10					16	my @pieces = ();
282
283	10					41	while(unpack("C", substr($$tableref, $pos, 1)) == 1) {
284	0					0	$pos++;
285	0					0	my $skip = unpack("v", substr($$tableref, $pos, 2));
286							# print STDERR sprintf("Skipping %d\n", $skip);
287	0					0	$pos += 2 + $skip;
288							}
289
290	10	50				35	if (unpack("C", substr($$tableref, $pos, 1)) != 2) {
291	0					0	confess("Internal error: ccorrupted Word file");
292							} else {
293	10					28	my $pieceTableSize = unpack("V", substr($$tableref, ++$pos, 4));
294							# print STDERR sprintf("pieceTableSize: %d\n", $pieceTableSize);
295
296	10					15	$pos += 4;
297	10					21	my $pieces = ($pieceTableSize - 4) / 12;
298							# print STDERR sprintf("pieces: %d\n", $pieces);
299	10					15	my $start = 0;
300
301	10					38	for (my $x = 0; $x < $pieces; $x++) {
302	17					53	my $filePos = unpack("V", substr($$tableref, $pos + (($pieces + 1) * 4) + ($x * 8) + 2, 4));
303	17					23	my $unicode = 0;
304	17	100				36	if (($filePos & 0x40000000) == 0) {
305	8					12	$unicode = 1;
306							} else {
307	9					12	$unicode = 0;
308	9					19	$filePos &= ~(0x40000000); #gives me FC in doc stream
309	9					14	$filePos /= 2;
310							}
311							# print STDERR sprintf("filePos: %x\n", $filePos);
312	17					36	my $lStart = unpack("V", substr($$tableref, $pos + ($x * 4), 4));
313	17					36	my $lEnd = unpack("V", substr($$tableref, $pos + (($x + 1) * 4), 4));
314	17					25	my $totLength = $lEnd - $lStart;
315
316							# print STDERR "lStart: $lStart; lEnd: $lEnd\n";
317
318							# print STDERR ("Piece: " . (1 + $x) . ", start=" . $start
319							# . ", len=" . $totLength . ", phys=" . $filePos
320							# . ", uni=" .$unicode . "\n");
321
322							# TextPiece piece = new TextPiece(start, totLength, filePos, unicode);
323							# start = start + totLength;
324							# text.add(piece);
325
326	17					74	push @pieces, {start => $start,
327							totLength => $totLength,
328							filePos => $filePos,
329							unicode => $unicode};
330	17	100				68	$start = $start + (($unicode) ? $totLength/2 : $totLength);
331							}
332							}
333	10					32	return @pieces;
334							}
335
336							sub _get_piece_index {
337	172			172		174	my ($self, $position) = @_;
338	172	50				317	confess("Internal error: invalid position") if (! defined($position));
339	172					297	my $index = 0;
340	172					175	foreach my $piece (@{$self->{_pieces}}) {
	172					326
341	805	100				1400	return $index if ($position <= $piece->{endPosition});
342	633					565	$index++;
343							}
344							}
345
346							sub _get_text_range {
347	86			86		100	my ($self, $start, $end) = @_;
348
349	86					110	my $pieces = $self->{_pieces};
350	86					151	my $start_piece = _get_piece_index($self, $start);
351	86					143	my $end_piece = _get_piece_index($self, $end);
352	86					123	my @result = ();
353	86					184	for(my $i = $start_piece; $i <= $end_piece; $i++) {
354	101					116	my $piece = $pieces->[$i];
355	101	100				196	my $xstart = ($i == $start_piece) ? $start - $piece->{position} : 0;
356	101	100				150	my $xend = ($i == $end_piece) ? $end - $piece->{position} : $piece->{endPosition};
357	101					683	push @result, substr($piece->{text}, $xstart, $xend - $xstart);
358							}
359
360	86					500	return join("", @result);
361							}
362
363							sub get_bookmarks {
364	10			10	1	1816	my ($self, $filter) = @_;
365	10					21	my $bookmarks = $self->{_bookmarks};
366	10					93	my @bookmark_names = sort keys %$bookmarks;
367	10					25	foreach my $name (@bookmark_names) {
368	72					127	my $bookmark = $bookmarks->{$name};
369	72	100				140	next if (exists($bookmark->{value}));
370	70					79	my $start = $bookmark->{start};
371	70					74	my $end = $bookmark->{end};
372	70					134	my $value = _get_text_range($self, $start - 1, $end);
373	70	100				174	if (substr($value, 0, 1) ne chr(19)) {
374	1					4	$value = substr($value, 1);
375							}
376	70					164	$bookmark->{value} = $value;
377							### name: $name
378							### value: $value
379							}
380
381	10					25	return { map { ($_ => _filter($bookmarks->{$_}->{value}, $filter) ) } @bookmark_names };
	72					193
382							}
383
384							sub get_body {
385	3			3	1	14	my ($self, $filter) = @_;
386	3					8	my $start = 0;
387	3					13	return _filter(_get_text_range($self, $start, $start + $self->{_ccpText}), $filter);
388							}
389
390							sub get_footnotes {
391	2			2	1	5	my ($self, $filter) = @_;
392	2					4	my $start = $self->{_ccpText};
393	2					6	return _filter(_get_text_range($self, $start, $start + $self->{_ccpFtn}), $filter);
394							}
395
396							sub get_headers {
397	2			2	1	8	my ($self, $filter) = @_;
398	2					5	my $start = $self->{_ccpText} + $self->{_ccpFtn};
399	2					7	return _filter(_get_text_range($self, $start, $start + $self->{_ccpHdd}), $filter);
400							}
401
402							sub get_annotations {
403	2			2	1	4	my ($self, $filter) = @_;
404	2					4	my $start = $self->{_ccpText} + $self->{_ccpFtn} + $self->{_ccpHdd};
405	2					10	return _filter(_get_text_range($self, $start, $start + $self->{_ccpAtn}), $filter);
406							}
407
408							sub get_text {
409	1			1	1	2	my ($self, $filter) = @_;
410	1					3	return $self->get_body($filter) .
411							$self->get_footnotes($filter) .
412							$self->get_headers($filter) .
413							$self->get_annotations($filter);
414							}
415
416							sub _filter {
417	81			81		107	my ($text, $filter) = @_;
418	81	100				125	if (! defined($filter)) {
		50
419	79					486	$text =~ tr/\x02\x05\x08//d;
420	79					505	$text =~ tr/\x{2018}\x{2019}\x{201c}\x{201d}\x{0007}\x{000d}\x{2002}\x{2003}\x{2012}\x{2013}\x{2014}/''""\t\n \-\-\-/;
421	79					546	$text =~ s{\cS(?:[^\cT]\cT)([^\cU])\cU}{$1}g;
422	79					319	$text =~ s{\cS(?:[^\cU]*\cU)}{}g;
423	79					343	$text =~ s{[\cJ\cM]}{\n}g;
424							} elsif ($filter eq ':raw') {
425							# Do nothing
426							} else {
427	0					0	croak("Invalid filter type: $filter");
428							}
429	81					256	return $text;
430							}
431
432							sub get_all_text {
433	7			7	1	13199	my ($file) = @_;
434
435	7					54	my $instance = __PACKAGE__->new($file);
436
437	7					28	$instance->get_bookmarks();
438	7					38	return _get_text_range($instance, 0, $instance->{_ccpText} +
439							$instance->{_ccpFtn} +
440							$instance->{_ccpHdd} +
441							$instance->{_ccpAtn});
442							}
443
444							1;
445
446							=head1 NAME
447
448							Text::Extract::Word - Extract text from Word files
449
450							=head1 SYNOPSIS
451
452							# object-based interface
453							use Text::Extract::Word;
454							my $file = Text::Extract::Word->new("test1.doc");
455							my $text = $file->get_text();
456							my $body = $file->get_body();
457							my $footnotes = $file->get_footnotes();
458							my $headers = $file->get_headers();
459							my $annotations = $file->get_annotations();
460							my $bookmarks = $file->get_bookmarks();
461
462							# specify :raw if you don't want the text cleaned
463							my $raw = $file->get_text(':raw');
464
465							# legacy interface
466							use Text::Extract::Word qw(get_all_text);
467							my $text = get_all_text("test1.doc");
468
469							=head1 DESCRIPTION
470
471							This simple module allows the textual contents to be extracted from a Word file.
472							The code was ported from Java code, originally part of the Apache POE project, but
473							extensive code changes were made internally.
474
475							=head1 OBJECT-BASED INTERFACE
476
477							=head2 Text::Extract::Word->new($input);
478
479							Passed either a file name or an open file handle, this constructor returns an
480							instance that can be used to query the file contents.
481
482							=head1 METHODS
483
484							All the query methods accept an optional filter argument that can take the value
485							':raw' -- if this is passed the original Word file contents will be returned without
486							any attempt to clean the text.
487
488							The default filter attempts to remove Word internal characters used to identify
489							fields (including field instructions), and translate common Unicode 'fancy' quotes
490							into more conventional ISO-8859-1 equivalents, for ease of processing. Table cell
491							markers are also translated into tabs, and paragraph marks into Perl newlines.
492
493							=head2 get_body([$filter])
494
495							Returns the text for the main body of the Word document. This excludes headers,
496							footers, and annotations.
497
498							=head2 get_headers([$filter])
499
500							Returns the header and footer texts for the Word document, as a single scalar
501							string.
502
503							=head2 get_footnotes([$filter])
504
505							Returns the footnote and endnode texts for the Word document, as a single scalar
506							string.
507
508							=head2 get_annotations([$filter])
509
510							Returns the annotation texts for the Word document, as a single scalar
511							string.
512
513							=head2 get_text([$filter])
514
515							Returns the concatenated text from the body, headers, footnotes, and annotations
516							of the the Word document, as a single scalar string.
517
518							=head2 get_bookmarks([$filter])
519
520							Returns the bookmark texts for the Word document, as a hash reference. The keys
521							in the hash are the bookmark names (Word requires that these are unique) and
522							the values are the filtered bookmark texts.
523
524							This method can be used to get Word form text data out of a Word file. All text fields
525							in a Word form will normally be labelled as bookmarks, and will be returned by this
526							method. Non-textual form fields (including drop-downs) will not be returned, as these
527							are not labelled as bookmarks.
528
529							=head1 FUNCTIONS
530
531							=head2 get_all_text($filename)
532
533							The only function exportable by this module, when called on a file name, returns the
534							raw text contents of the Word file. The contents are returned as UTF-8 encoded text.
535							This is unfiltered, for compatibility with previous versions of the module.
536
537							=head1 TODO
538
539							=over 4
540
541							=item *
542
543							handle non-textual form fields
544
545							=back
546
547							=head1 BUGS
548
549							=over 4
550
551							=item *
552
553							support for legacy Word - the module does not extract text from Word version 6 or earlier
554
555							=back
556
557							=head1 SEE ALSO
558
559							L also has a script C (Let's Have a Look at Word) which extracts
560							text from Word files. This is simply a much smaller module with lighter dependencies,
561							using L for its storage management.
562
563							=head1 AUTHOR
564
565							Stuart Watt, stuart@morungos.com
566
567							=head1 COPYRIGHT
568
569							Copyright (c) 2010 Stuart Watt. All rights reserved.
570
571							=cut
572