File Coverage

blib/lib/MsOffice/Word/Surgeon.pm
Criterion Covered Total %
statement 60 84 71.4
branch 6 16 37.5
condition 2 6 33.3
subroutine 17 24 70.8
pod 7 10 70.0
total 92 140 65.7


line stmt bran cond sub pod time code
1             package MsOffice::Word::Surgeon;
2 1     1   110118 use 5.24.0;
  1         4  
3 1     1   766 use Moose;
  1         486521  
  1         6  
4 1     1   8779 use MooseX::StrictConstructor;
  1         32460  
  1         6  
5 1     1   11177 use Archive::Zip qw(AZ_OK);
  1         91269  
  1         65  
6 1     1   10 use Encode qw(encode_utf8 decode_utf8);
  1         2  
  1         49  
7 1     1   6 use Carp qw(croak);
  1         3  
  1         45  
8 1     1   510 use MsOffice::Word::Surgeon::Revision;
  1         4  
  1         51  
9 1     1   646 use MsOffice::Word::Surgeon::PackagePart;
  1         3  
  1         145  
10              
11             # syntactic sugar for attributes
12             sub has_lazy ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr")}
13             sub has_inner ($@) {my $attr = shift; has_lazy($attr => @_, init_arg => undef)}
14              
15              
16 1     1   10 use namespace::clean -except => 'meta';
  1         2  
  1         8  
17              
18             our $VERSION = '2.01';
19              
20              
21             #======================================================================
22             # ATTRIBUTES
23             #======================================================================
24              
25             # attributes to the constructor -- either the filename or an existing zip archive
26             has 'docx' => (is => 'ro', isa => 'Str');
27             has_lazy 'zip' => (is => 'ro', isa => 'Archive::Zip');
28              
29             # inner attributes lazily constructed by the module
30             has_inner 'parts' => (is => 'ro', isa => 'HashRef[MsOffice::Word::Surgeon::PackagePart]',
31             traits => ['Hash'], handles => {part => 'get'});
32              
33             has_inner 'document' => (is => 'ro', isa => 'MsOffice::Word::Surgeon::PackagePart',
34             handles => [qw/contents original_contents indented_contents plain_text replace/]);
35             # Note: this attribute is equivalent to $self->part('document'); made into an attribute
36             # for convenience and for automatic delegation of methods through the 'handles' declaration
37              
38             # just a slot for internal storage
39             has 'next_rev_id' => (is => 'bare', isa => 'Num', default => 1, init_arg => undef);
40             # used by the revision() method for creating *::Revision objects -- each instance
41             # gets a fresh value
42              
43              
44             #======================================================================
45             # BUILDING INSTANCES
46             #======================================================================
47              
48              
49             # syntactic sugar for supporting ->new($path) instead of ->new(docx => $path)
50             around BUILDARGS => sub {
51             my $orig = shift;
52             my $class = shift;
53              
54             if ( @_ == 1 && !ref $_[0] ) {
55             return $class->$orig(docx => $_[0]);
56             }
57             else {
58             return $class->$orig(@_);
59             }
60             };
61              
62              
63             # make sure that the constructor got either a 'docx' or a 'zip' attribute
64             sub BUILD {
65 1     1 0 2505 my $self = shift;
66              
67 1         3 my $class = ref $self;
68              
69             $self->{docx} || $self->{zip}
70 1 0 33     5 or croak "$class->new() : need either 'docx' or 'zip' attribute";
71             not ($self->{docx} && $self->{zip})
72 1 50 33     9 or croak "$class->new() : can't have both 'docx' and 'zip' attributes";
73             }
74              
75              
76             #======================================================================
77             # LAZY ATTRIBUTE CONSTRUCTORS
78             #======================================================================
79              
80             sub _zip {
81 1     1   3 my $self = shift;
82              
83 1         9 my $zip = Archive::Zip->new;
84 1 50       50 $zip->read($self->{docx}) == AZ_OK
85             or croak "cannot unzip $self->{docx}";
86              
87 1         5474 return $zip;
88             }
89              
90              
91              
92             sub _parts {
93 1     1   2 my $self = shift;
94              
95             # first create a package part for the main document
96 1         25 my $doc = MsOffice::Word::Surgeon::PackagePart->new(surgeon => $self,
97             part_name => 'document');
98              
99             # gather names of headers and footers related to that document
100 6         37 my @headers_footers = map {$_->{Target} =~ s/\.xml$//r}
101 1 100       2130 grep {$_ && $_->{short_type} =~ /^(header|footer)$/}
  22         95  
102             $doc->relationships->@*;
103              
104             # create package parts for headers and footers and assemble all parts into a hash
105 1         5 my %parts = (document => $doc);
106             $parts{$_} = MsOffice::Word::Surgeon::PackagePart->new(surgeon => $self,
107             part_name => $_)
108 1         11 for @headers_footers;
109              
110 1         10324 return \%parts;
111             }
112              
113              
114 1     1   37 sub _document {shift->part('document')}
115              
116              
117             #======================================================================
118             # ACCESSING OR CHANGING THE INTERNAL STATE
119             #======================================================================
120              
121             sub xml_member {
122 8     8 1 24 my ($self, $member_name, $new_content) = @_;
123              
124 8 50       35 if (! defined $new_content) { # used as a reader
125 8 50       233 my $bytes = $self->zip->contents($member_name)
126             or croak "no zip member for $member_name";
127 8         12996 return decode_utf8($bytes);
128             }
129             else { # used as a writer
130 0         0 my $bytes = encode_utf8($new_content);
131 0         0 return $self->zip->contents($member_name, $bytes);
132             }
133             }
134              
135             sub _content_types {
136 0     0   0 my ($self, $new_content_types) = @_;
137 0         0 return $self->xml_member('[Content_Types].xml', $new_content_types);
138             }
139              
140              
141             sub headers {
142 2     2 1 1151 my ($self) = @_;
143 2         85 return sort {substr($a, 6) <=> substr($b, 6)} grep {/^header/} keys $self->parts->%*;
  6         41  
  14         61  
144             }
145              
146             sub footers {
147 1     1 1 7 my ($self) = @_;
148 1         39 return sort {substr($a, 6) <=> substr($b, 6)} grep {/^footer/} keys $self->parts->%*;
  3         20  
  7         26  
149             }
150              
151             sub new_rev_id {
152 0     0 0 0 my ($self) = @_;
153 0         0 return $self->{next_rev_id}++;
154             }
155              
156              
157              
158             #======================================================================
159             # GENERIC PROPAGATION TO ALL PARTS
160             #======================================================================
161              
162              
163             sub all_parts_do {
164 2     2 1 2861 my ($self, $method_name, @args) = @_;
165              
166 2         78 my $parts = $self->parts;
167              
168             # apply the method to each package part
169 2         7 my %result;
170 2         43 $result{$_} = $parts->{$_}->$method_name(@args) foreach keys %$parts;
171              
172              
173 2         278 return \%result;
174             }
175              
176              
177             #======================================================================
178             # CLONING
179             #======================================================================
180              
181             sub clone {
182 0     0 0   my $self = shift;
183              
184             # create a new Zip archive and copy all members to it
185 0           my $new_zip = Archive::Zip->new;
186 0           foreach my $member ($self->zip->members) {
187 0           $new_zip->addMember($member);
188             }
189              
190             # create a new instance of this class
191 0           my $class = ref $self;
192 0           my $clone = $class->new(zip => $new_zip);
193              
194             # other attributes will be recreated lazily within the clone .. not
195             # the most efficient way, but it is easier and safer, otherwise there is
196             # a risk of mixed references
197              
198 0           return $clone;
199             }
200              
201             #======================================================================
202             # SAVING THE FILE
203             #======================================================================
204              
205              
206             sub _update_contents_in_zip {
207 0     0     my $self = shift;
208 0           $_->_update_contents_in_zip foreach values $self->parts->%*;
209             }
210              
211              
212             sub overwrite {
213 0     0 1   my $self = shift;
214              
215 0           $self->_update_contents_in_zip;
216 0 0         $self->zip->overwrite == AZ_OK
217             or croak "error overwriting zip archive " . $self->docx;
218             }
219              
220             sub save_as {
221 0     0 1   my ($self, $docx) = @_;
222              
223 0           $self->_update_contents_in_zip;
224 0 0         $self->zip->writeToFileNamed($docx) == AZ_OK
225             or croak "error writing zip archive to $docx";
226             }
227              
228              
229             #======================================================================
230             # DELEGATION TO OTHER CLASSES
231             #======================================================================
232              
233             sub new_revision {
234 0     0 1   my $self = shift;
235              
236 0           my $revision = MsOffice::Word::Surgeon::Revision->new(rev_id => $self->new_rev_id, @_);
237 0           return $revision->as_xml;
238             }
239              
240              
241             1;
242              
243             __END__
244              
245             =encoding ISO-8859-1
246              
247             =head1 NAME
248              
249             MsOffice::Word::Surgeon - tamper with the guts of Microsoft docx documents, with regexes
250              
251             =head1 SYNOPSIS
252              
253             my $surgeon = MsOffice::Word::Surgeon->new(docx => $filename);
254              
255             # extract plain text
256             my $main_text = $surgeon->document->plain_text;
257             my @header_texts = map {$surgeon->part($_)->plain_text} $surgeon->headers;
258              
259             # anonymize
260             my %alias = ('Claudio MONTEVERDI' => 'A_____', 'Heinrich SCHÜTZ' => 'B_____');
261             my $pattern = join "|", keys %alias;
262             my $replacement_callback = sub {
263             my %args = @_;
264             my $replacement = $surgeon->new_revision(to_delete => $args{matched},
265             to_insert => $alias{$args{matched}},
266             run => $args{run},
267             xml_before => $args{xml_before},
268             );
269             return $replacement;
270             };
271             $surgeon->all_parts_do(replace => qr[$pattern], $replacement_callback);
272              
273             # save the result
274             $surgeon->overwrite; # or ->save_as($new_filename);
275              
276              
277             =head1 DESCRIPTION
278              
279             =head2 Purpose
280              
281             This module supports a few operations for inspecting or modifying contents
282             in Microsoft Word documents in '.docx' format -- therefore the name
283             'surgeon'. Since a surgeon does not give life, there is no support for
284             creating fresh documents; if you have such needs, use one of the other
285             packages listed in the L<SEE ALSO> section -- or use the companion module
286             L<MsOffice::Word::Template>.
287              
288             Some applications for this module are :
289              
290             =over
291              
292             =item *
293              
294             content extraction in plain text format;
295              
296             =item *
297              
298             unlinking fields (equivalent of performing Ctrl-Shift-F9 on the whole document)
299              
300             =item *
301              
302             regex replacements within text, for example for :
303              
304             =over
305              
306             =item *
307              
308             anonymization, i.e. replacement of names or adresses by aliases;
309              
310             =item *
311              
312             templating, i.e. replacement of special markup by contents coming from a data tree
313             (see also L<MsOffice::Word::Template>).
314              
315             =back
316              
317             =item *
318              
319             insertion of generated images (for example barcodes) -- see L<MsOffice::Word::Surgeon::PackagePart/images>;
320              
321             =item *
322              
323             pretty-printing the internal XML structure.
324              
325             =back
326              
327              
328              
329              
330             =head2 Operating mode
331              
332             The format of Microsoft C<.docx> documents is described in
333             L<http://www.ecma-international.org/publications/standards/Ecma-376.htm>
334             and L<http://officeopenxml.com/>. An excellent introduction can be
335             found at L<https://www.toptal.com/xml/an-informal-introduction-to-docx>.
336             Internally, a document is a zipped
337             archive, where the member named C<word/document.xml> stores the main
338             document contents, in XML format.
339              
340             The present module does not parse all details of the whole XML
341             structure because it only focuses on I<text> nodes (those that contain
342             literal text) and I<run> nodes (those that contain text formatting
343             properties). All remaining XML information, for example for
344             representing sections, paragraphs, tables, etc., is stored as opaque
345             XML fragments; these fragments are re-inserted at proper places when
346             reassembling the whole document after having modified some text nodes.
347              
348              
349             =head1 METHODS
350              
351             =head2 Constructor
352              
353             =head3 new
354              
355             my $surgeon = MsOffice::Word::Surgeon->new(docx => $filename);
356             # or simply : ->new($filename);
357              
358             Builds a new surgeon instance, initialized with the contents of the given filename.
359              
360             =head2 Accessors
361              
362             =head3 docx
363              
364             Path to the C<.docx> file
365              
366             =head3 zip
367              
368             Instance of L<Archive::Zip> associated with this file
369              
370             =head3 parts
371              
372             Hashref to L<MsOffice::Word::Surgeon::PackagePart> objects, keyed by their part name in the ZIP file.
373             There is always a C<'document'> part. Currently, other optional parts may be headers and footers.
374             Future versions may include other parts like footnotes or endnotes.
375              
376             =head3 document
377              
378             Shortcut to C<< $surgeon->part('document') >> -- the
379             L<MsOffice::Word::Surgeon::PackagePart> object corresponding to the main document.
380             See the C<PackagePart> documentation for operations on part objects.
381             Besides, the following operations are supported directly as methods to the C<< $surgeon >> object
382             and are automatically delegated to the C<< document >> part :
383             C<contents>, C<original_contents>, C<indented_contents>, C<plain_text>, C<replace>.
384              
385              
386              
387             =head3 headers
388              
389             my @header_parts = $surgeon->headers;
390              
391             Returns the ordered list of names of header members stored in the ZIP file.
392              
393             =head3 footers
394              
395             my @footer_parts = $surgeon->footers;
396              
397             Returns the ordered list of names of footer members stored in the ZIP file.
398              
399              
400             =head2 Other methods
401              
402              
403             =head3 part
404              
405             my $part = $surgeon->part($part_name);
406              
407             Returns the L<MsOffice::Word::Surgeon::PackagePart> object corresponding to the given part name.
408              
409              
410             =head3 all_parts_do
411              
412             my $result = $surgeon->all_parts_do($method_name => %args);
413              
414             Calls the given method on all part objects. Results are accumulated
415             in a hash, with part names as keys to the results. In most cases this is
416             used to invoqke the L<MsOffice::Word::Surgeon::PackagePart/replace> method.
417              
418              
419             =head3 xml_member
420              
421             my $xml = $surgeon->xml_member($member_name); # reading
422             # or
423             $surgeon->xml_member($member_name, $new_xml); # writing
424              
425             Reads or writes the given member name in the ZIP file, with utf8 decoding or encoding.
426              
427              
428             =head3 save_as
429              
430             $surgeon->save_as($docx_file);
431              
432             Writes the ZIP archive into the given file.
433              
434              
435             =head3 overwrite
436              
437             $surgeon->overwrite;
438              
439             Writes the updated ZIP archive into the initial file.
440              
441              
442             =head3 new_revision
443              
444             my $xml = $surgeon->new_revision(
445             to_delete => $text_to_delete,
446             to_insert => $text_to_insert,
447             author => $author_string,
448             date => $date_string,
449             run => $run_object,
450             xml_before => $xml_string,
451             );
452              
453             This method is syntactic sugar for instantiating the
454             L<MsOffice::Word::Surgeon::Revision> class and returning
455             XML markup for MsWord revisions (a.k.a. "tracked changes")
456             generated by that class. Users can
457             then manually review those revisions within MsWord and accept or reject
458             them. This is best used in collaboration with the L</replace> method :
459             the replacement callback can call C<< $self->new_revision(...) >> to
460             generate revision marks in the document.
461              
462             Either C<to_delete> or C<to_insert> (or both) must
463             be present. Other parameters are optional. The parameters are :
464              
465             =over
466              
467             =item to_delete
468              
469             The string of text to delete (usually this will be the C<matched> argument
470             passed to the replacement callback).
471              
472             =item to_insert
473              
474             The string of new text to insert.
475              
476             =item author
477              
478             A short string that will be displayed by MsWord as the "author" of this revision.
479              
480             =item date
481              
482             A date (and optional time) in ISO format that will be displayed by
483             MsWord as the date of this revision. The current date and time
484             will be used by default.
485              
486             =item run
487              
488             A reference to the L<MsOffice::Word::Surgeon::Run> object surrounding
489             this revision. The formatting properties of that run will be
490             copied into the C<< <w:r> >> nodes of the deleted and inserted text fragments.
491              
492              
493             =item xml_before
494              
495             An optional XML fragment to be inserted before the C<< <w:t> >> node
496             of the inserted text
497              
498             =back
499              
500              
501             =head1 SEE ALSO
502              
503             The L<https://metacpan.org/pod/Document::OOXML> distribution on CPAN
504             also manipulates C<docx> documents, but with another approach :
505             internally it uses L<XML::LibXML> and XPath expressions for
506             manipulating XML nodes. The API has some intersections with the
507             present module, but there are also some differences : C<Document::OOXML>
508             has more support for styling, while C<MsOffice::Word::Surgeon>
509             has more flexible mechanisms for replacing
510             text fragments.
511              
512              
513             Other programming languages also have packages for dealing with C<docx> documents; here
514             are some references :
515              
516             =over
517              
518             =item L<https://docs.microsoft.com/en-us/office/open-xml/word-processing>
519              
520             The C# Open XML SDK from Microsoft
521              
522             =item L<http://www.ericwhite.com/blog/open-xml-powertools-developer-center/>
523              
524             Additional functionalities built on top of the XML SDK.
525              
526             =item L<https://poi.apache.org>
527              
528             An open source Java library from the Apache foundation.
529              
530             =item L<https://www.docx4java.org/trac/docx4j>
531              
532             Another open source Java library, competitor to Apache POI.
533              
534             =item L<https://phpword.readthedocs.io/en/latest/>
535              
536             A PHP library dealing not only with Microsoft OOXML documents but also
537             with OASIS and RTF formats.
538              
539             =item L<https://pypi.org/project/python-docx/>
540              
541             A Python library, documented at L<https://python-docx.readthedocs.io/en/latest/>.
542              
543             =back
544              
545             As far as I can tell, most of these libraries provide objects and methods that
546             closely reflect the complete XML structure : for example they have classes for
547             paragraphs, styles, fonts, inline shapes, etc.
548              
549             The present module is much simpler but also much more limited : it was optimised
550             for dealing with the text contents and offers no support for presentation or
551             paging features. However, it has the rare advantage of providing an API for
552             regex substitutions within Word documents.
553              
554             The L<MsOffice::Word::Template> module relies on the present module, together with
555             the L<Perl Template Toolkit|Template>, to implement a templating system for Word documents.
556              
557              
558             =head1 AUTHOR
559              
560             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
561              
562             =head1 COPYRIGHT AND LICENSE
563              
564             Copyright 2019-2022 by Laurent Dami.
565              
566             This library is free software; you can redistribute it and/or modify
567             it under the same terms as Perl itself.