File Coverage

blib/lib/MsOffice/Word/Surgeon/PackagePart.pm
Criterion Covered Total %
statement 157 205 76.5
branch 21 44 47.7
condition 24 40 60.0
subroutine 31 37 83.7
pod 13 16 81.2
total 246 342 71.9


line stmt bran cond sub pod time code
1             package MsOffice::Word::Surgeon::PackagePart;
2 1     1   18 use 5.24.0;
  1         5  
3 1     1   18 use Moose;
  1         2  
  1         9  
4 1     1   6839 use MooseX::StrictConstructor;
  1         2  
  1         11  
5 1     1   3278 use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level);
  1         5  
  1         74  
6 1     1   504 use MsOffice::Word::Surgeon::Run;
  1         4  
  1         46  
7 1     1   616 use MsOffice::Word::Surgeon::Text;
  1         3  
  1         34  
8 1     1   705 use XML::LibXML;
  1         31826  
  1         8  
9 1     1   170 use List::Util qw(max);
  1         4  
  1         70  
10 1     1   6 use Carp qw(croak carp);
  1         3  
  1         93  
11              
12             # syntactic sugar for attributes
13             sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)}
14              
15             # constant integers to specify indentation modes -- see L<XML::LibXML>
16 1     1   7 use constant XML_NO_INDENT => 0;
  1         2  
  1         56  
17 1     1   6 use constant XML_SIMPLE_INDENT => 1;
  1         3  
  1         63  
18              
19 1     1   8 use namespace::clean -except => 'meta';
  1         2  
  1         10  
20              
21             our $VERSION = '2.02';
22              
23              
24             #======================================================================
25             # ATTRIBUTES
26             #======================================================================
27              
28              
29             # attributes passed to the constructor
30             has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
31             has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
32              
33              
34             # attributes constructed by the module -- not received through the constructor
35             has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
36             has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
37             has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
38             has_inner 'images' => (is => 'ro', isa => 'HashRef');
39              
40             has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
41             has 'was_cleaned_up' => (is => 'bare', isa => 'Bool', default => 0);
42              
43             #======================================================================
44             # GLOBAL VARIABLES
45             #======================================================================
46              
47             # Various regexes for removing uninteresting XML information
48             my %noise_reduction_regexes = (
49             proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
50             revision_ids => qr(\sw:rsid\w+="[^"]+"),
51             complex_script_bold => qr(<w:bCs/>),
52             page_breaks => qr(<w:lastRenderedPageBreak/>),
53             language => qr(<w:lang w:val="[^/>]+/>),
54             empty_run_props => qr(<w:rPr></w:rPr>),
55             soft_hyphens => qr(<w:softHyphen/>),
56             );
57              
58             my @noise_reduction_list = qw/proof_checking revision_ids
59             complex_script_bold page_breaks language
60             empty_run_props soft_hyphens/;
61              
62             #======================================================================
63             # LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
64             #======================================================================
65              
66              
67             sub _runs {
68 12     12   23 my $self = shift;
69              
70 12         24 state $run_regex = qr[
71             <w:r> # opening tag for the run
72             (?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
73             (.*?) # run contents -- capture in $2
74             </w:r> # closing tag for the run
75             ]x;
76              
77 12         24 state $txt_regex = qr[
78             <w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
79             (.*?) # text contents -- capture in $1
80             </w:t> # closing tag for text
81             ]x;
82              
83              
84             # split XML content into run fragments
85 12         384 my $contents = $self->contents;
86 12         12876 my @run_fragments = split m[$run_regex], $contents, -1;
87 12         35 my @runs;
88              
89             # build internal RUN objects
90             RUN:
91 12         60 while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
92 2004   100     2338004 $run_contents //= '';
93              
94             # split XML of this run into text fragmentsn
95 2004         22690 my @txt_fragments = split m[$txt_regex], $run_contents, -1;
96 2004         4251 my @texts;
97              
98             # build internal TEXT objects
99             TXT:
100 2004         6191 while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
101 3634 100 66     1627409 next TXT if !$xml_before_text && ( !(defined $txt_contents) || $txt_contents eq '');
      66        
102 1840   50     7805 push @texts, MsOffice::Word::Surgeon::Text->new(
      100        
103             xml_before => $xml_before_text // '',
104             literal_text => $txt_contents // '',
105             );
106             }
107              
108             # assemble TEXT objects into a RUN object
109 2004 100 100     41488 next RUN if !$xml_before_run && !@texts;
110 1852   50     8101 push @runs, MsOffice::Word::Surgeon::Run->new(
      100        
111             xml_before => $xml_before_run // '',
112             props => $props // '',
113             inner_texts => \@texts,
114             );
115             }
116              
117 12         16007 return \@runs;
118             }
119              
120              
121             sub _relationships {
122 1     1   3 my $self = shift;
123              
124             # xml that describes the relationships for this package part
125 1         8 my $rel_xml = $self->_rels_xml;
126              
127             # parse the relationships and assemble into a sparse array indexed by relationship ids
128 1         74 my @relationships;
129 1         19 while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
130 21         46 my %attrs = parse_attrs($1);
131 21   50     95 $attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
132 21         79 ($attrs{num} = $attrs{Id}) =~ s[^\D+][];
133 21         102 ($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
134 21         150 $relationships[$attrs{num}] = \%attrs;
135             }
136              
137 1         42 return \@relationships;
138             }
139              
140              
141             sub _images {
142 0     0   0 my $self = shift;
143              
144             # get relationship ids associated with images
145 0         0 my %rel_image = map {$_->{Id} => $_->{Target}}
146 0 0       0 grep {$_ && $_->{short_type} eq 'image'}
  0         0  
147             $self->relationships->@*;
148              
149             # get titles and relationship ids of images found within the part contents
150 0         0 my %image;
151 0         0 my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
152             DRAWING:
153 0         0 foreach my $drawing (@drawings) {
154 0 0       0 if ($drawing =~ m[<wp:docPr \s+ (.*?) />
155             .*?
156             <a:blip \s+ r:embed="(\w+)"]x) {
157 0         0 my ($lst_attrs, $rId) = ($1, $2);
158 0         0 my %attrs = parse_attrs($lst_attrs);
159             my $img_id = $attrs{title} || $attrs{descr}
160 0 0 0     0 or next DRAWING;
161              
162 0 0       0 $image{$img_id} = "word/$rel_image{$rId}"
163             or die "couldn't find image for relationship '$rId' associated with image '$img_id'";
164             # NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
165             }
166             }
167              
168 0         0 return \%image;
169             }
170              
171              
172 7     7   19 sub _contents {shift->original_contents}
173              
174             sub _on_new_contents {
175 16     16   31 my $self = shift;
176              
177 16         633 $self->clear_runs;
178 16         224038 $self->{contents_has_changed} = 1;
179             }
180              
181             #======================================================================
182             # METHODS
183             #======================================================================
184              
185              
186             sub _rels_xml {
187 1     1   4 my ($self, $new_xml) = @_;
188 1         34 my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name;
189 1         36 return $self->surgeon->xml_member($rels_name, $new_xml);
190             }
191              
192              
193             sub zip_member_name {
194 7     7 0 15 my $self = shift;
195 7         248 return sprintf "word/%s.xml", $self->part_name;
196             }
197              
198              
199             sub original_contents {
200 7     7 1 13 my $self = shift;
201              
202 7         224 return $self->surgeon->xml_member($self->zip_member_name);
203             }
204              
205              
206             sub image {
207 0     0 0 0 my ($self, $title, $new_image_content) = @_;
208              
209             # name of the image file within the zip
210 0   0     0 my $zip_member_name = $self->images->{$title}
211             || ($title =~ /^\d+$/ ? "word/media/image$title.png"
212             : die "couldn't find image '$title'");
213              
214             # delegate to Archive::Zip::contents
215 0         0 return $self->surgeon->zip->contents($zip_member_name, $new_image_content);
216             }
217              
218              
219              
220             #======================================================================
221             # CONTENTS RESTITUTION
222             #======================================================================
223              
224             sub indented_contents {
225 0     0 1 0 my $self = shift;
226              
227 0         0 my $dom = XML::LibXML->load_xml(string => $self->contents);
228 0         0 return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string
229             }
230              
231              
232             sub plain_text {
233 2     2 1 30 my $self = shift;
234              
235             # XML contents
236 2         67 my $txt = $self->contents;
237              
238             # replace opening paragraph tags by newlines
239 2         4535 $txt =~ s/(<w:p[ >])/\n$1/g;
240              
241             # replace break tags by newlines
242 2         3857 $txt =~ s[<w:br/>][\n]g;
243              
244             # replace tab nodes by ASCII tabs
245 2         5192 $txt =~ s/<w:tab[^s][^>]*>/\t/g;
246              
247             # remove all remaining XML tags
248 2         15472 $txt =~ s/<[^>]+>//g;
249              
250 2         37 return $txt;
251             }
252              
253              
254              
255              
256             #======================================================================
257             # MODIFYING CONTENTS
258             #======================================================================
259              
260             sub cleanup_XML {
261 2     2 1 10 my ($self, @merge_args) = @_;
262              
263             # avoid doing it twice
264 2 100       11 return if $self->{was_cleaned_up};
265              
266             # do the cleanup
267 1         7 $self->reduce_all_noises;
268 1         6 my $names_of_ASK_fields = $self->unlink_fields;
269 1         12 $self->suppress_bookmarks(@$names_of_ASK_fields);
270 1         11 $self->merge_runs(@merge_args);
271              
272             # remember it was done
273 1         84971 $self->{was_cleaned_up} = 1;
274             }
275              
276             sub noise_reduction_regex {
277 7     7 1 13 my ($self, $regex_name) = @_;
278 7 50       25 my $regex = $noise_reduction_regexes{$regex_name}
279             or croak "->noise_reduction_regex('$regex_name') : unknown regex name";
280 7         15 return $regex;
281             }
282              
283             sub reduce_noise {
284 2     2 1 8 my ($self, @noises) = @_;
285              
286             # gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
287 2 100       6 my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;
  10         26  
288              
289             # get contents, apply all regexes, put back the modified contents.
290 2         74 my $contents = $self->contents;
291 1     1   2814 no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
  1         3  
  1         1642  
292 2         25522 $contents =~ s/$_/$1/g foreach @regexes;
293 2         94 $self->contents($contents);
294             }
295              
296             sub reduce_all_noises {
297 1     1 1 2 my $self = shift;
298              
299 1         5 $self->reduce_noise(@noise_reduction_list);
300             }
301              
302             sub suppress_bookmarks {
303 1     1 1 4 my ($self, @names_to_erase) = @_;
304              
305             # closure to decide what to do with bookmark contents
306 1         4 my %should_erase_contents = map {($_ => 1)} @names_to_erase;
  2         11  
307             my $deal_with_bookmark_text = sub {
308 17     17   64 my ($bookmark_name, $bookmark_contents) = @_;
309 17 100       698 return $should_erase_contents{$bookmark_name} ? "" : $bookmark_contents;
310 1         8 };
311              
312             # regex to find bookmarks markup
313 1         6 state $bookmark_rx = qr{
314             <w:bookmarkStart # initial tag
315             .+? w:id="(\d+)" # 'id' attribute, bookmark identifier -- capture 1
316             .+? w:name="([^"]+)" # 'name' attribute -- capture 2
317             .*? /> # end of this tag
318             (.*?) # bookmark contents (may be empty) -- capture 3
319             <w:bookmarkEnd # ending tag
320             \s+ w:id="\1" # same 'id' attribute
321             .*? /> # end of this tag
322             }sx;
323              
324             # remove bookmarks markup
325 1         34 my $contents = $self->contents;
326 1         1428 $contents =~ s{$bookmark_rx}{$deal_with_bookmark_text->($2, $3)}eg;
  17         50  
327              
328             # re-inject the modified contents
329 1         45 $self->contents($contents);
330             }
331              
332             sub merge_runs {
333 1     1 1 5 my ($self, %args) = @_;
334              
335             # check validity of received args
336 1         4 state $is_valid_arg = {no_caps => 1};
337 1         6 my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args;
  1         5  
338 1 50       5 croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args if @invalid_args;
339              
340 1         3 my @new_runs;
341             # loop over internal "run" objects
342 1         2 foreach my $run (@{$self->runs}) {
  1         35  
343              
344 793 50       2817 $run->remove_caps_property if $args{no_caps};
345              
346             # check if the current run can be merged with the previous one
347 793 100 66     22914 if ( !$run->xml_before # no other XML markup between the 2 runs
      100        
348             && @new_runs # there was a previous run
349             && $new_runs[-1]->props eq $run->props # both runs have the same properties
350             ) {
351             # conditions are OK, so merge this run with the previous one
352 272         644 $new_runs[-1]->merge($run);
353             }
354             else {
355             # conditions not OK, just push this run without merging
356 521         1280 push @new_runs, $run;
357             }
358             }
359              
360             # reassemble the whole stuff and inject it as new contents
361 1         15 $self->contents(join "", map {$_->as_xml} @new_runs);
  521         1315  
362             }
363              
364              
365              
366             sub unlink_fields {
367 1     1 1 3 my $self = shift;
368              
369             # must find out what are the ASK fields before erasing the markup
370 1         5 state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
371 1         33 my $contents = $self->contents;
372 1         233 my @names_of_ASK_fields = $contents =~ /$ask_field_rx/g;
373              
374              
375             # regexes to remove field nodes and "field instruction" nodes
376 1         5 state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
377 1         5 state $field_boundary_rx = qr[<w:fldChar
378             (?: [^>]*?/> # ignore all attributes until end of node ..
379             | # .. or
380             [^>]*?>.*?</w:fldChar>) # .. ignore node content until closing tag
381             ]x; # field boundaries are encoded as "begin" / "separate" / "end"
382 1         4 state $simple_field_rx = qr[</?w:fldSimple[^>]*>];
383              
384             # apply the regexes
385 1         6 $self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);
386              
387 1         4 return \@names_of_ASK_fields;
388             }
389              
390              
391             sub replace {
392 11     11 1 60 my ($self, $pattern, $replacement_callback, %replacement_args) = @_;
393              
394             # shared initial string for error messages
395 11         25 my $error_msg = '->replace($pattern, $callback, %args)';
396              
397             # default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is'
398 11 50       28 if (delete $replacement_args{keep_xml_as_is}) {
399             not exists $replacement_args{cleanup_XML}
400 0 0       0 or croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'";
401 0         0 carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead";
402 0         0 $replacement_args{cleanup_XML} = 0;
403             }
404             else {
405 11   100     32 $replacement_args{cleanup_XML} //= 1; # default
406             }
407              
408             # cleanup the XML structure so that replacements work better
409 11 100       31 if (my $cleanup_args = $replacement_args{cleanup_XML}) {
410 1 50       6 $cleanup_args = {} if ! ref $cleanup_args;
411 1 50       6 ref $cleanup_args eq 'HASH'
412             or croak "$error_msg: arg 'cleanup_XML' should be a hashref";
413 1         6 $self->cleanup_XML(%$cleanup_args);
414             }
415              
416             # check for presences of a special option to avoid modying contents
417 11         21 my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};
418              
419             # apply replacements and generate new XML
420             my $xml = join "",
421 11         366 map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*;
  1059         4078  
422              
423             # overwrite previous contents
424 11 50       596 $self->contents($xml) unless $dont_overwrite_contents;
425              
426 11         87 return $xml;
427             }
428              
429              
430             sub _update_contents_in_zip { # called for each part before saving the zip file
431 0     0   0 my $self = shift;
432              
433             $self->surgeon->xml_member($self->zip_member_name, $self->contents)
434 0 0       0 if $self->{contents_has_changed};
435             }
436              
437              
438             sub replace_image {
439 0     0 1 0 my ($self, $image_title, $image_PNG_content) = @_;
440              
441 0 0       0 my $member_name = $self->images->{$image_title}
442             or die "could not find an image with title: $image_title";
443 0         0 $self->surgeon->zip->contents($member_name, $image_PNG_content);
444             }
445              
446              
447              
448             sub add_image {
449 0     0 1 0 my ($self, $image_PNG_content) = @_;
450              
451             # compute a fresh image number and a fresh relationship id
452 0         0 my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
453 0         0 my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members;
  0         0  
454 0   0     0 my $last_img_num = max @image_nums // 0;
455 0         0 my $target = sprintf "media/image%d.png", $last_img_num + 1;
456 0         0 my $last_rId_num = $self->relationships->$#*;
457 0         0 my $rId = sprintf "rId%d", $last_rId_num + 1;
458              
459             # assemble XML for the new relationship
460 0         0 my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
461 0         0 my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};
462              
463             # update the rels member
464 0         0 my $xml = $self->_rels_xml;
465 0         0 $xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
466 0         0 $self->_rels_xml($xml);
467              
468             # add the image as a new member into the archive
469 0         0 my $member_name = "word/$target";
470 0         0 $self->surgeon->zip->addString(\$image_PNG_content, $member_name);
471              
472             # update the global content_types if it doesn't include PNG
473 0         0 my $ct = $self->surgeon->_content_types;
474 0 0       0 if ($ct !~ /Extension="png"/) {
475 0         0 $ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
476 0         0 $self->surgeon->_content_types($ct);
477             }
478              
479             # return the relationship id
480 0         0 return $rId;
481             }
482              
483              
484              
485             #======================================================================
486             # UTILITY FUNCTIONS
487             #======================================================================
488              
489              
490             sub parse_attrs { # cheap parsing of attribute lists in an XML node
491 21     21 0 56 my ($lst_attrs) = @_;
492              
493 21         34 state $attr_pair_regex = qr[
494             ([^=\s"'&<>]+) # attribute name
495             \h* = \h* # Eq
496             (?: # attribute value
497             " ([^<"]*) " # .. enclosed in double quotes
498             |
499             ' ([^<']*) ' # .. or enclosed in single quotes
500             )
501             ]x;
502              
503 21         33 state $entity = {quot => '"', amp => '&', 'lt' => '<', gt => '>'};
504 21         34 state $entity_names = join "|", keys %$entity;
505              
506 21         29 my %attr;
507 21         130 while ($lst_attrs =~ /$attr_pair_regex/g) {
508 69   33     260 my ($name, $val) = ($1, $2 // $3);
509 69         127 $val =~ s/&(entity_names);/$entity->{$1}/eg;
  0         0  
510 69         460 $attr{$name} = $val;
511             }
512              
513 21         120 return %attr;
514             }
515              
516              
517             1;
518              
519             __END__
520              
521             =encoding ISO-8859-1
522              
523             =head1 NAME
524              
525             MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document
526              
527             =head1 SYNOPSIS
528              
529             my $part = $surgeon->document;
530             print $part->plain_text;
531             $part->replace(qr[$pattern], $replacement_callback);
532             $part->replace_image($image_alt_text, $image_PNG_content);
533              
534              
535             =head1 DESCRIPTION
536              
537             This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single
538             I<package part> within the ZIP package of a C<.docx> document.
539             It is mostly used for the I<document> part, that contains the XML representation of the
540             main document body. However, other parts such as headers, footers, footnotes, etc. have the
541             same internal representation and therefore the same operations can be invoked.
542              
543              
544             =head1 METHODS
545              
546             =head2 new
547              
548             my $run = MsOffice::Word::Surgeon::PackagePart->new(
549             surgeon => $surgeon,
550             part_name => $name,
551             );
552              
553             Constructor for a new part object. This is called internally from
554             L<MsOffice::Word::Surgeon>; it is not meant to be called directly
555             by clients.
556              
557             =head3 Constructor arguments
558              
559              
560             =over
561              
562             =item surgeon
563              
564             a weak reference to the main surgeon object
565              
566             =item part_name
567              
568             ZIP member name of this part
569              
570             =back
571              
572             =head3 Other attributes
573              
574             Other attributes, which are not passed through the constructor but are generated lazily on demand, are :
575              
576             =over
577              
578             =item contents
579              
580             the XML contents of this part
581              
582             =item runs
583              
584             a decomposition of the XML contents into a collection of
585             L<MsOffice::Word::Surgeon::Run> objects.
586              
587             =item relationships
588              
589             an arrayref of Office relationships associated with this part. This information comes from
590             a C<.rels> member in the ZIP archive, named after the name of the package part.
591             Array indices correspond to relationship numbers. Array values are hashrefs with
592             keys
593              
594             =over
595              
596             =item Id
597              
598             the full relationship id
599              
600             =item num
601              
602             the numeric part of C<rId>
603              
604             =item Type
605              
606             the full reference to the XML schema for this relationship
607              
608             =item short_type
609              
610             only the last word of the type, e.g. 'image', 'style', etc.
611              
612             =item Target
613              
614             designation of the target within the ZIP file. The prefix 'word/' must be
615             added for having a complete Zip member name.
616              
617             =back
618              
619              
620              
621             =item images
622              
623             a hashref of images within this package part. Keys of the hash are image I<alternative texts>.
624             If present, the alternative I<title> will be prefered; otherwise the alternative I<description> will be taken
625             (note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display
626             the I<description> field -- see
627             L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669|MsOffice documentation>).
628              
629             Images without alternative text will not be accessible through the current Perl module.
630              
631             Values of the hash are zip member names for the corresponding
632             image representations in C<.png> format.
633              
634              
635             =back
636              
637              
638             =head2 Contents restitution
639              
640             =head3 contents
641              
642             Returns a Perl string with the current internal XML representation of the part
643             contents.
644              
645             =head3 original_contents
646              
647             Returns a Perl string with the XML representation of the
648             part contents, as it was in the ZIP archive before any
649             modification.
650              
651             =head3 indented_contents
652              
653             Returns an indented version of the XML contents, suitable for inspection in a text editor.
654             This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded
655             byte string, not a Perl string.
656              
657             =head3 plain_text
658              
659             Returns the text contents of the part, without any markup.
660             Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored.
661              
662              
663             =head3 runs
664              
665             Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of
666             these objects holds an XML fragment; joining all fragments
667             restores the complete document.
668              
669             my $contents = join "", map {$_->as_xml} $self->runs;
670              
671              
672             =head2 Modifying contents
673              
674              
675             =head3 cleanup_XML
676              
677             $part->cleanup_XML(%args);
678              
679             Apply several other methods for removing unnecessary nodes within the internal
680             XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>,
681             L</suppress_bookmarks> and L</merge_runs>.
682              
683             Currently there is only one legal arg :
684              
685             =over
686              
687             =item C<no_caps>
688              
689             If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically
690             called for each run object. As a result, all texts within runs with the C<caps> property are automatically
691             converted to uppercase.
692              
693             =back
694              
695              
696              
697             =head3 reduce_noise
698              
699             $part->reduce_noise($regex1, $regex2, ...);
700              
701             This method is used for removing unnecessary information in the XML
702             markup. It applies the given list of regexes to the whole document,
703             suppressing matches. The final result is put back into
704             C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >>
705             references, or as names of builtin regexes (described below). Regexes
706             are applied to the whole XML contents, not only to run nodes.
707              
708              
709             =head3 noise_reduction_regex
710              
711             my $regex = $part->noise_reduction_regex($regex_name);
712              
713             Returns the builtin regex corresponding to the given name.
714             Known regexes are :
715              
716             proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
717             revision_ids => qr(\sw:rsid\w+="[^"]+"),
718             complex_script_bold => qr(<w:bCs/>),
719             page_breaks => qr(<w:lastRenderedPageBreak/>),
720             language => qr(<w:lang w:val="[^/>]+/>),
721             empty_run_props => qr(<w:rPr></w:rPr>),
722             soft_hyphens => qr(<w:softHyphen/>),
723              
724             =head3 reduce_all_noises
725              
726             $part->reduce_all_noises;
727              
728             Applies all regexes from the previous method.
729              
730             =head3 unlink_fields
731              
732             my $names_of_ASK_fields = $part->unlink_fields;
733              
734             Removes all fields from the part, just leaving the current
735             value stored in each field. This is the equivalent of performing Ctrl-Shift-F9
736             on the whole document.
737              
738             The return value is an arrayref to a list of names of ASK fields within the document.
739             Such names should then be passed to the L</suppress_bookmarks> method
740             (see below).
741              
742              
743             =head3 suppress_bookmarks
744              
745             $part->suppress_bookmarks(@names_to_erase);
746              
747             Removes bookmarks markup in the part. This is useful because
748             MsWord may silently insert bookmarks in unexpected places; therefore
749             some searches within the text may fail because of such bookmarks.
750              
751             By default, this method only removes the bookmarks markup, leaving
752             intact the contents of the bookmark. However, when the name of a
753             bookmark belongs to the list C<< @names_to_erase >>, the contents
754             is also removed. Currently this is used for suppressing ASK fields,
755             because such fields contain a bookmark content that is never displayed by MsWord.
756              
757              
758              
759             =head3 merge_runs
760              
761             $part->merge_runs(no_caps => 1); # optional arg
762              
763             Walks through all runs of text within the document, trying to merge
764             adjacent runs when possible (i.e. when both runs have the same
765             properties, and there is no other XML node inbetween).
766              
767             This operation is a prerequisite before performing replace operations, because
768             documents edited in MsWord often have run boundaries across sentences or
769             even in the middle of words; so regex searches can only be successful if those
770             artificial boundaries have been removed.
771              
772             If the argument C<< no_caps => 1 >> is present, the merge operation
773             will also convert runs with the C<w:caps> property, putting all letters
774             into uppercase and removing the property; this makes more merges possible.
775              
776              
777             =head3 replace
778              
779             $part->replace($pattern, $replacement, %replacement_args);
780              
781             Replaces all occurrences of C<$pattern> regex within the text nodes by the
782             given C<$replacement>. This is not exactly like a search-replace
783             operation performed within MsWord, because the search does not cross boundaries
784             of text nodes. In order to maximize the chances of successful replacements,
785             the L</cleanup_XML> method is automatically called before starting the operation.
786              
787             The argument C<$pattern> can be either a string or a reference to a regular expression.
788             It should not contain any capturing parentheses, because that would perturb text
789             splitting operations.
790              
791             The argument C<$replacement> can be either a fixed string, or a reference to
792             a callback subroutine that will be called for each match.
793              
794              
795             The C<< %replacement_args >> hash can be used to pass information to the callback
796             subroutine. That hash will be enriched with three entries :
797              
798             =over
799              
800             =item matched
801              
802             The string that has been matched by C<$pattern>.
803              
804             =item run
805              
806             The run object in which this text resides.
807              
808             =item xml_before
809              
810             The XML fragment (possibly empty) found before the matched text .
811              
812             =back
813              
814             The callback subroutine may return either plain text or structured XML.
815             See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback.
816              
817             The following special keys within C<< %replacement_args >> are interpreted by the
818             C<replace()> method itself, and therefore are not passed to the callback subroutine :
819              
820             =over
821              
822             =item keep_xml_as_is
823              
824             if true, no call is made to the L</cleanup_XML> method before performing the replacements
825              
826             =item dont_overwrite_contents
827              
828             if true, the internal XML contents is not modified in place; the new XML after performing
829             replacements is merely returned to the caller.
830              
831             =item cleanup_args
832              
833             the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This
834             is typically used as
835              
836             $part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]);
837              
838             =back
839              
840              
841             =head3 replace_image
842              
843             $part->replace_image($image_alt_text, $image_PNG_content);
844              
845             Replaces an existing PNG image by a new image. All features of the old image will
846             be preserved (size, positioning, border, etc.) -- only the image itself will be
847             replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word
848             for this image.
849              
850             This operation replaces a ZIP member within the C<.docx> file. If several XML
851             nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several
852             locations, the new image will appear at all locations, even if they do not have the
853             same alternative text -- unfortunately this module currently has no facility for
854             duplicating an existing image into separate instances. So if your intent is to only replace
855             one image, your original document should contain several distinct images, coming from
856             several distinct C<.PNG> file copies.
857              
858              
859             =head3 add_image
860              
861             my $rId = $part->add_image($image_PNG_content);
862              
863             Stores the given PNG image within the ZIP file, adds it as a relationship to the
864             current part, and returns the relationship id. This operation is not sufficient
865             to make the image visible in Word : it just stores the image, but you still
866             have to insert a proper C<drawing> node in the contents XML, using the C<$rId>.
867             Future versions of this module may offer helper methods for that purpose;
868             currently it must be done by hand.
869              
870              
871             =head1 AUTHOR
872              
873             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
874              
875             =head1 COPYRIGHT AND LICENSE
876              
877             Copyright 2019-2023 by Laurent Dami.
878              
879             This library is free software; you can redistribute it and/or modify
880             it under the same terms as Perl itself.
881              
882              
883