File Coverage

blib/lib/MsOffice/Word/Surgeon/PackagePart.pm
Criterion Covered Total %
statement 148 193 76.6
branch 16 32 50.0
condition 23 40 57.5
subroutine 31 37 83.7
pod 13 16 81.2
total 231 318 72.6


line stmt bran cond sub pod time code
1             package MsOffice::Word::Surgeon::PackagePart;
2 1     1   18 use 5.24.0;
  1         4  
3 1     1   19 use Moose;
  1         3  
  1         7  
4 1     1   7069 use MooseX::StrictConstructor;
  1         2  
  1         9  
5 1     1   3442 use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level);
  1         2  
  1         75  
6 1     1   504 use MsOffice::Word::Surgeon::Run;
  1         4  
  1         45  
7 1     1   674 use MsOffice::Word::Surgeon::Text;
  1         2  
  1         35  
8 1     1   919 use XML::LibXML;
  1         35333  
  1         9  
9 1     1   196 use List::Util qw(max);
  1         3  
  1         76  
10 1     1   10 use Carp qw(croak);
  1         7  
  1         97  
11              
12             # syntactic sugar for attributes
13             sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)}
14              
15             # constant integers to specify indentation modes -- see L<XML::LibXML>
16 1     1   16 use constant XML_NO_INDENT => 0;
  1         3  
  1         72  
17 1     1   8 use constant XML_SIMPLE_INDENT => 1;
  1         2  
  1         60  
18              
19 1     1   11 use namespace::clean -except => 'meta';
  1         2  
  1         16  
20              
21             our $VERSION = '2.01';
22              
23              
24             #======================================================================
25             # ATTRIBUTES
26             #======================================================================
27              
28              
29             # attributes passed to the constructor
30             has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
31             has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
32              
33              
34             # attributes constructed by the module -- not received through the constructor
35             has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
36             has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
37             has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
38             has_inner 'images' => (is => 'ro', isa => 'HashRef');
39              
40             has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
41              
42             #======================================================================
43             # GLOBAL VARIABLES
44             #======================================================================
45              
46             # Various regexes for removing uninteresting XML information
47             my %noise_reduction_regexes = (
48             proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
49             revision_ids => qr(\sw:rsid\w+="[^"]+"),
50             complex_script_bold => qr(<w:bCs/>),
51             page_breaks => qr(<w:lastRenderedPageBreak/>),
52             language => qr(<w:lang w:val="[^/>]+/>),
53             empty_run_props => qr(<w:rPr></w:rPr>),
54             soft_hyphens => qr(<w:softHyphen/>),
55             );
56              
57             my @noise_reduction_list = qw/proof_checking revision_ids
58             complex_script_bold page_breaks language
59             empty_run_props soft_hyphens/;
60              
61             #======================================================================
62             # LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
63             #======================================================================
64              
65              
66             sub _runs {
67 26     26   50 my $self = shift;
68              
69 26         47 state $run_regex = qr[
70             <w:r> # opening tag for the run
71             (?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
72             (.*?) # run contents -- capture in $2
73             </w:r> # closing tag for the run
74             ]x;
75              
76 26         58 state $txt_regex = qr[
77             <w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
78             (.*?) # text contents -- capture in $1
79             </w:t> # closing tag for text
80             ]x;
81              
82              
83             # split XML content into run fragments
84 26         847 my $contents = $self->contents;
85 26         20284 my @run_fragments = split m[$run_regex], $contents, -1;
86 26         74 my @runs;
87              
88             # build internal RUN objects
89             RUN:
90 26         139 while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
91 3052   100     3611652 $run_contents //= '';
92              
93             # split XML of this run into text fragmentsn
94 3052         34760 my @txt_fragments = split m[$txt_regex], $run_contents, -1;
95 3052         5959 my @texts;
96              
97             # build internal TEXT objects
98             TXT:
99 3052         9404 while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
100 5698 100 66     2540602 next TXT if !$xml_before_text && ( !(defined $txt_contents) || $txt_contents eq '');
      66        
101 2874   50     12695 push @texts, MsOffice::Word::Surgeon::Text->new(
      100        
102             xml_before => $xml_before_text // '',
103             literal_text => $txt_contents // '',
104             );
105             }
106              
107             # assemble TEXT objects into a RUN object
108 3052 100 100     41539 next RUN if !$xml_before_run && !@texts;
109 2893   50     13636 push @runs, MsOffice::Word::Surgeon::Run->new(
      100        
110             xml_before => $xml_before_run // '',
111             props => $props // '',
112             inner_texts => \@texts,
113             );
114             }
115              
116 26         34342 return \@runs;
117             }
118              
119              
120             sub _relationships {
121 1     1   5 my $self = shift;
122              
123             # xml that describes the relationships for this package part
124 1         4 my $rel_xml = $self->_rels_xml;
125              
126             # parse the relationships and assemble into a sparse array indexed by relationship ids
127 1         69 my @relationships;
128 1         22 while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
129 21         49 my %attrs = parse_attrs($1);
130 21   50     100 $attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
131 21         80 ($attrs{num} = $attrs{Id}) =~ s[^\D+][];
132 21         103 ($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
133 21         156 $relationships[$attrs{num}] = \%attrs;
134             }
135              
136 1         41 return \@relationships;
137             }
138              
139              
140             sub _images {
141 0     0   0 my $self = shift;
142              
143             # get relationship ids associated with images
144 0         0 my %rel_image = map {$_->{Id} => $_->{Target}}
145 0 0       0 grep {$_ && $_->{short_type} eq 'image'}
  0         0  
146             $self->relationships->@*;
147              
148             # get titles and relationship ids of images found within the part contents
149 0         0 my %image;
150 0         0 my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
151             DRAWING:
152 0         0 foreach my $drawing (@drawings) {
153 0 0       0 if ($drawing =~ m[<wp:docPr \s+ (.*?) />
154             .*?
155             <a:blip \s+ r:embed="(\w+)"]x) {
156 0         0 my ($lst_attrs, $rId) = ($1, $2);
157 0         0 my %attrs = parse_attrs($lst_attrs);
158             my $img_id = $attrs{title} || $attrs{descr}
159 0 0 0     0 or next DRAWING;
160              
161 0 0       0 $image{$img_id} = "word/$rel_image{$rId}"
162             or die "couldn't find image for relationship '$rId' associated with image '$img_id'";
163             # NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
164             }
165             }
166              
167 0         0 return \%image;
168             }
169              
170              
171 7     7   20 sub _contents {shift->original_contents}
172              
173             sub _on_new_contents {
174 72     72   138 my $self = shift;
175              
176 72         2638 $self->clear_runs;
177 72         228344 $self->{contents_has_changed} = 1;
178             }
179              
180             #======================================================================
181             # METHODS
182             #======================================================================
183              
184              
185             sub _rels_xml {
186 1     1   4 my ($self, $new_xml) = @_;
187 1         33 my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name;
188 1         35 return $self->surgeon->xml_member($rels_name, $new_xml);
189             }
190              
191              
192             sub zip_member_name {
193 7     7 0 19 my $self = shift;
194 7         223 return sprintf "word/%s.xml", $self->part_name;
195             }
196              
197              
198             sub original_contents {
199 7     7 1 13 my $self = shift;
200              
201 7         222 return $self->surgeon->xml_member($self->zip_member_name);
202             }
203              
204              
205             sub image {
206 0     0 0 0 my ($self, $title, $new_image_content) = @_;
207              
208             # name of the image file within the zip
209 0   0     0 my $zip_member_name = $self->images->{$title}
210             || ($title =~ /^\d+$/ ? "word/media/image$title.png"
211             : die "couldn't find image '$title'");
212              
213             # delegate to Archive::Zip::contents
214 0         0 return $self->surgeon->zip->contents($zip_member_name, $new_image_content);
215             }
216              
217              
218              
219             #======================================================================
220             # CONTENTS RESTITUTION
221             #======================================================================
222              
223             sub indented_contents {
224 0     0 1 0 my $self = shift;
225              
226 0         0 my $dom = XML::LibXML->load_xml(string => $self->contents);
227 0         0 return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string
228             }
229              
230              
231             sub plain_text {
232 2     2 1 32 my $self = shift;
233              
234             # XML contents
235 2         66 my $txt = $self->contents;
236              
237             # replace opening paragraph tags by newlines
238 2         4526 $txt =~ s/(<w:p[ >])/\n$1/g;
239              
240             # replace break tags by newlines
241 2         3902 $txt =~ s[<w:br/>][\n]g;
242              
243             # replace tab nodes by ASCII tabs
244 2         5236 $txt =~ s/<w:tab[^s][^>]*>/\t/g;
245              
246             # remove all remaining XML tags
247 2         15930 $txt =~ s/<[^>]+>//g;
248              
249 2         44 return $txt;
250             }
251              
252              
253              
254              
255             #======================================================================
256             # MODIFYING CONTENTS
257             #======================================================================
258              
259             sub cleanup_XML {
260 15     15 1 85070 my ($self, @merge_args) = @_;
261              
262 15         51 $self->reduce_all_noises;
263 15         73 my $names_of_ASK_fields = $self->unlink_fields;
264 15         63 $self->suppress_bookmarks(@$names_of_ASK_fields);
265 15         51 $self->merge_runs(@merge_args);
266             }
267              
268             sub noise_reduction_regex {
269 105     105 1 196 my ($self, $regex_name) = @_;
270 105 50       273 my $regex = $noise_reduction_regexes{$regex_name}
271             or croak "->noise_reduction_regex('$regex_name') : unknown regex name";
272 105         224 return $regex;
273             }
274              
275             sub reduce_noise {
276 30     30 1 91 my ($self, @noises) = @_;
277              
278             # gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
279 30 100       61 my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;
  150         347  
280              
281             # get contents, apply all regexes, put back the modified contents.
282 30         1049 my $contents = $self->contents;
283 1     1   2755 no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
  1         3  
  1         1597  
284 30         53086 $contents =~ s/$_/$1/g foreach @regexes;
285 30         1072 $self->contents($contents);
286             }
287              
288             sub reduce_all_noises {
289 15     15 1 28 my $self = shift;
290              
291 15         50 $self->reduce_noise(@noise_reduction_list);
292             }
293              
294             sub suppress_bookmarks {
295 15     15 1 36 my ($self, @names_to_erase) = @_;
296              
297             # closure to decide what to do with bookmark contents
298 15         29 my %should_erase_contents = map {($_ => 1)} @names_to_erase;
  2         9  
299             my $deal_with_bookmark_text = sub {
300 18     18   54 my ($bookmark_name, $bookmark_contents) = @_;
301 18 100       372 return $should_erase_contents{$bookmark_name} ? "" : $bookmark_contents;
302 15         82 };
303              
304             # regex to find bookmarks markup
305 15         28 state $bookmark_rx = qr{
306             <w:bookmarkStart # initial tag
307             .+? w:id="(\d+)" # 'id' attribute, bookmark identifier -- capture 1
308             .+? w:name="([^"]+)" # 'name' attribute -- capture 2
309             .*? /> # end of this tag
310             (.*?) # bookmark contents (may be empty) -- capture 3
311             <w:bookmarkEnd # ending tag
312             \s+ w:id="\1" # same 'id' attribute
313             .*? /> # end of this tag
314             }sx;
315              
316             # remove bookmarks markup
317 15         476 my $contents = $self->contents;
318 15         4092 $contents =~ s{$bookmark_rx}{$deal_with_bookmark_text->($2, $3)}eg;
  18         45  
319              
320             # re-inject the modified contents
321 15         497 $self->contents($contents);
322             }
323              
324             sub merge_runs {
325 15     15 1 43 my ($self, %args) = @_;
326              
327             # check validity of received args
328 15         32 state $is_valid_arg = {no_caps => 1};
329             $is_valid_arg->{$_} or croak "merge_runs(): invalid arg: $_"
330 15   50     56 foreach keys %args;
331              
332 15         29 my @new_runs;
333             # loop over internal "run" objects
334 15         23 foreach my $run (@{$self->runs}) {
  15         472  
335              
336 1844 100       5147 $run->remove_caps_property if $args{no_caps};
337              
338             # check if the current run can be merged with the previous one
339 1844 100 66     52337 if ( !$run->xml_before # no other XML markup between the 2 runs
      100        
340             && @new_runs # there was a previous run
341             && $new_runs[-1]->props eq $run->props # both runs have the same properties
342             ) {
343             # conditions are OK, so merge this run with the previous one
344 277         689 $new_runs[-1]->merge($run);
345             }
346             else {
347             # conditions not OK, just push this run without merging
348 1567         3994 push @new_runs, $run;
349             }
350             }
351              
352             # reassemble the whole stuff and inject it as new contents
353 15         64 $self->contents(join "", map {$_->as_xml} @new_runs);
  1567         4116  
354             }
355              
356              
357              
358             sub unlink_fields {
359 15     15 1 35 my $self = shift;
360              
361             # must find out what are the ASK fields before erasing the markup
362 15         31 state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
363 15         475 my $contents = $self->contents;
364 15         610 my @names_of_ASK_fields = $contents =~ /$ask_field_rx/g;
365              
366              
367             # regexes to remove field nodes and "field instruction" nodes
368 15         39 state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
369 15         36 state $field_boundary_rx = qr[<w:fldChar
370             (?: [^>]*?/> # ignore all attributes until end of node ..
371             | # .. or
372             [^>]*?>.*?</w:fldChar>) # .. ignore node content until closing tag
373             ]x; # field boundaries are encoded as "begin" / "separate" / "end"
374 15         27 state $simple_field_rx = qr[</?w:fldSimple[^>]*>];
375              
376             # apply the regexes
377 15         48 $self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);
378              
379 15         49 return \@names_of_ASK_fields;
380             }
381              
382              
383             sub replace {
384 11     11 1 61 my ($self, $pattern, $replacement_callback, %replacement_args) = @_;
385              
386             # cleanup the XML structure so that replacements work better
387 11         25 my $keep_xml_as_is = delete $replacement_args{keep_xml_as_is};
388 11 100       41 $self->cleanup_XML unless $keep_xml_as_is;
389              
390             # check for presences of a special option to avoid modying contents
391 11         168847 my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};
392              
393             # apply replacements and generate new XML
394             my $xml = join "",
395 11         401 map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*;
  1049         4084  
396              
397             # overwrite previous contents
398 11 50       552 $self->contents($xml) unless $dont_overwrite_contents;
399              
400 11         111 return $xml;
401             }
402              
403              
404             sub _update_contents_in_zip { # called for each part before saving the zip file
405 0     0   0 my $self = shift;
406              
407             $self->surgeon->xml_member($self->zip_member_name, $self->contents)
408 0 0       0 if $self->{contents_has_changed};
409             }
410              
411              
412             sub replace_image {
413 0     0 1 0 my ($self, $image_title, $image_PNG_content) = @_;
414              
415 0 0       0 my $member_name = $self->images->{$image_title}
416             or die "could not find an image with title: $image_title";
417 0         0 $self->surgeon->zip->contents($member_name, $image_PNG_content);
418             }
419              
420              
421              
422             sub add_image {
423 0     0 1 0 my ($self, $image_PNG_content) = @_;
424              
425             # compute a fresh image number and a fresh relationship id
426 0         0 my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
427 0         0 my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members;
  0         0  
428 0   0     0 my $last_img_num = max @image_nums // 0;
429 0         0 my $target = sprintf "media/image%d.png", $last_img_num + 1;
430 0         0 my $last_rId_num = $self->relationships->$#*;
431 0         0 my $rId = sprintf "rId%d", $last_rId_num + 1;
432              
433             # assemble XML for the new relationship
434 0         0 my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
435 0         0 my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};
436              
437             # update the rels member
438 0         0 my $xml = $self->_rels_xml;
439 0         0 $xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
440 0         0 $self->_rels_xml($xml);
441              
442             # add the image as a new member into the archive
443 0         0 my $member_name = "word/$target";
444 0         0 $self->surgeon->zip->addString(\$image_PNG_content, $member_name);
445              
446             # update the global content_types if it doesn't include PNG
447 0         0 my $ct = $self->surgeon->_content_types;
448 0 0       0 if ($ct !~ /Extension="png"/) {
449 0         0 $ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
450 0         0 $self->surgeon->_content_types($ct);
451             }
452              
453             # return the relationship id
454 0         0 return $rId;
455             }
456              
457              
458              
459             #======================================================================
460             # UTILITY FUNCTIONS
461             #======================================================================
462              
463              
464             sub parse_attrs { # cheap parsing of attribute lists in an XML node
465 21     21 0 56 my ($lst_attrs) = @_;
466              
467 21         29 state $attr_pair_regex = qr[
468             ([^=\s"'&<>]+) # attribute name
469             \h* = \h* # Eq
470             (?: # attribute value
471             " ([^<"]*) " # .. enclosed in double quotes
472             |
473             ' ([^<']*) ' # .. or enclosed in single quotes
474             )
475             ]x;
476              
477 21         35 state $entity = {quot => '"', amp => '&', 'lt' => '<', gt => '>'};
478 21         35 state $entity_names = join "|", keys %$entity;
479              
480 21         26 my %attr;
481 21         141 while ($lst_attrs =~ /$attr_pair_regex/g) {
482 69   33     241 my ($name, $val) = ($1, $2 // $3);
483 69         136 $val =~ s/&(entity_names);/$entity->{$1}/eg;
  0         0  
484 69         477 $attr{$name} = $val;
485             }
486              
487 21         132 return %attr;
488             }
489              
490              
491             1;
492              
493             __END__
494              
495             =encoding ISO-8859-1
496              
497             =head1 NAME
498              
499             MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document
500              
501             =head1 DESCRIPTION
502              
503             This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single
504             I<package part> within the ZIP package of a C<.docx> document.
505             It is mostly used for the I<document> part, that contains the XML representation of the
506             main document body. However, other parts such as headers, footers, footnotes, etc. have the
507             same internal representation and therefore the same operations can be invoked.
508              
509              
510             =head1 METHODS
511              
512             =head2 new
513              
514             my $run = MsOffice::Word::Surgeon::PackagePart->new(
515             surgeon => $surgeon,
516             part_name => $name,
517             );
518              
519             Constructor for a new part object. This is called internally from
520             L<MsOffice::Word::Surgeon>; it is not meant to be called directly
521             by clients.
522              
523             =head3 Constructor arguments
524              
525              
526             =over
527              
528             =item surgeon
529              
530             a weak reference to the main surgeon object
531              
532             =item part_name
533              
534             ZIP member name of this part
535              
536             =back
537              
538             =head3 Other attributes
539              
540             Other attributes, which are not passed through the constructor but are generated lazily on demand, are :
541              
542             =over
543              
544             =item contents
545              
546             the XML contents of this part
547              
548             =item runs
549              
550             a decomposition of the XML contents into a collection of
551             L<MsOffice::Word::Surgeon::Run> objects.
552              
553             =item relationships
554              
555             an arrayref of Office relationships associated with this part. This information comes from
556             a C<.rels> member in the ZIP archive, named after the name of the package part.
557             Array indices correspond to relationship numbers. Array values are hashrefs with
558             keys
559              
560             =over
561              
562             =item Id
563              
564             the full relationship id
565              
566             =item num
567              
568             the numeric part of C<rId>
569              
570             =item Type
571              
572             the full reference to the XML schema for this relationship
573              
574             =item short_type
575              
576             only the last word of the type, e.g. 'image', 'style', etc.
577              
578             =item Target
579              
580             designation of the target within the ZIP file. The prefix 'word/' must be
581             added for having a complete Zip member name.
582              
583             =back
584              
585              
586              
587             =item images
588              
589             a hashref of images within this package part. Keys of the hash are image I<alternative texts>.
590             If present, the alternative I<title> will be prefered; otherwise the alternative I<description> will be taken
591             (note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display
592             the I<description> field -- see
593             L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669|MsOffice documentation>).
594              
595             Images without alternative text will not be accessible through the current Perl module.
596              
597             Values of the hash are zip member names for the corresponding
598             image representations in C<.png> format.
599              
600              
601             =back
602              
603              
604             =head2 Contents restitution
605              
606             =head3 contents
607              
608             Returns a Perl string with the current internal XML representation of the part
609             contents.
610              
611             =head3 original_contents
612              
613             Returns a Perl string with the XML representation of the
614             part contents, as it was in the ZIP archive before any
615             modification.
616              
617             =head3 indented_contents
618              
619             Returns an indented version of the XML contents, suitable for inspection in a text editor.
620             This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded
621             byte string, not a Perl string.
622              
623             =head3 plain_text
624              
625             Returns the text contents of the part, without any markup.
626             Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored.
627              
628              
629             =head3 runs
630              
631             Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of
632             these objects holds an XML fragment; joining all fragments
633             restores the complete document.
634              
635             my $contents = join "", map {$_->as_xml} $self->runs;
636              
637              
638             =head2 Modifying contents
639              
640              
641             =head3 cleanup_XML
642              
643             $part->cleanup_XML;
644              
645             Apply several other methods for removing unnecessary nodes within the internal
646             XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>,
647             L</suppress_bookmarks> and L</merge_runs>.
648              
649              
650             =head3 reduce_noise
651              
652             $part->reduce_noise($regex1, $regex2, ...);
653              
654             This method is used for removing unnecessary information in the XML
655             markup. It applies the given list of regexes to the whole document,
656             suppressing matches. The final result is put back into
657             C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >>
658             references, or as names of builtin regexes (described below). Regexes
659             are applied to the whole XML contents, not only to run nodes.
660              
661              
662             =head3 noise_reduction_regex
663              
664             my $regex = $part->noise_reduction_regex($regex_name);
665              
666             Returns the builtin regex corresponding to the given name.
667             Known regexes are :
668              
669             proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
670             revision_ids => qr(\sw:rsid\w+="[^"]+"),
671             complex_script_bold => qr(<w:bCs/>),
672             page_breaks => qr(<w:lastRenderedPageBreak/>),
673             language => qr(<w:lang w:val="[^/>]+/>),
674             empty_run_props => qr(<w:rPr></w:rPr>),
675             soft_hyphens => qr(<w:softHyphen/>),
676              
677             =head3 reduce_all_noises
678              
679             $part->reduce_all_noises;
680              
681             Applies all regexes from the previous method.
682              
683             =head3 unlink_fields
684              
685             my $names_of_ASK_fields = $part->unlink_fields;
686              
687             Removes all fields from the part, just leaving the current
688             value stored in each field. This is the equivalent of performing Ctrl-Shift-F9
689             on the whole document.
690              
691             The return value is an arrayref to a list of names of ASK fields within the document.
692             Such names should then be passed to the L</suppress_bookmarks> method
693             (see below).
694              
695              
696             =head3 suppress_bookmarks
697              
698             $part->suppress_bookmarks(@names_to_erase);
699              
700             Removes bookmarks markup in the part. This is useful because
701             MsWord may silently insert bookmarks in unexpected places; therefore
702             some searches within the text may fail because of such bookmarks.
703              
704             By default, this method only removes the bookmarks markup, leaving
705             intact the contents of the bookmark. However, when the name of a
706             bookmark belongs to the list C<< @names_to_erase >>, the contents
707             is also removed. Currently this is used for suppressing ASK fields,
708             because such fields contain a bookmark content that is never displayed by MsWord.
709              
710              
711              
712             =head3 merge_runs
713              
714             $part->merge_runs(no_caps => 1); # optional arg
715              
716             Walks through all runs of text within the document, trying to merge
717             adjacent runs when possible (i.e. when both runs have the same
718             properties, and there is no other XML node inbetween).
719              
720             This operation is a prerequisite before performing replace operations, because
721             documents edited in MsWord often have run boundaries across sentences or
722             even in the middle of words; so regex searches can only be successful if those
723             artificial boundaries have been removed.
724              
725             If the argument C<< no_caps => 1 >> is present, the merge operation
726             will also convert runs with the C<w:caps> property, putting all letters
727             into uppercase and removing the property; this makes more merges possible.
728              
729              
730             =head3 replace
731              
732             $part->replace($pattern, $replacement, %replacement_args);
733              
734             Replaces all occurrences of C<$pattern> regex within the text nodes by the
735             given C<$replacement>. This is not exactly like a search-replace
736             operation performed within MsWord, because the search does not cross boundaries
737             of text nodes. In order to maximize the chances of successful replacements,
738             the L</cleanup_XML> method is automatically called before starting the operation.
739              
740             The argument C<$pattern> can be either a string or a reference to a regular expression.
741             It should not contain any capturing parentheses, because that would perturb text
742             splitting operations.
743              
744             The argument C<$replacement> can be either a fixed string, or a reference to
745             a callback subroutine that will be called for each match.
746              
747              
748             The C<< %replacement_args >> hash can be used to pass information to the callback
749             subroutine. That hash will be enriched with three entries :
750              
751             =over
752              
753             =item matched
754              
755             The string that has been matched by C<$pattern>.
756              
757             =item run
758              
759             The run object in which this text resides.
760              
761             =item xml_before
762              
763             The XML fragment (possibly empty) found before the matched text .
764              
765             =back
766              
767             The callback subroutine may return either plain text or structured XML.
768             See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback.
769              
770             The following special keys within C<< %replacement_args >> are interpreted by the
771             C<replace()> method itself, and therefore are not passed to the callback subroutine :
772              
773             =over
774              
775             =item keep_xml_as_is
776              
777             if true, no call is made to the L</cleanup_XML> method before performing the replacements
778              
779             =item dont_overwrite_contents
780              
781             if true, the internal XML contents is not modified in place; the new XML after performing
782             replacements is merely returned to the caller.
783              
784             =back
785              
786              
787             =head3 replace_image
788              
789             $part->replace_image($image_alt_text, $image_PNG_content);
790              
791             Replaces an existing PNG image by a new image. All features of the old image will
792             be preserved (size, positioning, border, etc.) -- only the image itself will be
793             replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word
794             for this image.
795              
796             This operation replaces a ZIP member within the C<.docx> file. If several XML
797             nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several
798             locations, the new image will appear at all locations, even if they do not have the
799             same alternative text -- unfortunately this module currently has no facility for
800             duplicating an existing image into separate instances. So if your intent is to only replace
801             one image, your original document should contain several distinct images, coming from
802             several distinct C<.PNG> file copies.
803              
804              
805             =head3 add_image
806              
807             my $rId = $part->add_image($image_PNG_content);
808              
809             Stores the given PNG image within the ZIP file, adds it as a relationship to the
810             current part, and returns the relationship id. This operation is not sufficient
811             to make the image visible in Word : it just stores the image, but you still
812             have to insert a proper C<drawing> node in the contents XML, using the C<$rId>.
813             Future versions of this module may offer helper methods for that purpose;
814             currently it must be done by hand.
815              
816              
817             =head1 AUTHOR
818              
819             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
820              
821             =head1 COPYRIGHT AND LICENSE
822              
823             Copyright 2019-2022 by Laurent Dami.
824              
825             This library is free software; you can redistribute it and/or modify
826             it under the same terms as Perl itself.
827              
828              
829