File Coverage

blib/lib/MsOffice/Word/Surgeon/PackagePart.pm
Criterion Covered Total %
statement 158 206 76.7
branch 21 44 47.7
condition 24 40 60.0
subroutine 31 37 83.7
pod 13 16 81.2
total 247 343 72.0


line stmt bran cond sub pod time code
1             package MsOffice::Word::Surgeon::PackagePart;
2 1     1   13 use 5.24.0;
  1         4  
3 1     1   13 use Moose;
  1         3  
  1         6  
4 1     1   6418 use MooseX::StrictConstructor;
  1         2  
  1         7  
5 1     1   3277 use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level);
  1         5  
  1         105  
6 1     1   485 use MsOffice::Word::Surgeon::Run;
  1         3  
  1         31  
7 1     1   430 use MsOffice::Word::Surgeon::Text;
  1         13  
  1         34  
8 1     1   753 use XML::LibXML;
  1         32756  
  1         6  
9 1     1   168 use List::Util qw(max);
  1         2  
  1         66  
10 1     1   7 use Carp qw(croak carp);
  1         2  
  1         100  
11              
12             # syntactic sugar for attributes
13             sub has_inner ($@) {my $attr = shift; has($attr => @_, lazy => 1, builder => "_$attr", init_arg => undef)}
14              
15             # constant integers to specify indentation modes -- see L<XML::LibXML>
16 1     1   8 use constant XML_NO_INDENT => 0;
  1         2  
  1         56  
17 1     1   6 use constant XML_SIMPLE_INDENT => 1;
  1         2  
  1         59  
18              
19 1     1   8 use namespace::clean -except => 'meta';
  1         3  
  1         9  
20              
21             our $VERSION = '2.03';
22              
23              
24             #======================================================================
25             # ATTRIBUTES
26             #======================================================================
27              
28              
29             # attributes passed to the constructor
30             has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
31             has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
32              
33              
34             # attributes constructed by the module -- not received through the constructor
35             has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
36             has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
37             has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
38             has_inner 'images' => (is => 'ro', isa => 'HashRef');
39              
40             has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
41             has 'was_cleaned_up' => (is => 'bare', isa => 'Bool', default => 0);
42              
43             #======================================================================
44             # GLOBAL VARIABLES
45             #======================================================================
46              
47             # Various regexes for removing uninteresting XML information
48             my %noise_reduction_regexes = (
49             proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
50             revision_ids => qr(\sw:rsid\w+="[^"]+"),
51             complex_script_bold => qr(<w:bCs/>),
52             page_breaks => qr(<w:lastRenderedPageBreak/>),
53             language => qr(<w:lang w:val="[^/>]+/>),
54             empty_run_props => qr(<w:rPr></w:rPr>),
55             soft_hyphens => qr(<w:softHyphen/>),
56             );
57              
58             my @noise_reduction_list = qw/proof_checking revision_ids
59             complex_script_bold page_breaks language
60             empty_run_props soft_hyphens/;
61              
62             #======================================================================
63             # LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
64             #======================================================================
65              
66              
67             sub _runs {
68 12     12   28 my $self = shift;
69              
70 12         31 state $run_regex = qr[
71             <w:r> # opening tag for the run
72             (?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
73             (.*?) # run contents -- capture in $2
74             </w:r> # closing tag for the run
75             ]x;
76              
77 12         30 state $txt_regex = qr[
78             <w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
79             (.*?) # text contents -- capture in $1
80             </w:t> # closing tag for text
81             ]x;
82              
83              
84             # split XML content into run fragments
85 12         392 my $contents = $self->contents;
86 12         12713 my @run_fragments = split m[$run_regex], $contents, -1;
87 12         36 my @runs;
88              
89             # build internal RUN objects
90             RUN:
91 12         61 while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
92 2004   100     2360127 $run_contents //= '';
93              
94             # split XML of this run into text fragmentsn
95 2004         23261 my @txt_fragments = split m[$txt_regex], $run_contents, -1;
96 2004         4103 my @texts;
97              
98             # build internal TEXT objects
99             TXT:
100 2004         6733 while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
101 3634 100 66     1636291 next TXT if !$xml_before_text && ( !(defined $txt_contents) || $txt_contents eq '');
      66        
102 1840   50     7963 push @texts, MsOffice::Word::Surgeon::Text->new(
      100        
103             xml_before => $xml_before_text // '',
104             literal_text => $txt_contents // '',
105             );
106             }
107              
108             # assemble TEXT objects into a RUN object
109 2004 100 100     42853 next RUN if !$xml_before_run && !@texts;
110 1852   50     8180 push @runs, MsOffice::Word::Surgeon::Run->new(
      100        
111             xml_before => $xml_before_run // '',
112             props => $props // '',
113             inner_texts => \@texts,
114             );
115             }
116              
117 12         17285 return \@runs;
118             }
119              
120              
121             sub _relationships {
122 1     1   3 my $self = shift;
123              
124             # xml that describes the relationships for this package part
125 1         3 my $rel_xml = $self->_rels_xml;
126              
127             # parse the relationships and assemble into a sparse array indexed by relationship ids
128 1         77 my @relationships;
129 1         24 while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
130 21         47 my %attrs = parse_attrs($1);
131 21   50     94 $attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
132 21         76 ($attrs{num} = $attrs{Id}) =~ s[^\D+][];
133 21         98 ($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
134 21         150 $relationships[$attrs{num}] = \%attrs;
135             }
136              
137 1         40 return \@relationships;
138             }
139              
140              
141             sub _images {
142 0     0   0 my $self = shift;
143              
144             # get relationship ids associated with images
145 0         0 my %rel_image = map {$_->{Id} => $_->{Target}}
146 0 0       0 grep {$_ && $_->{short_type} eq 'image'}
  0         0  
147             $self->relationships->@*;
148              
149             # get titles and relationship ids of images found within the part contents
150 0         0 my %image;
151 0         0 my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
152             DRAWING:
153 0         0 foreach my $drawing (@drawings) {
154 0 0       0 if ($drawing =~ m[<wp:docPr \s+ (.*?) />
155             .*?
156             <a:blip \s+ r:embed="(\w+)"]x) {
157 0         0 my ($lst_attrs, $rId) = ($1, $2);
158 0         0 my %attrs = parse_attrs($lst_attrs);
159             my $img_id = $attrs{title} || $attrs{descr}
160 0 0 0     0 or next DRAWING;
161              
162 0 0       0 $image{$img_id} = "word/$rel_image{$rId}"
163             or die "couldn't find image for relationship '$rId' associated with image '$img_id'";
164             # NOTE: targets in the rels XML miss the "word/" prefix, I don't know why.
165             }
166             }
167              
168 0         0 return \%image;
169             }
170              
171              
172 7     7   20 sub _contents {shift->original_contents}
173              
174             sub _on_new_contents {
175 16     16   34 my $self = shift;
176              
177 16         607 $self->clear_runs;
178 16         219154 $self->{contents_has_changed} = 1;
179 16         640 $self->{was_cleaned_up} = 0;
180             }
181              
182             #======================================================================
183             # METHODS
184             #======================================================================
185              
186              
187             sub _rels_xml {
188 1     1   3 my ($self, $new_xml) = @_;
189 1         60 my $rels_name = sprintf "word/_rels/%s.xml.rels", $self->part_name;
190 1         35 return $self->surgeon->xml_member($rels_name, $new_xml);
191             }
192              
193              
194             sub zip_member_name {
195 7     7 0 14 my $self = shift;
196 7         248 return sprintf "word/%s.xml", $self->part_name;
197             }
198              
199              
200             sub original_contents {
201 7     7 1 11 my $self = shift;
202              
203 7         221 return $self->surgeon->xml_member($self->zip_member_name);
204             }
205              
206              
207             sub image {
208 0     0 0 0 my ($self, $title, $new_image_content) = @_;
209              
210             # name of the image file within the zip
211 0   0     0 my $zip_member_name = $self->images->{$title}
212             || ($title =~ /^\d+$/ ? "word/media/image$title.png"
213             : die "couldn't find image '$title'");
214              
215             # delegate to Archive::Zip::contents
216 0         0 return $self->surgeon->zip->contents($zip_member_name, $new_image_content);
217             }
218              
219              
220              
221             #======================================================================
222             # CONTENTS RESTITUTION
223             #======================================================================
224              
225             sub indented_contents {
226 0     0 1 0 my $self = shift;
227              
228 0         0 my $dom = XML::LibXML->load_xml(string => $self->contents);
229 0         0 return $dom->toString(XML_SIMPLE_INDENT); # returned as bytes sequence, not a Perl string
230             }
231              
232              
233             sub plain_text {
234 2     2 1 30 my $self = shift;
235              
236             # XML contents
237 2         68 my $txt = $self->contents;
238              
239             # replace opening paragraph tags by newlines
240 2         4575 $txt =~ s/(<w:p[ >])/\n$1/g;
241              
242             # replace break tags by newlines
243 2         3982 $txt =~ s[<w:br/>][\n]g;
244              
245             # replace tab nodes by ASCII tabs
246 2         5210 $txt =~ s/<w:tab[^s][^>]*>/\t/g;
247              
248             # remove all remaining XML tags
249 2         15908 $txt =~ s/<[^>]+>//g;
250              
251 2         40 return $txt;
252             }
253              
254              
255              
256              
257             #======================================================================
258             # MODIFYING CONTENTS
259             #======================================================================
260              
261             sub cleanup_XML {
262 2     2 1 10 my ($self, @merge_args) = @_;
263              
264             # avoid doing it twice
265 2 100       8 return if $self->{was_cleaned_up};
266              
267             # do the cleanup
268 1         5 $self->reduce_all_noises;
269 1         7 my $names_of_ASK_fields = $self->unlink_fields;
270 1         8 $self->suppress_bookmarks(@$names_of_ASK_fields);
271 1         6 $self->merge_runs(@merge_args);
272              
273             # remember it was done
274 1         83302 $self->{was_cleaned_up} = 1;
275             }
276              
277             sub noise_reduction_regex {
278 7     7 1 13 my ($self, $regex_name) = @_;
279 7 50       21 my $regex = $noise_reduction_regexes{$regex_name}
280             or croak "->noise_reduction_regex('$regex_name') : unknown regex name";
281 7         18 return $regex;
282             }
283              
284             sub reduce_noise {
285 2     2 1 8 my ($self, @noises) = @_;
286              
287             # gather regexes to apply, given either directly as regex refs, or as names of builtin regexes
288 2 100       5 my @regexes = map {ref $_ eq 'Regexp' ? $_ : $self->noise_reduction_regex($_)} @noises;
  10         27  
289              
290             # get contents, apply all regexes, put back the modified contents.
291 2         74 my $contents = $self->contents;
292 1     1   2715 no warnings 'uninitialized'; # for regexes without capture groups, $1 will be undef
  1         2  
  1         1775  
293 2         25901 $contents =~ s/$_/$1/g foreach @regexes;
294 2         98 $self->contents($contents);
295             }
296              
297             sub reduce_all_noises {
298 1     1 1 3 my $self = shift;
299              
300 1         4 $self->reduce_noise(@noise_reduction_list);
301             }
302              
303             sub suppress_bookmarks {
304 1     1 1 5 my ($self, @names_to_erase) = @_;
305              
306             # closure to decide what to do with bookmark contents
307 1         3 my %should_erase_contents = map {($_ => 1)} @names_to_erase;
  2         10  
308             my $deal_with_bookmark_text = sub {
309 17     17   60 my ($bookmark_name, $bookmark_contents) = @_;
310 17 100       733 return $should_erase_contents{$bookmark_name} ? "" : $bookmark_contents;
311 1         10 };
312              
313             # regex to find bookmarks markup
314 1         5 state $bookmark_rx = qr{
315             <w:bookmarkStart # initial tag
316             .+? w:id="(\d+)" # 'id' attribute, bookmark identifier -- capture 1
317             .+? w:name="([^"]+)" # 'name' attribute -- capture 2
318             .*? /> # end of this tag
319             (.*?) # bookmark contents (may be empty) -- capture 3
320             <w:bookmarkEnd # ending tag
321             \s+ w:id="\1" # same 'id' attribute
322             .*? /> # end of this tag
323             }sx;
324              
325             # remove bookmarks markup
326 1         35 my $contents = $self->contents;
327 1         1453 $contents =~ s{$bookmark_rx}{$deal_with_bookmark_text->($2, $3)}eg;
  17         50  
328              
329             # re-inject the modified contents
330 1         45 $self->contents($contents);
331             }
332              
333             sub merge_runs {
334 1     1 1 8 my ($self, %args) = @_;
335              
336             # check validity of received args
337 1         5 state $is_valid_arg = {no_caps => 1};
338 1         7 my @invalid_args = grep {!$is_valid_arg->{$_}} keys %args;
  1         6  
339 1 50       4 croak "merge_runs(): invalid arg(s): " . join ", ", @invalid_args if @invalid_args;
340              
341 1         3 my @new_runs;
342             # loop over internal "run" objects
343 1         3 foreach my $run (@{$self->runs}) {
  1         34  
344              
345 793 50       3190 $run->remove_caps_property if $args{no_caps};
346              
347             # check if the current run can be merged with the previous one
348 793 100 66     23080 if ( !$run->xml_before # no other XML markup between the 2 runs
      100        
349             && @new_runs # there was a previous run
350             && $new_runs[-1]->props eq $run->props # both runs have the same properties
351             ) {
352             # conditions are OK, so merge this run with the previous one
353 272         638 $new_runs[-1]->merge($run);
354             }
355             else {
356             # conditions not OK, just push this run without merging
357 521         1430 push @new_runs, $run;
358             }
359             }
360              
361             # reassemble the whole stuff and inject it as new contents
362 1         15 $self->contents(join "", map {$_->as_xml} @new_runs);
  521         1194  
363             }
364              
365              
366              
367             sub unlink_fields {
368 1     1 1 2 my $self = shift;
369              
370             # must find out what are the ASK fields before erasing the markup
371 1         7 state $ask_field_rx = qr[<w:instrText[^>]+?>\s+ASK\s+(\w+)];
372 1         35 my $contents = $self->contents;
373 1         232 my @names_of_ASK_fields = $contents =~ /$ask_field_rx/g;
374              
375              
376             # regexes to remove field nodes and "field instruction" nodes
377 1         6 state $field_instruction_txt_rx = qr[<w:instrText.*?</w:instrText>];
378 1         6 state $field_boundary_rx = qr[<w:fldChar
379             (?: [^>]*?/> # ignore all attributes until end of node ..
380             | # .. or
381             [^>]*?>.*?</w:fldChar>) # .. ignore node content until closing tag
382             ]x; # field boundaries are encoded as "begin" / "separate" / "end"
383 1         3 state $simple_field_rx = qr[</?w:fldSimple[^>]*>];
384              
385             # apply the regexes
386 1         5 $self->reduce_noise($field_instruction_txt_rx, $field_boundary_rx, $simple_field_rx);
387              
388 1         10 return \@names_of_ASK_fields;
389             }
390              
391              
392             sub replace {
393 11     11 1 68 my ($self, $pattern, $replacement_callback, %replacement_args) = @_;
394              
395             # shared initial string for error messages
396 11         26 my $error_msg = '->replace($pattern, $callback, %args)';
397              
398             # default value for arg 'cleanup_XML', possibly from deprecated arg 'keep_xml_as_is'
399 11 50       35 if (delete $replacement_args{keep_xml_as_is}) {
400             not exists $replacement_args{cleanup_XML}
401 0 0       0 or croak "$error_msg: deprecated arg 'keep_xml_as_is' conflicts with arg 'cleanup_XML'";
402 0         0 carp "$error_msg: arg 'keep_xml_as_is' is deprecated, use 'cleanup_XML' instead";
403 0         0 $replacement_args{cleanup_XML} = 0;
404             }
405             else {
406 11   100     35 $replacement_args{cleanup_XML} //= 1; # default
407             }
408              
409             # cleanup the XML structure so that replacements work better
410 11 100       31 if (my $cleanup_args = $replacement_args{cleanup_XML}) {
411 1 50       6 $cleanup_args = {} if ! ref $cleanup_args;
412 1 50       7 ref $cleanup_args eq 'HASH'
413             or croak "$error_msg: arg 'cleanup_XML' should be a hashref";
414 1         5 $self->cleanup_XML(%$cleanup_args);
415             }
416              
417             # check for presences of a special option to avoid modying contents
418 11         22 my $dont_overwrite_contents = delete $replacement_args{dont_overwrite_contents};
419              
420             # apply replacements and generate new XML
421             my $xml = join "",
422 11         379 map {$_->replace($pattern, $replacement_callback, %replacement_args)} $self->runs->@*;
  1059         3701  
423              
424             # overwrite previous contents
425 11 50       562 $self->contents($xml) unless $dont_overwrite_contents;
426              
427 11         117 return $xml;
428             }
429              
430              
431             sub _update_contents_in_zip { # called for each part before saving the zip file
432 0     0   0 my $self = shift;
433              
434             $self->surgeon->xml_member($self->zip_member_name, $self->contents)
435 0 0       0 if $self->{contents_has_changed};
436             }
437              
438              
439             sub replace_image {
440 0     0 1 0 my ($self, $image_title, $image_PNG_content) = @_;
441              
442 0 0       0 my $member_name = $self->images->{$image_title}
443             or die "could not find an image with title: $image_title";
444 0         0 $self->surgeon->zip->contents($member_name, $image_PNG_content);
445             }
446              
447              
448              
449             sub add_image {
450 0     0 1 0 my ($self, $image_PNG_content) = @_;
451              
452             # compute a fresh image number and a fresh relationship id
453 0         0 my @image_members = $self->surgeon->zip->membersMatching(qr[^word/media/image]);
454 0         0 my @image_nums = map {$_->fileName =~ /(\d+)/} @image_members;
  0         0  
455 0   0     0 my $last_img_num = max @image_nums // 0;
456 0         0 my $target = sprintf "media/image%d.png", $last_img_num + 1;
457 0         0 my $last_rId_num = $self->relationships->$#*;
458 0         0 my $rId = sprintf "rId%d", $last_rId_num + 1;
459              
460             # assemble XML for the new relationship
461 0         0 my $type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
462 0         0 my $new_rel_xml = qq{<Relationship Id="$rId" Type="$type" Target="$target"/>};
463              
464             # update the rels member
465 0         0 my $xml = $self->_rels_xml;
466 0         0 $xml =~ s[</Relationships>][$new_rel_xml</Relationships>];
467 0         0 $self->_rels_xml($xml);
468              
469             # add the image as a new member into the archive
470 0         0 my $member_name = "word/$target";
471 0         0 $self->surgeon->zip->addString(\$image_PNG_content, $member_name);
472              
473             # update the global content_types if it doesn't include PNG
474 0         0 my $ct = $self->surgeon->_content_types;
475 0 0       0 if ($ct !~ /Extension="png"/) {
476 0         0 $ct =~ s[(<Types[^>]+>)][$1<Default Extension="png" ContentType="image/png"/>];
477 0         0 $self->surgeon->_content_types($ct);
478             }
479              
480             # return the relationship id
481 0         0 return $rId;
482             }
483              
484              
485              
486             #======================================================================
487             # UTILITY FUNCTIONS
488             #======================================================================
489              
490              
491             sub parse_attrs { # cheap parsing of attribute lists in an XML node
492 21     21 0 50 my ($lst_attrs) = @_;
493              
494 21         37 state $attr_pair_regex = qr[
495             ([^=\s"'&<>]+) # attribute name
496             \h* = \h* # Eq
497             (?: # attribute value
498             " ([^<"]*) " # .. enclosed in double quotes
499             |
500             ' ([^<']*) ' # .. or enclosed in single quotes
501             )
502             ]x;
503              
504 21         36 state $entity = {quot => '"', amp => '&', 'lt' => '<', gt => '>'};
505 21         30 state $entity_names = join "|", keys %$entity;
506              
507 21         28 my %attr;
508 21         130 while ($lst_attrs =~ /$attr_pair_regex/g) {
509 69   33     250 my ($name, $val) = ($1, $2 // $3);
510 69         135 $val =~ s/&(entity_names);/$entity->{$1}/eg;
  0         0  
511 69         451 $attr{$name} = $val;
512             }
513              
514 21         112 return %attr;
515             }
516              
517              
518             1;
519              
520             __END__
521              
522             =encoding ISO-8859-1
523              
524             =head1 NAME
525              
526             MsOffice::Word::Surgeon::PackagePart - Operations on a single part within the ZIP package of a docx document
527              
528             =head1 SYNOPSIS
529              
530             my $part = $surgeon->document;
531             print $part->plain_text;
532             $part->replace(qr[$pattern], $replacement_callback);
533             $part->replace_image($image_alt_text, $image_PNG_content);
534              
535              
536             =head1 DESCRIPTION
537              
538             This class is part of L<MsOffice::Word::Surgeon>; it encapsulates operations for a single
539             I<package part> within the ZIP package of a C<.docx> document.
540             It is mostly used for the I<document> part, that contains the XML representation of the
541             main document body. However, other parts such as headers, footers, footnotes, etc. have the
542             same internal representation and therefore the same operations can be invoked.
543              
544              
545             =head1 METHODS
546              
547             =head2 new
548              
549             my $run = MsOffice::Word::Surgeon::PackagePart->new(
550             surgeon => $surgeon,
551             part_name => $name,
552             );
553              
554             Constructor for a new part object. This is called internally from
555             L<MsOffice::Word::Surgeon>; it is not meant to be called directly
556             by clients.
557              
558             =head3 Constructor arguments
559              
560              
561             =over
562              
563             =item surgeon
564              
565             a weak reference to the main surgeon object
566              
567             =item part_name
568              
569             ZIP member name of this part
570              
571             =back
572              
573             =head3 Other attributes
574              
575             Other attributes, which are not passed through the constructor but are generated lazily on demand, are :
576              
577             =over
578              
579             =item contents
580              
581             the XML contents of this part
582              
583             =item runs
584              
585             a decomposition of the XML contents into a collection of
586             L<MsOffice::Word::Surgeon::Run> objects.
587              
588             =item relationships
589              
590             an arrayref of Office relationships associated with this part. This information comes from
591             a C<.rels> member in the ZIP archive, named after the name of the package part.
592             Array indices correspond to relationship numbers. Array values are hashrefs with
593             keys
594              
595             =over
596              
597             =item Id
598              
599             the full relationship id
600              
601             =item num
602              
603             the numeric part of C<rId>
604              
605             =item Type
606              
607             the full reference to the XML schema for this relationship
608              
609             =item short_type
610              
611             only the last word of the type, e.g. 'image', 'style', etc.
612              
613             =item Target
614              
615             designation of the target within the ZIP file. The prefix 'word/' must be
616             added for having a complete Zip member name.
617              
618             =back
619              
620              
621              
622             =item images
623              
624             a hashref of images within this package part. Keys of the hash are image I<alternative texts>.
625             If present, the alternative I<title> will be prefered; otherwise the alternative I<description> will be taken
626             (note : the I<title> field was displayed in Office 2013 and 2016, but more recent versions only display
627             the I<description> field -- see
628             L<https://support.microsoft.com/en-us/office/add-alternative-text-to-a-shape-picture-chart-smartart-graphic-or-other-object-44989b2a-903c-4d9a-b742-6a75b451c669|MsOffice documentation>).
629              
630             Images without alternative text will not be accessible through the current Perl module.
631              
632             Values of the hash are zip member names for the corresponding
633             image representations in C<.png> format.
634              
635              
636             =back
637              
638              
639             =head2 Contents restitution
640              
641             =head3 contents
642              
643             Returns a Perl string with the current internal XML representation of the part
644             contents.
645              
646             =head3 original_contents
647              
648             Returns a Perl string with the XML representation of the
649             part contents, as it was in the ZIP archive before any
650             modification.
651              
652             =head3 indented_contents
653              
654             Returns an indented version of the XML contents, suitable for inspection in a text editor.
655             This is produced by L<XML::LibXML::Document/toString> and therefore is returned as an encoded
656             byte string, not a Perl string.
657              
658             =head3 plain_text
659              
660             Returns the text contents of the part, without any markup.
661             Paragraphs and breaks are converted to newlines, all other formatting instructions are ignored.
662              
663              
664             =head3 runs
665              
666             Returns a list of L<MsOffice::Word::Surgeon::Run> objects. Each of
667             these objects holds an XML fragment; joining all fragments
668             restores the complete document.
669              
670             my $contents = join "", map {$_->as_xml} $self->runs;
671              
672              
673             =head2 Modifying contents
674              
675              
676             =head3 cleanup_XML
677              
678             $part->cleanup_XML(%args);
679              
680             Apply several other methods for removing unnecessary nodes within the internal
681             XML. This method successively calls L</reduce_all_noises>, L</unlink_fields>,
682             L</suppress_bookmarks> and L</merge_runs>.
683              
684             Currently there is only one legal arg :
685              
686             =over
687              
688             =item C<no_caps>
689              
690             If true, the method L<MsOffice::Word::Surgeon::Run/remove_caps_property> is automatically
691             called for each run object. As a result, all texts within runs with the C<caps> property are automatically
692             converted to uppercase.
693              
694             =back
695              
696              
697              
698             =head3 reduce_noise
699              
700             $part->reduce_noise($regex1, $regex2, ...);
701              
702             This method is used for removing unnecessary information in the XML
703             markup. It applies the given list of regexes to the whole document,
704             suppressing matches. The final result is put back into
705             C<< $self->contents >>. Regexes may be given either as C<< qr/.../ >>
706             references, or as names of builtin regexes (described below). Regexes
707             are applied to the whole XML contents, not only to run nodes.
708              
709              
710             =head3 noise_reduction_regex
711              
712             my $regex = $part->noise_reduction_regex($regex_name);
713              
714             Returns the builtin regex corresponding to the given name.
715             Known regexes are :
716              
717             proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
718             revision_ids => qr(\sw:rsid\w+="[^"]+"),
719             complex_script_bold => qr(<w:bCs/>),
720             page_breaks => qr(<w:lastRenderedPageBreak/>),
721             language => qr(<w:lang w:val="[^/>]+/>),
722             empty_run_props => qr(<w:rPr></w:rPr>),
723             soft_hyphens => qr(<w:softHyphen/>),
724              
725             =head3 reduce_all_noises
726              
727             $part->reduce_all_noises;
728              
729             Applies all regexes from the previous method.
730              
731             =head3 unlink_fields
732              
733             my $names_of_ASK_fields = $part->unlink_fields;
734              
735             Removes all fields from the part, just leaving the current
736             value stored in each field. This is the equivalent of performing Ctrl-Shift-F9
737             on the whole document.
738              
739             The return value is an arrayref to a list of names of ASK fields within the document.
740             Such names should then be passed to the L</suppress_bookmarks> method
741             (see below).
742              
743              
744             =head3 suppress_bookmarks
745              
746             $part->suppress_bookmarks(@names_to_erase);
747              
748             Removes bookmarks markup in the part. This is useful because
749             MsWord may silently insert bookmarks in unexpected places; therefore
750             some searches within the text may fail because of such bookmarks.
751              
752             By default, this method only removes the bookmarks markup, leaving
753             intact the contents of the bookmark. However, when the name of a
754             bookmark belongs to the list C<< @names_to_erase >>, the contents
755             is also removed. Currently this is used for suppressing ASK fields,
756             because such fields contain a bookmark content that is never displayed by MsWord.
757              
758              
759              
760             =head3 merge_runs
761              
762             $part->merge_runs(no_caps => 1); # optional arg
763              
764             Walks through all runs of text within the document, trying to merge
765             adjacent runs when possible (i.e. when both runs have the same
766             properties, and there is no other XML node inbetween).
767              
768             This operation is a prerequisite before performing replace operations, because
769             documents edited in MsWord often have run boundaries across sentences or
770             even in the middle of words; so regex searches can only be successful if those
771             artificial boundaries have been removed.
772              
773             If the argument C<< no_caps => 1 >> is present, the merge operation
774             will also convert runs with the C<w:caps> property, putting all letters
775             into uppercase and removing the property; this makes more merges possible.
776              
777              
778             =head3 replace
779              
780             $part->replace($pattern, $replacement, %replacement_args);
781              
782             Replaces all occurrences of C<$pattern> regex within the text nodes by the
783             given C<$replacement>. This is not exactly like a search-replace
784             operation performed within MsWord, because the search does not cross boundaries
785             of text nodes. In order to maximize the chances of successful replacements,
786             the L</cleanup_XML> method is automatically called before starting the operation.
787              
788             The argument C<$pattern> can be either a string or a reference to a regular expression.
789             It should not contain any capturing parentheses, because that would perturb text
790             splitting operations.
791              
792             The argument C<$replacement> can be either a fixed string, or a reference to
793             a callback subroutine that will be called for each match.
794              
795              
796             The C<< %replacement_args >> hash can be used to pass information to the callback
797             subroutine. That hash will be enriched with three entries :
798              
799             =over
800              
801             =item matched
802              
803             The string that has been matched by C<$pattern>.
804              
805             =item run
806              
807             The run object in which this text resides.
808              
809             =item xml_before
810              
811             The XML fragment (possibly empty) found before the matched text .
812              
813             =back
814              
815             The callback subroutine may return either plain text or structured XML.
816             See L<MsOffice::Word::Surgeon::Run/SYNOPSIS> for an example of a replacement callback.
817              
818             The following special keys within C<< %replacement_args >> are interpreted by the
819             C<replace()> method itself, and therefore are not passed to the callback subroutine :
820              
821             =over
822              
823             =item keep_xml_as_is
824              
825             if true, no call is made to the L</cleanup_XML> method before performing the replacements
826              
827             =item dont_overwrite_contents
828              
829             if true, the internal XML contents is not modified in place; the new XML after performing
830             replacements is merely returned to the caller.
831              
832             =item cleanup_args
833              
834             the argument should be an arrayref and will be passed to the L</cleanup_XML> method. This
835             is typically used as
836              
837             $part->replace($pattern, $replacement, cleanup_args => [no_caps => 1]);
838              
839             =back
840              
841              
842             =head3 replace_image
843              
844             $part->replace_image($image_alt_text, $image_PNG_content);
845              
846             Replaces an existing PNG image by a new image. All features of the old image will
847             be preserved (size, positioning, border, etc.) -- only the image itself will be
848             replaced. The C<$image_alt_text> must correspond to the I<alternative text> set in Word
849             for this image.
850              
851             This operation replaces a ZIP member within the C<.docx> file. If several XML
852             nodes refer to the I<same> ZIP member, i.e. if the same image is displayed at several
853             locations, the new image will appear at all locations, even if they do not have the
854             same alternative text -- unfortunately this module currently has no facility for
855             duplicating an existing image into separate instances. So if your intent is to only replace
856             one image, your original document should contain several distinct images, coming from
857             several distinct C<.PNG> file copies.
858              
859              
860             =head3 add_image
861              
862             my $rId = $part->add_image($image_PNG_content);
863              
864             Stores the given PNG image within the ZIP file, adds it as a relationship to the
865             current part, and returns the relationship id. This operation is not sufficient
866             to make the image visible in Word : it just stores the image, but you still
867             have to insert a proper C<drawing> node in the contents XML, using the C<$rId>.
868             Future versions of this module may offer helper methods for that purpose;
869             currently it must be done by hand.
870              
871              
872             =head1 AUTHOR
873              
874             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
875              
876             =head1 COPYRIGHT AND LICENSE
877              
878             Copyright 2019-2023 by Laurent Dami.
879              
880             This library is free software; you can redistribute it and/or modify
881             it under the same terms as Perl itself.
882              
883              
884