File Coverage

blib/lib/MsOffice/Word/Template.pm
Criterion Covered Total %
statement 75 75 100.0
branch 8 10 80.0
condition 7 11 63.6
subroutine 12 12 100.0
pod 1 6 16.6
total 103 114 90.3


line stmt bran cond sub pod time code
1             use Moose;
2 2     2   2542 use MooseX::StrictConstructor;
  2         708285  
  2         11  
3 2     2   11938 use Carp qw(croak);
  2         46052  
  2         7  
4 2     2   14912 use HTML::Entities qw(decode_entities);
  2         4  
  2         174  
5 2     2   886 use MsOffice::Word::Surgeon 1.08;
  2         8793  
  2         157  
6 2     2   861  
  2         408183  
  2         78  
7             use namespace::clean -except => 'meta';
8 2     2   16  
  2         5  
  2         15  
9             our $VERSION = '1.02';
10              
11             # attributes for interacting with MsWord
12             has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1);
13             has 'data_color' => (is => 'ro', isa => 'Str', default => "yellow");
14             has 'control_color' => (is => 'ro', isa => 'Str', default => "green");
15             # see also BUILDARGS: the "docx" arg will be translated into "surgeon"
16              
17             # attributes for interacting with the chosen template engine
18             # Filled by default with values for the Template Toolkit (a.k.a TT2)
19             has 'start_tag' => (is => 'ro', isa => 'Str', default => "[% ");
20             has 'end_tag' => (is => 'ro', isa => 'Str', default => " %]");
21             has 'engine' => (is => 'ro', isa => 'CodeRef', default => sub {\&TT2_engine});
22             has 'engine_args' => (is => 'ro', isa => 'ArrayRef', default => sub {[]});
23              
24             # attributes constructed by the module -- not received through the constructor
25             has 'template_text' => (is => 'bare', isa => 'Str', init_arg => undef);
26             has 'engine_stash' => (is => 'bare', isa => 'HashRef', init_arg => undef,
27             clearer => 'clear_stash');
28              
29             my $XML_COMMENT_FOR_MARKING_DIRECTIVES = '<!--TEMPLATE_DIRECTIVE_ABOVE-->';
30              
31              
32              
33             #======================================================================
34             # BUILDING THE TEMPLATE
35             #======================================================================
36              
37              
38             # syntactic sugar for supporting ->new($surgeon) instead of ->new(surgeon => $surgeon)
39             around BUILDARGS => sub {
40             my $orig = shift;
41             my $class = shift;
42              
43             # if there is a unique arg without any keyword ...
44             if ( @_ == 1) {
45              
46             # if the unique arg is an instance of Surgeon, it's the "surgeon" parameter
47             unshift @_, 'surgeon' if $_[0]->isa('MsOffice::Word::Surgeon');
48              
49             # if the unique arg is a string, it's the "docx" parameter
50             unshift @_, 'docx' if $_[0] && !ref $_[0];
51             }
52              
53             # translate the "docx" parameter into a "surgeon" parameter
54             my %args = @_;
55             if (my $docx = delete $args{docx}) {
56             $args{surgeon} = MsOffice::Word::Surgeon->new(docx => $docx);
57             }
58              
59             # now call the regular Moose method
60             return $class->$orig(%args);
61             };
62              
63              
64             my ($self) = @_;
65              
66 1     1 0 1796 # assemble the template text and store it into the bare attribute
67             $self->{template_text} = $self->build_template_text;
68             }
69 1         6  
70              
71             my ($self) = @_;
72              
73             # start and end character sequences for a template fragment
74 1     1 0 2 my ($rx_start, $rx_end) = map quotemeta, $self->start_tag, $self->end_tag;
75              
76             # Regexes for extracting template directives within the XML.
77 1         31 # Such directives are identified through a specific XML comment -- this comment is
78             # inserted by method "template_fragment_for_run()" below.
79             # The (*SKIP) instructions are used to avoid backtracking after a
80             # closing tag for the subexpression has been found. Otherwise the
81             # .*? inside could possibly match across boundaries of the current
82             # XML node, we don't want that.
83              
84             # regex for matching directives to be treated outside the text flow.
85             my $regex_outside_text_flow = qr{
86             <w:r\b [^>]*> # start run node
87             (?: <w:rPr> .*? </w:rPr> (*SKIP) )? # optional run properties
88 1         48 <w:t\b [^>]*> # start text node
89             ($rx_start .*? $rx_end) (*SKIP) # template directive
90             $XML_COMMENT_FOR_MARKING_DIRECTIVES # specific XML comment
91             </w:t> # close text node
92             </w:r> # close run node
93             }sx;
94              
95             # regex for matching paragraphs that contain only a directive
96             my $regex_paragraph = qr{
97             <w:p\b [^>]*> # start paragraph node
98             (?: <w:pPr> .*? </w:pPr> (*SKIP) )? # optional paragraph properties
99 1         44 $regex_outside_text_flow
100             </w:p> # close paragraph node
101             }sx;
102              
103             # regex for matching table rows that contain only a directive in the first cell
104             my $regex_row = qr{
105             <w:tr\b [^>]*> # start row node
106             <w:tc\b [^>]*> # start cell node
107 1         46 (?:<w:tcPr> .*? </w:tcPr> (*SKIP) )? # cell properties
108             $regex_paragraph # paragraph in cell
109             </w:tc> # close cell node
110             (?:<w:tc> .*? </w:tc> (*SKIP) )* # ignore other cells on the same row
111             </w:tr> # close row node
112             }sx;
113              
114             # assemble template fragments from all runs in the document into a global template text
115             $self->surgeon->cleanup_XML;
116             my @template_fragments = map {$self->template_fragment_for_run($_)} @{$self->surgeon->runs};
117             my $template_text = join "", @template_fragments;
118 1         31  
119 1         479668 # remove markup around directives, successively for table rows, for paragraphs, and finally
  61         107456  
  1         32  
120 1         61 # for remaining directives embedded within text runs.
121             $template_text =~ s/$_/$1/g for $regex_row, $regex_paragraph, $regex_outside_text_flow;
122              
123             return $template_text;
124 1         255 }
125              
126 1         28  
127             my ($self, $run) = @_;
128              
129             my $props = $run->props;
130             my $data_color = $self->data_color;
131 61     61 0 82 my $control_color = $self->control_color;
132              
133 61         1136 # if this run is highlighted in yellow or green, it must be translated into a template directive
134 61         1416 if ($props =~ s{<w:highlight w:val="($data_color|$control_color)"/>}{}) {
135 61         1225 my $color = $1;
136             my $xml = $run->xml_before;
137              
138 61 100       239 my $inner_texts = $run->inner_texts;
139 32         50 if (@$inner_texts) {
140 32         611 $xml .= "<w:r>"; # opening XML tag for run node
141             $xml .= "<w:rPr>" . $props . "</w:rPr>" if $props; # optional run properties
142 32         728 $xml .= "<w:t>"; # opening XML tag for text node
143 32 50       184 $xml .= $self->start_tag; # start a template directive
144 32         52 foreach my $inner_text (@$inner_texts) { # loop over text nodes
145 32 100       51 my $txt = decode_entities($inner_text->literal_text); # just take inner literal text
146 32         43 $xml .= $txt . "\n";
147 32         687 # NOTE : adding "\n" because the template parser may need them for identifying end of comments
148 32         48 }
149 34         665  
150 34         250 $xml .= $self->end_tag; # end of template directive
151             $xml .= $XML_COMMENT_FOR_MARKING_DIRECTIVES
152             if $color eq $control_color; # XML comment for marking
153             $xml .= "</w:t>"; # closing XML tag for text node
154 32         636 $xml .= "</w:r>"; # closing XML tag for run node
155 32 100       66 }
156              
157 32         44 return $xml;
158 32         40 }
159              
160             # otherwise this run is just regular MsWord content
161 32         68 else {
162             return $run->as_xml;
163             }
164             }
165              
166 29         55  
167              
168             #======================================================================
169             # PROCESSING THE TEMPLATE
170             #======================================================================
171              
172             my ($self, $vars) = @_;
173              
174             # process the template to generate new XML
175             my $engine = $self->engine;
176             my $new_XML = $self->$engine($vars);
177 2     2 1 2046  
178             # insert the generated output into a new MsWord document; other zip members
179             # are cloned from the original template
180 2         58 my $new_doc = $self->surgeon->meta->clone_object($self->surgeon);
181 2         7 $new_doc->contents($new_XML);
182              
183             return $new_doc;
184             }
185 2         60  
186 2         642  
187             #======================================================================
188 2         131 # DEFAULT ENGINE : TEMPLATE TOOLKIT, a.k.a. TT2
189             #======================================================================
190              
191             # arbitrary value for the first bookmark id. 100 should most often be above other
192             # bookmarks generated by Word itself. TODO : would be better to find the highest
193             # id number really used in the template
194             my $first_bookmark_id = 100;
195              
196             # precompiled blocks as facilities to be used within templates
197             my %precompiled_blocks = (
198              
199             # a wrapper block for inserting a Word bookmark
200             bookmark => sub {
201             my $context = shift;
202             my $stash = $context->stash;
203              
204             # assemble xml markup
205             my $bookmark_id = $stash->get('global.bookmark_id') || $first_bookmark_id;
206             my $name = fix_bookmark_name($stash->get('name') || 'anonymous_bookmark');
207              
208             my $xml = qq{<w:bookmarkStart w:id="$bookmark_id" w:name="$name"/>}
209             . $stash->get('content') # content of the wrapper
210             . qq{<w:bookmarkEnd w:id="$bookmark_id"/>};
211              
212             # next bookmark will need a fresh id
213             $stash->set('global.bookmark_id', $bookmark_id+1);
214              
215             return $xml;
216             },
217              
218             # a wrapper block for linking to a bookmark
219             link_to_bookmark => sub {
220             my $context = shift;
221             my $stash = $context->stash;
222              
223             # assemble xml markup
224             my $name = fix_bookmark_name($stash->get('name') || 'anonymous_bookmark');
225             my $content = $stash->get('content');
226             my $tooltip = $stash->get('tooltip');
227             if ($tooltip) {
228             # TODO: escap quotes
229             $tooltip = qq{ w:tooltip="$tooltip"};
230             }
231             my $xml = qq{<w:hyperlink w:anchor="$name"$tooltip>$content</w:hyperlink>};
232              
233             return $xml;
234             },
235              
236             # a block for generating a Word field. Can also be used as wrapper.
237             field => sub {
238             my $context = shift;
239             my $stash = $context->stash;
240             my $code = $stash->get('code'); # field code, including possible flags
241             my $text = $stash->get('content'); # initial text content (before updating the field)
242              
243             my $xml = qq{<w:r><w:fldChar w:fldCharType="begin"/></w:r>}
244             . qq{<w:r><w:instrText xml:space="preserve"> $code </w:instrText></w:r>};
245             $xml .= qq{<w:r><w:fldChar w:fldCharType="separate"/></w:r>$text} if $text;
246             $xml .= qq{<w:r><w:fldChar w:fldCharType="end"/></w:r>};
247              
248             return $xml;
249             },
250              
251             );
252              
253              
254              
255              
256             my ($self, $vars) = @_;
257              
258             require Template::AutoFilter; # a subclass of Template that adds automatic html filtering
259              
260              
261             # assemble args to be passed to the constructor
262 2     2 0 4 my %TT2_args = @{$self->engine_args};
263             $TT2_args{BLOCKS}{$_} //= $precompiled_blocks{$_} for keys %precompiled_blocks;
264 2         416  
265              
266             # at the first invocation, create a TT2 compiled template and store it in the stash.
267             # Further invocations just reuse the TT2 object in stash.
268 2         45991 my $stash = $self->{engine_stash} //= {};
  2         64  
269 2   33     23 $stash->{TT2} //= Template::AutoFilter->new(\%TT2_args);
270             $stash->{compiled_template} //= $stash->{TT2}->template(\$self->{template_text});
271              
272             # generate new XML by invoking the template on $vars
273             my $new_XML = $stash->{TT2}->context->process($stash->{compiled_template}, $vars);
274 2   100     10  
275 2   66     10 return $new_XML;
276 2   66     17122 }
277              
278              
279 2         21994 #======================================================================
280             # UTILITY ROUTINES (not methods)
281 2         99 #======================================================================
282              
283              
284             my $name = shift;
285              
286             # see https://stackoverflow.com/questions/852922/what-are-the-limitations-for-bookmark-names-in-microsoft-word
287              
288             $name =~ s/[^\w_]+/_/g; # only digits, letters or underscores
289             $name =~ s/^(\d)/_$1/; # cannot start with a digit
290             $name = substr($name, 0, 40) if length($name) > 40; # max 40 characters long
291 16     16 0 20  
292             return $name;
293             }
294              
295 16         27  
296 16         25 1;
297 16 50       29  
298              
299 16         20 =encoding ISO-8859-1
300              
301             =head1 NAME
302              
303             MsOffice::Word::Template - generate Microsoft Word documents from Word templates
304              
305             =head1 SYNOPSIS
306              
307             my $template = MsOffice::Word::Template->new($filename);
308             my $new_doc = $template->process(\%data);
309             $new_doc->save_as($path_for_new_doc);
310              
311             =head1 DESCRIPTION
312              
313             =head2 Purpose
314              
315             This module treats a Microsoft Word document as a template for generating other documents. The idea is
316             similar to the "mail merge" functionality in Word, but with much richer possibilities, because the
317             whole power of a Perl templating engine can be exploited, for example for
318              
319             =over
320              
321             =item *
322              
323             dealing with complex, nested datastructures
324              
325             =item *
326              
327             using control directives for loops, conditionals, subroutines, etc.
328              
329             =back
330              
331              
332             Template authors just have to use the highlighing function in MsWord to
333             mark the templating directives :
334              
335             =over
336              
337             =item *
338              
339             fragments highlighted in B<yelllow> are interpreted as I<data>
340             directives, i.e. the template result will be inserted at that point in
341             the document, keeping the current formatting properties (bold, italic,
342             font, etc.).
343              
344             =item *
345              
346             fragments highlighted in B<green> are interpreted as I<control>
347             directives that do not directly generate content, like loops, conditionals,
348             etc. Paragraphs or table rows around such directives are dismissed,
349             in order to avoid empty paragraphs or empty rows in the resulting document.
350              
351             =back
352              
353             The syntax of data and control directives depends on the backend
354             templating engine. The default engine is the L<Perl Template Toolkit|Template>;
355             other engines can be specified through parameters
356             to the L</new> method -- see the L</TEMPLATE ENGINE> section below.
357              
358              
359             =head2 Status
360              
361             This first release is a proof of concept. Some simple templates have
362             been successfully tried; however it is likely that a number of
363             improvements will have to be made before this system can be used at
364             large scale in production. If you use this module, please keep me
365             informed of your difficulties, tricks, suggestions, etc.
366              
367              
368             =head1 METHODS
369              
370             =head2 new
371              
372             my $template = MsOffice::Word::Template->new($docx);
373             # or : my $template = MsOffice::Word::Template->new($surgeon); # an instance of MsOffice::Word::Surgeon
374             # or : my $template = MsOffice::Word::Template->new(docx => $docx, %options);
375              
376             In its simplest form, the constructor takes a single argument which
377             is either a string (path to a F<docx> document), or an instance of
378             L<MsOffice::Word::Surgeon>. Otherwise the constructor takes a list of named parameters,
379             which can be
380              
381              
382             =over
383              
384             =item docx
385              
386             path to a MsWord document in F<docx> format. This will automatically create
387             an instance of L<MsOffice::Word::Surgeon> and pass it to the constructor
388             through the C<surgeon> keyword.
389              
390             =item surgeon
391              
392             an instance of L<MsOffice::Word::Surgeon>. This is a mandatory parameter, either
393             directly through the C<surgeon> keyword, or indirectly through the C<docx> keyword.
394              
395             =item data_color
396              
397             the Word highlight color for marking data directives (default : yellow)
398              
399             =item control_color
400              
401             the Word highlight color for marking control directives (default : green).
402             Such directives should produce no content. They are treated outside of the regular text flow.
403              
404             =back
405              
406             In addition to the attributes above, other attributes can be passed to the
407             constructor for specifying a templating engine different from the
408             default L<Perl Template Toolkit|Template>.
409             These are described in section L</TEMPLATE ENGINE> below.
410              
411              
412             =head2 process
413              
414             my $new_doc = $template->process(\%data);
415             $new_doc->save_as($path_for_new_doc);
416              
417             Process the template on a given data tree, and return a new document
418             (actually, a new instance of L<MsOffice::Word::Surgeon>).
419             That document can then be saved using L<MsOffice::Word::Surgeon/save_as>.
420              
421              
422             =head1 AUTHORING TEMPLATES
423              
424             A template is just a regular Word document, in which the highlighted
425             fragments represent templating directives.
426              
427             The data directives, i.e. the "holes" to be filled must be highlighted
428             in B<yellow>. Such zones must contain the names of variables to fill the
429             holes. If the template engine supports it, names of variables can be paths
430             into a complex datastructure, with dots separating the levels, like
431             C<foo.3.bar.-1> -- see L<Template::Manual::Directive/GET> and
432             L<Template::Manual::Variables> if you are using the Template Toolkit.
433              
434             Control directives like C<IF>, C<FOREACH>, etc. must be highlighted in
435             B<green>. When seeing a green zone, the system will remove XML markup for
436             the surrounding text and run nodes. If the directive is the only content
437             of the paragraph, then the paragraph node is also removed. If this
438             occurs within the first cell of a table row, the markup for that row is also
439             removed. This mechanism ensures that the final result will not contain
440             empty paragraphs or empty rows at places corresponding to control directives.
441              
442             In consequence of this distinction between yellow and green
443             highlights, templating zones cannot mix data directives with control
444             directives : a data directive within a green zone would generate output
445             outside of the regular XML flow (paragraph nodes, run nodes and text
446             nodes), and therefore MsWord would generate an error when trying to
447             open such content. There is a workaround, however : data directives
448             within a green zone will work if they I<also generate the appropriate markup>
449             for paragraph nodes, run nodes and text nodes; but in that case you must
450             also apply the "none" filter from L<Template::AutoFilter> so that
451             angle brackets in XML markup do not get translated into HTML entities.
452              
453              
454             =head1 TEMPLATE ENGINE
455              
456             This module invokes a backend I<templating engine> for interpreting the
457             template directives. In order to use an engine different from the default
458             L<Template Toolkit|Template>, you must supply the following parameters
459             to the L</new> method :
460              
461             =over
462              
463             =item start_tag
464              
465             The string for identifying the start of a template directive
466              
467             =item end_tag
468              
469             The string for identifying the end of a template directive
470              
471             =item engine
472              
473             A reference to a method that will perform the templating operation (explained below)
474              
475             =item engine_args
476              
477             An optional list of parameters that may be used by the engine
478              
479             =back
480              
481             Given a datatree in C<$vars>, the engine will be called as :
482              
483             my $engine = $self->engine;
484             my $new_XML = $self->$engine($vars);
485              
486             It is up to the engine method to exploit C<< $self->engine_args >> if needed.
487              
488             If the engine is called repetively, it may need to store some data to be
489             persistent between two calls, like for example a compiled version of the
490             parsed template. To this end, there is an internal hashref attribute
491             called C<engine_stash>. If necessary the stash can be cleared through
492             the C<clear_stash> method.
493              
494             Here is an example using L<Template::Mustache> :
495              
496             my $template = MsOffice::Word::Template->new(
497             docx => $template_file,
498             start_tag => "{{",
499             end_tag => "}}",
500             engine => sub {
501             my ($self, $vars) = @_;
502              
503             # at the first invocation, create a Mustache compiled template and store it in the stash.
504             # Further invocations will just reuse the object in stash.
505             my $stash = $self->{engine_stash} //= {};
506             $stash->{mustache} //= Template::Mustache->new(
507             template => $self->{template_text},
508             @{$self->engine_args}, # for ex. partials, partial_path, context
509             # -- see L<Template::Mustache> documentation
510             );
511              
512             # generate new XML by invoking the template on $vars
513             my $new_XML = $stash->{mustache}->render($vars);
514              
515             return $new_XML;
516             },
517             );
518              
519             The engine must make sure that ampersand characters and angle brackets
520             are automatically replaced by the corresponding HTML entities
521             (otherwise the resulting XML would be incorrect and could not be
522             opened by Microsoft Word). The Mustache engine does this
523             automatically. The Template Toolkit would normally require to
524             explicitly add an C<html> filter at each directive :
525              
526             [% foo.bar | html %]
527              
528             but thanks to the L<Template::AutoFilter>
529             module, this is performed automatically.
530              
531              
532             =head1 AUTHORING NOTES SPECIFIC TO THE TEMPLATE TOOLKIT
533              
534             This chapter just gives a few hints for authoring Word templates with the
535             Template Toolkit.
536              
537             The examples below use [[double square brackets]] to indicate
538             segments that should be highlighted in B<green> within the Word template.
539              
540              
541             =head2 Bookmarks
542              
543             The template processor is instantiated with a predefined wrapper named C<bookmark>
544             for generating Word bookmarks. Here is an example:
545              
546             Here is a paragraph with [[WRAPPER bookmark name="my_bookmark"]]bookmarked text[[END]].
547              
548             The C<name> argument is automatically truncated to 40 characters, and non-alphanumeric
549             characters are replaced by underscores, in order to comply with the limitations imposed by Word
550             for bookmark names.
551              
552             =head2 Internal hyperlinks
553              
554             Similarly, there is a predefined wrapper named C<link_to_bookmark> for generating
555             hyperlinks to bookmarks. Here is an example:
556              
557             Click [[WRAPPER link_to_bookmark name="my_bookmark" tooltip="tip top"]]here[[END]].
558              
559             The C<tooltip> argument is optional.
560              
561             =head2 Word fields
562              
563             A predefined block C<field> generates XML markup for Word fields, like for example :
564              
565             Today is [[PROCESS field code="DATE \\@ \"h:mm am/pm, dddd, MMMM d\""]]
566              
567             Beware that quotes or backslashes must be escaped so that the Template Toolkit parser
568             does not interpret these characters.
569              
570             The list of Word field codes is documented at
571             L<https://support.microsoft.com/en-us/office/list-of-field-codes-in-word-1ad6d91a-55a7-4a8d-b535-cf7888659a51>.
572              
573             When used as a wrapper, the C<field> block generates a Word field with alternative
574             text content, displayed before the field gets updated. For example :
575              
576             [[WRAPPER field code="TOC \o \"1-3\" \h \z \u"]]Table of contents – press F9 to update[[END]]
577              
578              
579             =head1 TROUBLESHOOTING
580              
581             If the document generated by this module cannot open in Word, it is probably because the XML
582             generated by your template is not equilibrated and therefore not valid.
583             For example a template like this :
584              
585             This paragraph [[ IF condition ]]
586             may have problems
587             [[END]]
588              
589             is likely to generate incorrect XML, because the IF statement starts in the middle
590             of a paragraph and closes at a different paragraph -- therefore when the I<condition>
591             evaluates to false, the XML tag for closing the initial paragraph will be missing.
592              
593             Compound directives like IF .. END, FOREACH .. END, TRY .. CATCH .. END should therefore
594             be equilibrated, either all within the same paragraph, or each directive on a separate
595             paragraph. Examples like this should be successful :
596              
597             This paragraph [[ IF condition ]]has an optional part[[ ELSE ]]or an alternative[[ END ]].
598            
599             [[ SWITCH result ]]
600             [[ CASE 123 ]]
601             Not a big deal.
602             [[ CASE 789 ]]
603             You won the lottery.
604             [[ END ]]
605              
606              
607              
608             =head1 AUTHOR
609              
610             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
611              
612             =head1 COPYRIGHT AND LICENSE
613              
614             Copyright 2020-2022 by Laurent Dami.
615              
616             This library is free software; you can redistribute it and/or modify
617             it under the same terms as Perl itself.
618              
619