File Coverage

blib/lib/MsOffice/Word/Template.pm
Criterion Covered Total %
statement 74 74 100.0
branch 10 14 71.4
condition 0 2 0.0
subroutine 11 11 100.0
pod 1 1 100.0
total 96 102 94.1


line stmt bran cond sub pod time code
1             use 5.024;
2 2     2   2232 use Moose;
  2         12  
3 2     2   974 use MooseX::StrictConstructor;
  2         867847  
  2         11  
4 2     2   14023 use Carp qw(croak);
  2         55794  
  2         8  
5 2     2   18133 use HTML::Entities qw(decode_entities);
  2         4  
  2         156  
6 2     2   1120 use MsOffice::Word::Surgeon 2.0;
  2         10647  
  2         215  
7 2     2   1115  
  2         646757  
  2         172  
8             # syntactic sugar for attributes
9              
10             use namespace::clean -except => 'meta';
11              
12 2     2   18 our $VERSION = '2.0';
  2         4  
  2         17  
13              
14             #======================================================================
15             # ATTRIBUTES
16             #======================================================================
17              
18             # constructor attributes for interacting with MsWord
19             # See also BUILDARGS: the constructor can also take a "docx" arg
20             # that will be automatically translated into a "surgeon" attribute
21             has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1);
22             has 'data_color' => (is => 'ro', isa => 'Str', default => "yellow");
23             has 'control_color' => (is => 'ro', isa => 'Str', default => "green");
24             has 'part_names' => (is => 'ro', isa => 'ArrayRef[Str]', lazy => 1,
25             default => sub {[keys shift->surgeon->parts->%*]});
26              
27             # constructor attributes for building a templating engine
28             has 'engine_class' => (is => 'ro', isa => 'Str', default => 'TT2');
29             has 'engine_args' => (is => 'ro', isa => 'ArrayRef', default => sub {[]});
30              
31             # attributes lazily constructed by the module -- not received through the constructor
32             has_inner 'engine' => (is => 'ro', isa => 'MsOffice::Word::Template::Engine');
33              
34             #======================================================================
35             # GLOBALS
36             #======================================================================
37              
38             my $XML_COMMENT_FOR_MARKING_DIRECTIVES = '<!--TEMPLATE_DIRECTIVE_ABOVE-->';
39              
40              
41             #======================================================================
42             # BUILDING INSTANCES
43             #======================================================================
44              
45             # Syntactic sugar for the constructor : accepts ->new($surgeon) or ->new($docx_path) in addition to named args, and turns a "docx" arg into a "surgeon"
46             around BUILDARGS => sub {
47             my $orig = shift;
48             my $class = shift;
49              
50             # if there is a unique arg without any keyword ...
51             if ( @_ == 1) {
52             my $is_ref = ref $_[0]; # tested once, before a keyword gets inserted in front of the arg
53             # if the unique arg is an instance of Surgeon, it's the "surgeon" parameter
54             unshift @_, 'surgeon' if $is_ref && $_[0]->isa('MsOffice::Word::Surgeon');
55             # NOTE : $_[0] must not be tested again as a string here -- it may now hold the 'surgeon' keyword
56             # if the unique arg is a non-empty string, it's the "docx" parameter
57             unshift @_, 'docx' if !$is_ref && $_[0];
58             }
59              
60             # translate the "docx" parameter into a "surgeon" parameter
61             my %args = @_;
62             if (my $docx = delete $args{docx}) {
63             $args{surgeon} = MsOffice::Word::Surgeon->new(docx => $docx);
64             }
65              
66             # now call the regular Moose method
67             return $class->$orig(%args);
68             };
69              
70              
71             #======================================================================
72             # LAZY ATTRIBUTE CONSTRUCTORS
73             #======================================================================
74              
75              
76             my ($self) = @_;
77              
78             # instantiate the templating engine
79 1     1   3 my $engine_class = $self->engine_class;
80             my $engine;
81             my @load_errors;
82 1         27 CLASS:
83 1         2 for my $class ("MsOffice::Word::Template::Engine::$engine_class", $engine_class) {
84             eval "require $class; 1" or push @load_errors, $@ and next CLASS;
85             $engine = $class->new($self->engine_args->@*) and last CLASS;
86 1         4 }
87 1 50 0     75 $engine or die "could not load engine class '$engine_class'", @load_errors;
88 1 50       30  
89             # compile regexes based on the start/end tags
90 1 50       12 my ($start_tag, $end_tag) = ($engine->start_tag, $engine->end_tag);
91             my @xml_regexes = $self->_xml_regexes($start_tag, $end_tag);
92              
93 1         31 # tell the engine to build a compiled template for each document part
94 1         23 foreach my $part_name ($self->part_names->@*) {
95             my $part = $self->surgeon->part($part_name);
96              
97 1         53 # assemble template fragments from all runs in the part into a global template text
98 7         30998 $part->cleanup_XML;
99             my @template_fragments = map {$self->_template_fragment_for_run($_, $start_tag, $end_tag)}
100             $part->runs->@*;
101 7         131 my $template_text = join "", @template_fragments;
102 7         571700  
  73         132983  
103             # remove markup around directives, successively for table rows, for paragraphs, and finally
104 7         413 # for remaining directives embedded within text runs.
105             $template_text =~ s/$_/$1/g foreach @xml_regexes;
106              
107             # compile and store the template
108 7         508 $engine->compile_template($part_name => $template_text);
109             }
110              
111 7         26 return $engine;
112             }
113              
114 1         1601  
115              
116             #======================================================================
117             # UTILITY METHODS
118             #======================================================================
119              
120              
121              
122             my ($self, $run, $start_tag, $end_tag) = @_;
123              
124             my $props = $run->props;
125             my $data_color = $self->data_color;
126 73     73   118 my $control_color = $self->control_color;
127              
128 73         1622 # if this run is highlighted in data or control color, it must be translated into a template directive
129 73         1944 if ($props =~ s{<w:highlight w:val="($data_color|$control_color)"/>}{}) {
130 73         1663 my $color = $1;
131             my $xml = $run->xml_before;
132              
133 73 100       331 # re-build the run, removing the highlight, and adding the start/end tags for the template engine
134 35         64 my $inner_texts = $run->inner_texts;
135 35         758 if (@$inner_texts) {
136             $xml .= "<w:r>"; # opening XML tag for run node
137             $xml .= "<w:rPr>" . $props . "</w:rPr>" if $props; # optional run properties
138 35         872 $xml .= "<w:t>"; # opening XML tag for text node
139 35 50       192 $xml .= $start_tag; # start a template directive
140 35         75 foreach my $inner_text (@$inner_texts) { # loop over text nodes
141 35 100       65 my $txt = decode_entities($inner_text->literal_text); # just take inner literal text
142 35         52 $xml .= $txt . "\n";
143 35         47 # NOTE : adding "\n" because the template parser may need them for identifying end of comments
144 35         58 }
145 37         806  
146 37         298 $xml .= $end_tag; # end of template directive
147             $xml .= $XML_COMMENT_FOR_MARKING_DIRECTIVES
148             if $color eq $control_color; # XML comment for marking
149             $xml .= "</w:t>"; # closing XML tag for text node
150 35         49 $xml .= "</w:r>"; # closing XML tag for run node
151 35 100       83 }
152              
153 35         52 return $xml;
154 35         51 }
155              
156             # otherwise this run is just regular MsWord content
157 35         88 else {
158             return $run->as_xml;
159             }
160             }
161              
162 38         88  
163              
164              
165             my ($self, $start_tag, $end_tag) = @_;
166              
167             # start and end character sequences for a template fragment
168             my $rx_start = quotemeta $start_tag;
169             my $rx_end = quotemeta $end_tag;
170 1     1   3  
171             # Regexes for extracting template directives within the XML.
172             # Such directives are identified through a specific XML comment -- this comment is
173 1         3 # inserted by method "_template_fragment_for_run()" above.
174 1         2 # The (*SKIP) instructions are used to avoid backtracking after a
175             # closing tag for the subexpression has been found. Otherwise the
176             # .*? inside could possibly match across boundaries of the current
177             # XML node, we don't want that.
178              
179             # regex for matching directives to be treated outside the text flow.
180             my $rx_outside_text_flow = qr{
181             <w:r\b [^>]*> # start run node
182             (?: <w:rPr> .*? </w:rPr> (*SKIP) )? # optional run properties
183             <w:t\b [^>]*> # start text node
184             ($rx_start .*? $rx_end) (*SKIP) # template directive
185 1         37 $XML_COMMENT_FOR_MARKING_DIRECTIVES # specific XML comment
186             </w:t> # close text node
187             </w:r> # close run node
188             }sx;
189              
190             # regex for matching paragraphs that contain only a directive
191             my $rx_paragraph = qr{
192             <w:p\b [^>]*> # start paragraph node
193             (?: <w:pPr> .*? </w:pPr> (*SKIP) )? # optional paragraph properties
194             $rx_outside_text_flow
195             </w:p> # close paragraph node
196 1         33 }sx;
197              
198             # regex for matching table rows that contain only a directive in the first cell
199             my $rx_row = qr{
200             <w:tr\b [^>]*> # start row node
201             <w:tc\b [^>]*> # start cell node
202             (?:<w:tcPr> .*? </w:tcPr> (*SKIP) )? # cell properties
203             $rx_paragraph # paragraph in cell
204 1         40 </w:tc> # close cell node
205             (?:<w:tc> .*? </w:tc> (*SKIP) )* # ignore other cells on the same row
206             </w:tr> # close row node
207             }sx;
208              
209             return ($rx_row, $rx_paragraph, $rx_outside_text_flow);
210             # Note : the order is important
211             }
212              
213              
214 1         5  
215              
216              
217             #======================================================================
218             # PROCESSING THE TEMPLATE
219             #======================================================================
220              
221             my ($self, $vars) = @_;
222              
223             # create a clone of the original
224             my $new_doc = $self->surgeon->clone;
225              
226             foreach my $part_name ($self->part_names->@*) {
227 2     2 1 3023 my $new_doc_part = $new_doc->part($part_name);
228             my $new_contents = $self->engine->process($part_name, $new_doc_part, $vars);
229             $new_doc_part->contents($new_contents);
230 2         60 }
231              
232 2         7027 return $new_doc;
233 14         1218 }
234 14         23894  
235 14         407  
236             1;
237              
238 2         153  
239             =encoding ISO-8859-1
240              
241             =head1 NAME
242              
243             MsOffice::Word::Template - generate Microsoft Word documents from Word templates
244              
245             =head1 SYNOPSIS
246              
247             my $template = MsOffice::Word::Template->new($filename);
248             my $new_doc = $template->process(\%data);
249             $new_doc->save_as($path_for_new_doc);
250              
251             =head1 DESCRIPTION
252              
253             =head2 Purpose
254              
255             This module treats a Microsoft Word document as a template for generating other documents. The idea is
256             similar to the "mail merge" functionality in Word, but with much richer possibilities, because the
257             whole power of a Perl templating engine can be exploited, for example for
258              
259             =over
260              
261             =item *
262              
263             dealing with complex, nested datastructures
264              
265             =item *
266              
267             using control directives for loops, conditionals, subroutines, etc.
268              
269             =back
270              
271              
272             Template authors just use the highlighting function in MsWord to
273             mark the templating directives :
274              
275             =over
276              
277             =item *
278              
279             fragments highlighted in B<yellow> are interpreted as I<data>
280             directives, i.e. the template result will be inserted at that point in
281             the document, keeping the current formatting properties (bold, italic,
282             font, etc.).
283              
284             =item *
285              
286             fragments highlighted in B<green> are interpreted as I<control>
287             directives that do not directly generate content, like loops, conditionals,
288             etc. Paragraphs or table rows around such directives are dismissed,
289             in order to avoid empty paragraphs or empty rows in the resulting document.
290              
291             =back
292              
293             The syntax of data and control directives depends on the backend
294             templating engine. The default engine is the L<Perl Template Toolkit|Template>;
295             other engines can be specified as subclasses -- see the L</TEMPLATE ENGINE> section below.
296              
297              
298             =head2 Status
299              
300             This second release is a major refactoring of the first version, together with
301             a refactoring of L<MsOffice::Word::Surgeon>. New features include support
302             for headers and footers and for image insertion. The internal object-oriented
303             structure has been redesigned.
304              
305             This module has been used successfully for a pilot project in my organization,
306             generating quite complex documents from deeply nested datastructures.
307             However, it has not yet been used at large scale in production, so it is quite likely
308             that some teething problems remain to be discovered.
309             If you use this module, please keep me
310             informed of your difficulties, tricks, suggestions, etc.
311              
312              
313             =head1 METHODS
314              
315             =head2 new
316              
317             my $template = MsOffice::Word::Template->new($docx);
318             # or : my $template = MsOffice::Word::Template->new($surgeon); # an instance of MsOffice::Word::Surgeon
319             # or : my $template = MsOffice::Word::Template->new(docx => $docx, %options);
320              
321             In its simplest form, the constructor takes a single argument which
322             is either a string (path to a F<docx> document), or an instance of
323             L<MsOffice::Word::Surgeon>. Otherwise the constructor takes a list of named parameters,
324             which can be
325              
326              
327             =over
328              
329             =item docx
330              
331             path to a MsWord document in F<docx> format. This will automatically create
332             an instance of L<MsOffice::Word::Surgeon> and pass it to the constructor
333             through the C<surgeon> keyword.
334              
335             =item surgeon
336              
337             an instance of L<MsOffice::Word::Surgeon>. This is a mandatory parameter, either
338             directly through the C<surgeon> keyword, or indirectly through the C<docx> keyword.
339              
340             =item data_color
341              
342             the Word highlight color for marking data directives (default : yellow)
343              
344             =item control_color
345              
346             the Word highlight color for marking control directives (default : green).
347             Such directives should produce no content. They are treated outside of the regular text flow.
348              
349             =back
350              
351             In addition to the attributes above, other attributes can be passed to the
352             constructor for specifying a templating engine different from the
353             default L<Perl Template Toolkit|Template>.
354             These are described in section L</TEMPLATE ENGINE> below.
355              
356              
357             =head2 process
358              
359             my $new_doc = $template->process(\%data);
360             $new_doc->save_as($path_for_new_doc);
361              
362             Processes the template on a given data tree, and returns a new document
363             (actually, a new instance of L<MsOffice::Word::Surgeon>).
364             That document can then be saved using L<MsOffice::Word::Surgeon/save_as>.
365              
366              
367             =head1 AUTHORING TEMPLATES
368              
369             A template is just a regular Word document, in which the highlighted
370             fragments represent templating directives.
371              
372             The data directives, i.e. the "holes" to be filled must be highlighted
373             in B<yellow>. Such zones must contain the names of variables to fill the
374             holes. If the template engine supports it, names of variables can be paths
375             into a complex datastructure, with dots separating the levels, like
376             C<foo.3.bar.-1> -- see L<Template::Manual::Directive/GET> and
377             L<Template::Manual::Variables> if you are using the Template Toolkit.
378              
379             Control directives like C<IF>, C<FOREACH>, etc. must be highlighted in
380             B<green>. When seeing a green zone, the system will remove XML markup for
381             the surrounding text and run nodes. If the directive is the only content
382             of the paragraph, then the paragraph node is also removed. If this
383             occurs within the first cell of a table row, the markup for that row is also
384             removed. This mechanism ensures that the final result will not contain
385             empty paragraphs or empty rows at places corresponding to control directives.
386              
387             In consequence of this distinction between yellow and green
388             highlights, templating zones cannot mix data directives with control
389             directives : a data directive within a green zone would generate output
390             outside of the regular XML flow (paragraph nodes, run nodes and text
391             nodes), and therefore MsWord would generate an error when trying to
392             open such content. There is a workaround, however : data directives
393             within a green zone will work if they I<also generate the appropriate markup>
394             for paragraph nodes, run nodes and text nodes; but in that case you must
395             also apply the "none" filter from L<Template::AutoFilter> so that
396             angle brackets in XML markup do not get translated into HTML entities.
397              
398             See also L<MsOffice::Word::Template::Engine::TT2> for
399             additional advice on authoring templates based on the
400             L<Template Toolkit|Template>.
401              
402              
403              
404             =head1 TEMPLATE ENGINE
405              
406             This module invokes a backend I<templating engine> for interpreting the
407             template directives. The default engine is
408             L<MsOffice::Word::Template::Engine::TT2>, built on top of
409             L<Template Toolkit|Template>. Another engine supplied in this distribution is
410             L<MsOffice::Word::Template::Engine::Mustache>, mostly as an example.
411             To implement another engine, just subclass
412             L<MsOffice::Word::Template::Engine>.
413              
414             To use an engine different from the default, the following arguments
415             must be supplied to the L</new> method :
416              
417             =over
418              
419             =item engine_class
420              
421             The name of the engine class. If the class is within the L<MsOffice::Word::Template::Engine>
422             namespace, just the suffix is sufficient; otherwise, specify the fully qualified class name.
423              
424             =item engine_args
425              
426             An optional list of parameters that may be used for initializing the engine
427              
428             =back
429              
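430             The engine will get a C<compile_template> method call for each part in the
431             C<.docx> document (main document body, headers and footers).
432              
433             Given a datatree in C<$vars>, the main C<process()> method will then call
434             the engine's C<process()> method on each document part.
435              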
436             The engine must make sure that ampersand characters and angle brackets
437             are automatically replaced by the corresponding HTML entities
438             (otherwise the resulting XML would be incorrect and could not be
439             opened by Microsoft Word). The Mustache engine does this
440             automatically. The Template Toolkit would normally require
441             an explicit C<html> filter at each directive :
442              
443             [% foo.bar | html %]
444              
445             but thanks to the L<Template::AutoFilter>
446             module, this is performed automatically.
447              
448              
449              
450             This module invokes a backend I<templating engine> for interpreting the
451             template directives. The default engine is
452             L<MsOffice::Word::Template::Engine::TT2>, built on top of
453             L<Template Toolkit|Template>. Another engine supplied in this distribution is
454             L<MsOffice::Word::Template::Engine::Mustache>, mostly as an example.
455             To implement another engine, just subclass
456             L<MsOffice::Word::Template::Engine>.
457              
458             To use an engine different from the default, the following arguments
459             must be supplied to the L</new> method :
460              
461             =over
462              
463             =item engine_class
464              
465             The name of the engine class. If the class sits within the L<MsOffice::Word::Template::Engine>
466             namespace, just the suffix is sufficient; otherwise, specify the fully qualified class name.
467              
468             =item engine_args
469              
470             An optional list of parameters that may be used for initializing the engine
471              
472             =back
473              
474             After initialization the engine will receive a C<compile_template> method call for each part in the
475             C<.docx> document, i.e. not only the main document body, but also headers and footers.
476              
477             Then the main C<process()> method, given a datatree in C<$vars>, will call
478             the engine's C<process()> method on each document part.
479              
480             The engine must make sure that ampersand characters and angle brackets
481             are automatically replaced by the corresponding HTML entities
482             (otherwise the resulting XML would be incorrect and could not be
483             opened by Microsoft Word). The Mustache engine does this
484             automatically. The Template Toolkit would normally require
485             an explicit C<html> filter at each directive :
486              
487             [% foo.bar | html %]
488              
489             but thanks to the L<Template::AutoFilter>
490             module, this is performed automatically.
491              
492             =head1 TROUBLESHOOTING
493              
494             If a document generated by this module cannot open in Word, it is probably because the XML
495             generated by your template is not balanced and therefore not valid.
496             For example a template like this :
497              
498             This paragraph [[ IF condition ]]
499             may have problems
500             [[END]]
501              
502             is likely to generate incorrect XML, because the IF statement starts in the middle
503             of a paragraph and closes at a different paragraph -- therefore when the I<condition>
504             evaluates to false, the XML tag for closing the initial paragraph will be missing.
505              
506             Compound directives like IF .. END, FOREACH .. END, TRY .. CATCH .. END should therefore
507             be balanced, either all within the same paragraph, or each directive on a separate
508             paragraph. Examples like this should be successful :
509              
510             This paragraph [[ IF condition ]]has an optional part[[ ELSE ]]or an alternative[[ END ]].
511            
512             [[ SWITCH result ]]
513             [[ CASE 123 ]]
514             Not a big deal.
515             [[ CASE 789 ]]
516             You won the lottery.
517             [[ END ]]
518              
519              
520              
521             =head1 AUTHOR
522              
523             Laurent Dami, E<lt>dami AT cpan DOT orgE<gt>
524              
525             =head1 COPYRIGHT AND LICENSE
526              
527             Copyright 2020-2022 by Laurent Dami.
528              
529             This library is free software; you can redistribute it and/or modify
530             it under the same terms as Perl itself.
531              
532