File Coverage

blib/lib/MsOffice/Word/Template.pm
Criterion Covered Total %
statement 68 68 100.0
branch 7 8 87.5
condition 6 8 75.0
subroutine 11 11 100.0
pod 1 5 20.0
total 93 100 93.0


line stmt bran cond sub pod time code
1             package MsOffice::Word::Template;
2 2     2   2265 use Moose;
  2         968227  
  2         15  
3 2     2   16583 use MooseX::StrictConstructor;
  2         64066  
  2         10  
4 2     2   21013 use Carp qw(croak);
  2         7  
  2         145  
5 2     2   1335 use HTML::Entities qw(decode_entities);
  2         12888  
  2         152  
6 2     2   1253 use MsOffice::Word::Surgeon;
  2         518135  
  2         99  
7              
8 2     2   21 use namespace::clean -except => 'meta';
  2         5  
  2         12  
9              
10             our $VERSION = '1.01';
11              
12             # attributes for interacting with MsWord
13             has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1);
14             has 'data_color' => (is => 'ro', isa => 'Str', default => "yellow");
15             has 'control_color' => (is => 'ro', isa => 'Str', default => "green");
16             # see also BUILDARGS: the "docx" arg will be translated into "surgeon"
17              
18             # attributes for interacting with the chosen template engine
19             # Filled by default with values for the Template Toolkit (a.k.a TT2)
20             has 'start_tag' => (is => 'ro', isa => 'Str', default => "[% ");
21             has 'end_tag' => (is => 'ro', isa => 'Str', default => " %]");
22             has 'engine' => (is => 'ro', isa => 'CodeRef', default => sub {\&TT2_engine});
23             has 'engine_args' => (is => 'ro', isa => 'ArrayRef', default => sub {[]});
24              
25             # attributes constructed by the module -- not received through the constructor
26             has 'template_text' => (is => 'bare', isa => 'Str', init_arg => undef);
27             has 'engine_stash' => (is => 'bare', isa => 'HashRef', init_arg => undef,
28             clearer => 'clear_stash');
29              
30              
31             #======================================================================
32             # BUILDING THE TEMPLATE
33             #======================================================================
34              
35              
36             # syntactic sugar for supporting ->new($surgeon) instead of ->new(surgeon => $surgeon)
37             around BUILDARGS => sub {
38             my $orig = shift;
39             my $class = shift;
40              
41             # if there is a unique arg without any keyword ...
42             if ( @_ == 1) {
43              
44             # if the unique arg is an instance of Surgeon, it's the "surgeon" parameter
45             unshift @_, 'surgeon' if $_[0]->isa('MsOffice::Word::Surgeon');
46              
47             # if the unique arg is a string, it's the "docx" parameter
48             unshift @_, 'docx' if $_[0] && !ref $_[0];
49             }
50              
51             # translate the "docx" parameter into a "surgeon" parameter
52             my %args = @_;
53             if (my $docx = delete $args{docx}) {
54             $args{surgeon} = MsOffice::Word::Surgeon->new(docx => $docx);
55             }
56              
57             # now call the regular Moose method
58             return $class->$orig(%args);
59             };
60              
61              
62             sub BUILD {
63 1     1 0 2929 my ($self) = @_;
64              
65             # assemble the template text and store it into the bare attribute
66 1         4 $self->{template_text} = $self->build_template_text;
67             }
68              
69              
70             sub build_template_text {
71 1     1 0 3 my ($self) = @_;
72              
73             # start and end character sequences for a template fragment
74 1         35 my ($rx_start, $rx_end) = map quotemeta, $self->start_tag, $self->end_tag;
75              
76             # regex for matching paragraphs that contain directives to be treated outside the text flow.
77             # Such directives are identified through a specific XML comment -- this comment is
78             # inserted by method "template_fragment_for_run()" below.
79 1         58 my $regex_paragraph = qr{
80             <w:p [^>]*> # start paragraph node
81             (?: <w:pPr> .*? </w:pPr> (*SKIP) )? # optional paragraph properties
82             <w:r [^>]*> # start run node
83             <w:t [^>]*> # start text node
84             ($rx_start .*? $rx_end) (*SKIP) # template directive
85             <!--OUTSIDE_TEXT_FLOW--> # specific XML comment
86             </w:t> # close text node
87             </w:r> # close run node
88             </w:p> # close paragraph node
89             }sx;
90              
91             # regex for matching table rows that contain such paragraphs.
92 1         64 my $regex_row = qr{
93             <w:tr [^>]*> # start row node
94             <w:tc [^>]*> # start cell node
95             (?:<w:tcPr> .*? </w:tcPr> (*SKIP) )? # cell properties
96             $regex_paragraph # paragraph in cell
97             </w:tc> # close cell node
98             (?:<w:tc> .*? </w:tc> (*SKIP) )* # possibly other cells on the same row
99             </w:tr> # close row node
100             }sx;
101              
102             # NOTE : the (*SKIP) instructions in regexes above are used to avoid backtracking
103             # after a closing tag for the subexpression has been found. Otherwise the .*? inside
104             # could possibly match across boundaries of the current XML node, we don't want that.
105              
106             # assemble template fragments from all runs in the document into a global template text
107 1         41 $self->surgeon->cleanup_XML;
108 1         93843 my @template_fragments = map {$self->template_fragment_for_run($_)} @{$self->surgeon->runs};
  21         47923  
  1         37  
109 1         107 my $template_text = join "", @template_fragments;
110              
111             # remove markup for rows around directives
112 1         98 $template_text =~ s/$regex_row/$1/g;
113              
114             # remove markup for pagraphs around directives
115 1         81 $template_text =~ s/$regex_paragraph/$1/g;
116              
117 1         28 return $template_text;
118             }
119              
120              
121             sub template_fragment_for_run { # given an instance of Surgeon::Run, build a template fragment
122 21     21 0 43 my ($self, $run) = @_;
123              
124 21         535 my $props = $run->props;
125 21         683 my $data_color = $self->data_color;
126 21         581 my $control_color = $self->control_color;
127              
128             # if this run is highlighted in yellow or green, it must be translated into a template directive
129             # NOTE: the code below has much in common with Surgeon::Run::as_xml() -- maybe
130             # part of it could be shared in a future version
131 21 100       156 if ($props =~ s{<w:highlight w:val="($data_color|$control_color)"/>}{}) {
132 12         33 my $color = $1;
133 12         307 my $xml = $run->xml_before;
134              
135 12         364 my $inner_texts = $run->inner_texts;
136 12 50       87 if (@$inner_texts) {
137 12         32 $xml .= "<w:r>"; # opening XML tag for run node
138 12 100       29 $xml .= "<w:rPr>" . $props . "</w:rPr>" if $props; # optional run properties
139 12         24 $xml .= "<w:t>"; # opening XML tag for text node
140 12         362 $xml .= $self->start_tag; # start a template directive
141 12         26 foreach my $inner_text (@$inner_texts) {
142 14         370 my $txt = decode_entities($inner_text->literal_text);
143 14         150 $xml .= $txt . "\n";
144             # NOTE : adding "\n" because the template parser may need them for identifying end of comments
145             }
146              
147 12         325 $xml .= $self->end_tag; # end of template directive
148 12 100       49 $xml .= "<!--OUTSIDE_TEXT_FLOW-->" if $color eq $control_color; # XML comment for marking
149 12         24 $xml .= "</w:t>"; # closing XML tag for text node
150 12         24 $xml .= "</w:r>"; # closing XML tag for run node
151             }
152              
153 12         38 return $xml;
154             }
155              
156             # otherwise this run is just regular MsWord content
157             else {
158 9         27 return $run->as_xml;
159             }
160             }
161              
162              
163              
164             #======================================================================
165             # PROCESSING THE TEMPLATE
166             #======================================================================
167              
168             sub process {
169 2     2 1 1779 my ($self, $vars) = @_;
170              
171             # process the template to generate new XML
172 2         90 my $engine = $self->engine;
173 2         9 my $new_XML = $self->$engine($vars);
174              
175             # insert the generated output into a new MsWord document; other zip members
176             # are cloned from the original template
177 2         84 my $new_doc = $self->surgeon->meta->clone_object($self->surgeon);
178 2         857 $new_doc->contents($new_XML);
179              
180 2         201 return $new_doc;
181             }
182              
183              
184             #======================================================================
185             # DEFAULT ENGINE : TEMPLATE TOOLKIT, a.k.a. TT2
186             #======================================================================
187              
188              
189             sub TT2_engine {
190 2     2 0 20 my ($self, $vars) = @_;
191              
192 2         774 require Template::AutoFilter; # a subclass of Template that adds automatic html filtering
193              
194             # at the first invocation, create a TT2 compiled template and store it in the stash.
195             # Further invocations just reuse the TT2 object in stash.
196 2   100     65627 my $stash = $self->{engine_stash} //= {};
197 2   66     12 $stash->{TT2} //= Template::AutoFilter->new(@{$self->engine_args});
  1         50  
198 2   66     24680 $stash->{compiled_template} //= $stash->{TT2}->template(\$self->{template_text});
199              
200             # generate new XML by invoking the template on $vars
201 2         16218 my $new_XML = $stash->{TT2}->context->process($stash->{compiled_template}, $vars);
202              
203 2         5653 return $new_XML;
204             }
205              
206              
207              
208              
209             1;
210              
211             __END__
212              
213             =encoding ISO-8859-1
214              
215             =head1 NAME
216              
217             MsOffice::Word::Template - generate Microsoft Word documents from Word templates
218              
219             =head1 SYNOPSIS
220              
221             my $template = MsOffice::Word::Template->new($filename);
222             my $new_doc = $template->process(\%data);
223             $new_doc->save_as($path_for_new_doc);
224              
225             =head1 DESCRIPTION
226              
227             =head2 Purpose
228              
229             This module treats a Microsoft Word document as a template for generating other documents. The idea is
230             similar to the "mail merge" functionality in Word, but with much richer possibilities, because the
231             whole power of a Perl templating engine can be exploited, for example for
232              
233             =over
234              
235             =item *
236              
237             dealing with complex, nested datastructures
238              
239             =item *
240              
241             using control directives for loops, conditionals, subroutines, etc.
242              
243             =back
244              
245              
246             Template authors just have to use the highlighing function in MsWord to
247             mark the templating directives :
248              
249             =over
250              
251             =item *
252              
253             fragments highlighted in B<yelllow> are interpreted as I<data>
254             directives, i.e. the template result will be inserted at that point in
255             the document, keeping the current formatting properties (bold, italic,
256             font, etc.).
257              
258             =item *
259              
260             fragments highlighted in B<green> are interpreted as I<control>
261             directives that do not directly generate content, like loops, conditionals,
262             etc. Paragraphs or table rows around such directives are dismissed,
263             in order to avoid empty paragraphs or empty rows in the resulting document.
264              
265             =back
266              
267             The syntax of data and control directives depends on the backend
268             templating engine. The default engine is the L<Perl Template
269             Toolkit|Template>; other engines can be specified through parameters
270             to the L</new> method -- see the L</TEMPLATE ENGINE> section below.
271              
272              
273             =head2 Status
274              
275             This first release is a proof of concept. Some simple templates have
276             been successfully tried; however it is likely that a number of
277             improvements will have to be made before this system can be used at
278             large scale in production. If you use this module, please keep me
279             informed of your difficulties, tricks, suggestions, etc.
280              
281              
282             =head1 METHODS
283              
284             =head2 new
285              
286             my $template = MsOffice::Word::Template->new($docx);
287             # or : my $template = MsOffice::Word::Template->new($surgeon); # an instance of MsOffice::Word::Surgeon
288             # or : my $template = MsOffice::Word::Template->new(docx => $docx, %options);
289              
290             In its simplest form, the constructor takes a single argument which
291             is either a string (path to a F<docx> document), or an instance of
292             L<MsOffice::Word::Surgeon>. Otherwise the constructor takes a list of named parameters,
293             which can be
294              
295              
296             =over
297              
298             =item docx
299              
300             path to a MsWord document in F<docx> format. This will automatically create
301             an instance of L<MsOffice::Word::Surgeon> and pass it to the constructor
302             through the C<surgeon> keyword.
303              
304             =item surgeon
305              
306             an instance of L<MsOffice::Word::Surgeon>. This is a mandatory parameter, either
307             directly through the C<surgeon> keyword, or indirectly through the C<docx> keyword.
308              
309             =item data_color
310              
311             the Word highlight color for marking data directives (default : yellow)
312              
313             =item control_color
314              
315             the Word highlight color for marking control directives (default : green).
316             Such directives should produce no content. They are treated outside of the regular text flow.
317              
318             =back
319              
320             In addition to the attributes above, other attributes can be passed to the
321             constructor for specifying a templating engine different from the
322             default L<Perl Template Toolkit|Template>.
323             These are described in section L</TEMPLATE ENGINE> below.
324              
325              
326             =head2 process
327              
328             my $new_doc = $template->process(\%data);
329             $new_doc->save_as($path_for_new_doc);
330              
331             Process the template on a given data tree, and return a new document
332             (actually, a new instance of L<MsOffice::Word::Surgeon>).
333             That document can then be saved using L<MsOffice::Word::Surgeon/save_as>.
334              
335              
336             =head1 AUTHORING TEMPLATES
337              
338             A template is just a regular Word document, in which the highlighted
339             fragments represent templating directives.
340              
341             The data directives, i.e. the "holes" to be filled must be highlighted
342             in B<yellow>. Such zones must contain the names of variables to fill the
343             holes. If the template engine supports it, names of variables can be paths
344             into a complex datastructure, with dots separating the levels, like
345             C<foo.3.bar.-1> -- see L<Template::Manual::Directive/GET> and
346             L<Template::Manual::Variables> if you are using the Template Toolkit.
347              
348             Control directives like C<IF>, C<FOREACH>, etc. must be highlighted in
349             B<green>. When seeing a green zone, the system will remove markup for
350             the surrounding XML nodes (text, run and paragraph nodes). If this
351             occurs within a table, the markup for the current row is also
352             removed. This mechanism ensures that the final result will not contain
353             empty paragraphs or empty rows at places corresponding to control directives.
354              
355             In consequence of this distinction between yellow and green
356             highlights, templating zones cannot mix data directives with control
357             directives : a data directive within a green zone would generate output
358             outside of the regular XML flow (paragraph nodes, run nodes and text
359             nodes), and therefore MsWord would generate an error when trying to
360             open such content. There is a workaround, however : data directives
361             within a green zone will work if they I<also generate the appropriate markup>
362             for paragraph nodes, run nodes and text nodes; but in that case you must
363             also apply the "none" filter from L<Template::AutoFilter> so that
364             angle brackets in XML markup do not get translated into HTML entities.
365              
366              
367             =head1 TEMPLATE ENGINE
368              
369             This module invokes a backend I<templating engine> for interpreting the
370             template directives. In order to use an engine different from the default
371             L<Template Toolkit|Template>, you must supply the following parameters
372             to the N</new> method :
373              
374             =over
375              
376             =item start_tag
377              
378             The string for identifying the start of a template directive
379              
380             =item end_tag
381              
382             The string for identifying the end of a template directive
383              
384             =item engine
385              
386             A reference to a method that will perform the templating operation (explained below)
387              
388             =item engine_args
389              
390             An optional list of parameters that may be used by the engine
391              
392             =back
393              
394             Given a datatree in C<$vars>, the engine will be called as :
395              
396             my $engine = $self->engine;
397             my $new_XML = $self->$engine($vars);
398              
399             It is up to the engine method to exploit C<< $self->engine_args >> if needed.
400              
401             If the engine is called repetively, it may need to store some data to be
402             persistent between two calls, like for example a compiled version of the
403             parsed template. To this end, there is an internal hashref attribute
404             called C<engine_stash>. If necessary the stash can be cleared through
405             the C<clear_stash> method.
406              
407             Here is an example using L<Template::Mustache> :
408              
409             my $template = MsOffice::Word::Template->new(
410             docx => $template_file,
411             start_tag => "{{",
412             end_tag => "}}",
413             engine => sub {
414             my ($self, $vars) = @_;
415              
416             # at the first invocation, create a Mustache compiled template and store it in the stash.
417             # Further invocations will just reuse the object in stash.
418             my $stash = $self->{engine_stash} //= {};
419             $stash->{mustache} //= Template::Mustache->new(
420             template => $self->{template_text},
421             @{$self->engine_args}, # for ex. partials, partial_path, context
422             # -- see L<Template::Mustache> documentation
423             );
424              
425             # generate new XML by invoking the template on $vars
426             my $new_XML = $stash->{mustache}->render($vars);
427              
428             return $new_XML;
429             },
430             );
431              
432             The engine must make sure that ampersand characters and angle brackets
433             are automatically replaced by the corresponding HTML entities
434             (otherwise the resulting XML would be incorrect and could not be
435             opened by Microsoft Word). The Mustache engine does this
436             automatically. The Template Toolkit would normally require to
437             explicitly add an C<html> filter at each directive :
438              
439             [% foo.bar | html %]
440              
441             but thanks to the L<Template::AutoFilter>
442             module, this is performed automatically.
443              
444             =head1 AUTHOR
445              
446             Laurent Dami, E<lt>dami AT cpan DOT org<gt>
447              
448             =head1 COPYRIGHT AND LICENSE
449              
450             Copyright 2020, 2021 by Laurent Dami.
451              
452             This library is free software; you can redistribute it and/or modify
453             it under the same terms as Perl itself.
454              
455