File Coverage

blib/lib/Document/OOXML.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1 2     2   161369 use utf8;
  2         5  
  2         11  
2             package Document::OOXML;
3 2     2   639 use Moose;
  2         729740  
  2         10  
4 2     2   12534 use namespace::autoclean;
  2         12224  
  2         5  
5              
6             # ABSTRACT: Manipulation of Office Open XML files
7             our $VERSION = '0.172610'; # VERSION
8              
9 2     2   514 use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
  2         58360  
  2         259  
10 2     2   12 use Carp;
  2         4  
  2         81  
11 2     2   128 use XML::LibXML;
  0            
  0            
12              
13             use Document::OOXML::ContentTypes;
14             use Document::OOXML::Part::WordprocessingML;
15             use Document::OOXML::Document::Wordprocessor;
16              
17              
18             my $RELATIONSHIPS_NS = 'http://schemas.openxmlformats.org/package/2006/relationships';
19              
20             my %ROOT_PART_REL_TYPES = (
21             transitionalDocument => 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
22             strictDocument => 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument',
23             );
24              
25             my %PART_TYPES = (
26             WordprocessingML => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml'
27             );
28              
29              
30             sub read_document {
31             my $class = shift;
32             my $filename = shift;
33              
34             my $zip = Archive::Zip->new();
35              
36             my $zip_status = $zip->read($filename);
37             croak("Cannot read: $zip_status") unless $zip_status == AZ_OK;
38              
39             my $content_types = do {
40             my $ct_xml = $zip->contents('[Content_Types].xml')
41             or croak("No member named '/[Content_Types].xml'. Is it OOXML?");
42              
43             Document::OOXML::ContentTypes->new_from_xml($ct_xml);
44             };
45              
46             my $base_rels = $zip->contents('_rels/.rels')
47             or croak("No member named '_rels/.rels' in document. Is it OOXML?");
48              
49             my $xml = XML::LibXML->load_xml( string => $base_rels );
50             my $xpc = XML::LibXML::XPathContext->new();
51             $xpc->registerNs('r' => $RELATIONSHIPS_NS);
52              
53             # The "old"/transitional XML uses schemas.openxmlformats.org
54             # "New"/ISO standard/strict XML uses purl.oclc.org/ooxml
55             my ($document_part_relation) = $xpc->findnodes(
56             qq{
57             /r:Relationships/r:Relationship[
58             \@Type='$ROOT_PART_REL_TYPES{transitionalDocument}'
59             or
60             \@Type='$ROOT_PART_REL_TYPES{strictDocument}'
61             ]
62             },
63             $xml->documentElement,
64             );
65              
66             my $id = $xpc->findvalue('@Id', $document_part_relation);
67             my $type = $xpc->findvalue('@Type', $document_part_relation);
68             my $part_name = $xpc->findvalue('@Target', $document_part_relation);
69              
70             my $strict;
71             if ($type eq $ROOT_PART_REL_TYPES{strictDocument}) {
72             $strict = 1;
73             } else {
74             $strict = 0;
75             }
76              
77             my $part_contents = $zip->contents($part_name)
78             or croak("No member named '$part_name' in document. Is it OOXML?");
79              
80             my $doc_part = _parse_part(
81             content_type => $content_types->get_content_type_for_part("/$part_name"),
82             contents => $part_contents,
83             part_name => $part_name,
84             is_strict => $strict,
85             );
86              
87             my $document_class;
88             if ($doc_part->isa('Document::OOXML::Part::WordprocessingML')) {
89             $document_class = 'Document::OOXML::Document::Wordprocessor';
90             }
91             else {
92             croak("Unsupported document type");
93             }
94              
95             my $ooxml = $document_class->new(
96             content_types => $content_types,
97             filename => $filename,
98             source => $zip,
99             is_strict => $strict,
100             );
101              
102             # Parts have weak references to the document they're in, so they don't
103             # create reference loops.
104             #
105             # They can use this reference to find or add other parts (images,
106             # headers, footers, etc.) referenced by the main document.
107             $doc_part->document($ooxml);
108             $ooxml->set_document_part($doc_part);
109              
110             return $ooxml;
111             }
112              
113             sub _parse_part {
114             my %args = @_;
115              
116             if ($args{content_type} eq $PART_TYPES{WordprocessingML}) {
117             return Document::OOXML::Part::WordprocessingML->new_from_xml(
118             $args{part_name},
119             $args{contents},
120             $args{is_strict} ? 1 : 0,
121             );
122             }
123              
124             croak("Unknown part of type '$args{content_type}'");
125             }
126              
127             __PACKAGE__->meta->make_immutable;
128              
129             __END__
130              
131             =pod
132              
133             =encoding UTF-8
134              
135             =head1 NAME
136              
137             Document::OOXML - Manipulation of Office Open XML files
138              
139             =head1 VERSION
140              
141             version 0.172610
142              
143             =head1 SYNOPSIS
144              
145             my $doc = Document::OOXML->read_document('some.docx');
146              
147             $doc->replace_text('old', 'new');
148              
149             $doc->save_to_file('some_other.docx');
150              
151             =head1 DESCRIPTION
152              
153             This module provides a way to open, modify and save Office Open XML files
154             (also known as OOXML or Microsoft Office XML).
155              
156             =head1 METHODS
157              
158             =head2 read_document($filename)
159              
160             Opens the file named C<$filename> and parses it.
161              
162             If the file doesn't appear to be a valid package, it will croak.
163              
164             Returns an instance of a subclass of L<Document::OOXML::Document> that can
165             be used to manipulate the contents of the document:
166              
167             =over
168              
169             =item * L<Document::OOXML::Document::Wordprocessor>
170              
171             =back
172              
173             =head1 SEE ALSO
174              
175             The format of Office Open XML files is described in the
176             L<ISO/IEC 29500|https://www.iso.org/standard/71691.html> and
177             L<ECMA-376|https://www.ecma-international.org/publications/standards/Ecma-376.htm>
178             standards.
179              
180             =head1 AUTHOR
181              
182             Martijn van de Streek <martijn@vandestreek.net>
183              
184             =head1 COPYRIGHT AND LICENSE
185              
186             This software is copyright (c) 2017 by Martijn van de Streek.
187              
188             This is free software; you can redistribute it and/or modify it under
189             the same terms as the Perl 5 programming language system itself.
190              
191             =cut