File Coverage

blib/lib/Document/OOXML.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1 2     2   178682 use utf8;
  2         17  
  2         14  
2             package Document::OOXML;
3 2     2   653 use Moose;
  2         775411  
  2         16  
4 2     2   13860 use namespace::autoclean;
  2         12070  
  2         7  
5              
6             # ABSTRACT: Manipulation of Office Open XML files
7             our $VERSION = '0.172650'; # VERSION
8              
9 2     2   545 use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
  2         64485  
  2         289  
10 2     2   15 use Carp;
  2         2  
  2         87  
11 2     2   120 use XML::LibXML;
  0            
  0            
12              
13             use Document::OOXML::ContentTypes;
14             use Document::OOXML::Document::Wordprocessor;
15             use Document::OOXML::PartParser;
16             use Document::OOXML::Rels;
17              
18              
19             my %ROOT_PART_REL_TYPES = (
               # Relationship type URIs used by read_document to locate the
               # package's main document part. The key names the flavour:
               # "transitional" uses schemas.openxmlformats.org, "strict" uses
               # purl.oclc.org/ooxml. read_document compares the matched type
               # against the strictDocument entry to set its is_strict flag.
20             transitionalDocument => 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
21             strictDocument => 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument',
22             );
23              
24              
25             sub read_document {
               # Class method: read the ZIP package at $filename, parse its main
               # document part and return a document object wrapping it.
               #
               # Arguments (taken with shift): the invocant, then $filename, which
               # is passed to Archive::Zip's read(). $class is shifted off but not
               # otherwise used in this sub.
               #
               # Returns the object built by $document_class->new(...), after the
               # parsed main part has been attached with set_document_part().
               #
               # Croaks when the archive cannot be read, when the contents of
               # '[Content_Types].xml', '_rels/.rels' or the main document part
               # evaluate to false, or when the parsed part is not a
               # Document::OOXML::Part::WordprocessingML.
26             my $class = shift;
27             my $filename = shift;
28              
29             my $zip = Archive::Zip->new();
30              
               # read() reports a status code; anything other than AZ_OK is fatal.
31             my $zip_status = $zip->read($filename);
32             croak("Cannot read: $zip_status") unless $zip_status == AZ_OK;
33              
               # NOTE(review): the croak message below names '/[Content_Types].xml'
               # (leading slash) while the member looked up is '[Content_Types].xml'.
34             my $content_types = do {
35             my $ct_xml = $zip->contents('[Content_Types].xml')
36             or croak("No member named '/[Content_Types].xml'. Is it OOXML?");
37              
38             Document::OOXML::ContentTypes->new_from_xml($ct_xml);
39             };
40              
               # The package-level relationships file points at the main document part.
41             my $base_rels_data = $zip->contents('_rels/.rels')
42             or croak("No member named '_rels/.rels' in document. Is it OOXML?");
43              
44             my $rels = Document::OOXML::Rels->new_from_xml($base_rels_data, '');
45              
46             # The "old"/transitional XML uses schemas.openxmlformats.org
47             # "New"/ISO standard/strict XML uses purl.oclc.org/ooxml
               #
               # The transitional type is tried first; the strict type is only
               # looked up when the first call returns a false value.
               # NOTE(review): the result is dereferenced with %{ ... } without a
               # check. If neither lookup finds a relation, this presumably dies
               # with a generic Perl dereference error instead of a croak — confirm
               # what get_part_relation_by_type returns for "not found".
48             my %document_part_relation = %{
49             $rels->get_part_relation_by_type($ROOT_PART_REL_TYPES{transitionalDocument})
50             || $rels->get_part_relation_by_type($ROOT_PART_REL_TYPES{strictDocument})
51             };
52              
53             my $type = $document_part_relation{type};
54             my $part_name = $document_part_relation{part_name};
55              
               # Strictness is decided solely by which relationship type matched.
56             my $strict;
57             if ($type eq $ROOT_PART_REL_TYPES{strictDocument}) {
58             $strict = 1;
59             } else {
60             $strict = 0;
61             }
62              
63             my $part_contents = $zip->contents($part_name)
64             or croak("No member named '$part_name' in document. Is it OOXML?");
65              
               # The content type registered for the part is passed to the parser
               # together with the raw contents, the part name and the strict flag.
66             my $doc_part = Document::OOXML::PartParser->parse_part(
67             content_type => $content_types->get_content_type_for_part($part_name),
68             contents => $part_contents,
69             part_name => $part_name,
70             is_strict => $strict,
71             );
72              
               # Only WordprocessingML main parts map to a document class here;
               # every other part class is rejected.
73             my $document_class;
74             if ($doc_part->isa('Document::OOXML::Part::WordprocessingML')) {
75             $document_class = 'Document::OOXML::Document::Wordprocessor';
76             }
77             else {
78             croak("Unsupported document type");
79             }
80              
               # The document object receives the Archive::Zip object as its source.
81             my $ooxml = $document_class->new(
82             content_types => $content_types,
83             filename => $filename,
84             source => $zip,
85             is_strict => $strict,
86             );
87              
88             # Parts have weak references to the document they're in, so they don't
89             # create reference loops.
90             #
91             # They can use this reference to find or add other parts (images,
92             # headers, footers, etc.) referenced by the main document.
93             $doc_part->document($ooxml);
94             $ooxml->set_document_part($doc_part);
95              
96             return $ooxml;
97             }
98              
99             __PACKAGE__->meta->make_immutable;  # Moose: finalize the class's metaclass once all declarations are done
100              
101             __END__
102              
103             =pod
104              
105             =encoding UTF-8
106              
107             =head1 NAME
108              
109             Document::OOXML - Manipulation of Office Open XML files
110              
111             =head1 VERSION
112              
113             version 0.172650
114              
115             =head1 SYNOPSIS
116              
117             my $doc = Document::OOXML->read_document('some.docx');
118              
119             $doc->replace_text('old', 'new');
120              
121             $doc->save_to_file('some_other.docx');
122              
123             =head1 DESCRIPTION
124              
125             This module provides a way to open, modify and save Office Open XML files
126             (also known as OOXML or Microsoft Office XML).
127              
128             =head1 METHODS
129              
130             =head2 read_document($filename)
131              
132             Opens the file named C<$filename> and parses it.
133              
134             Croaks if the file cannot be read as a ZIP archive, if a required package
                member is missing, or if the document type is not supported.
135              
136             Returns an instance of a subclass of L<Document::OOXML::Document> that can
137             be used to manipulate the contents of the document:
138              
139             =over
140              
141             =item * L<Document::OOXML::Document::Wordprocessor>
142              
143             =back
144              
145             =head1 SEE ALSO
146              
147             The format of Office Open XML files is described in the
148             L<ISO/IEC 29500|https://www.iso.org/standard/71691.html> and
149             L<ECMA-376|https://www.ecma-international.org/publications/standards/Ecma-376.htm>
150             standards.
151              
152             =head1 AUTHOR
153              
154             Martijn van de Streek <martijn@vandestreek.net>
155              
156             =head1 COPYRIGHT AND LICENSE
157              
158             This software is copyright (c) 2017 by Martijn van de Streek.
159              
160             This is free software; you can redistribute it and/or modify it under
161             the same terms as the Perl 5 programming language system itself.
162              
163             =cut