File Coverage

blib/lib/Renard/API/MuPDF/mutool.pm
Criterion Covered Total %
statement 25 129 31.0
branch 0 18 16.6
condition 0 3 0.0
subroutine 9 20 45.0
pod 9 9 100.0
total 43 179 34.0


line stmt bran cond sub pod time code
1 1     1   562317 use Renard::Incunabula::Common::Setup;
  1         3  
  1         6  
2             package Renard::API::MuPDF::mutool;
3             # ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool
4             $Renard::API::MuPDF::mutool::VERSION = '0.006';
5 1     1   8750 use Capture::Tiny qw(capture);
  1         16071  
  1         64  
6 1     1   809 use XML::Simple;
  1         9351  
  1         7  
7 1     1   614 use Alien::MuPDF 0.007;
  1         8033  
  1         8  
8 1     1   22798 use Path::Tiny;
  1         2  
  1         59  
9              
10 1     1   540 use Log::Any qw($log);
  1         8398  
  1         6  
11 1     1   2197 use constant MUPDF_DEFAULT_RESOLUTION => 72; # dpi
  1         2  
  1         59  
12              
13 1     1   580 use Renard::API::MuPDF::mutool::ObjectParser;
  1         5  
  1         78  
14              
15             BEGIN {
16 1     1   11 our $MUTOOL_PATH = Alien::MuPDF->mutool_path;
17             }
18              
19 0     0     fun _call_mutool( @mutool_args ) {
  0            
20 0           my @args = ( $Renard::API::MuPDF::mutool::MUTOOL_PATH, @mutool_args );
21 0           my ($stdout, $exit);
22              
23             # Note: The code below is marked as uncoverable because it only applies
24             # on Windows and we are currently only automatically checking coverage
25             # on Linux via Travis-CI.
26             # uncoverable branch true
27 0 0         if( $^O eq 'MSWin32' ) {
28             # Need to redirect to a file for two reasons:
29             # - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>.
30             # - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>.
31 0           my $temp_fh = File::Temp->new; # uncoverable statement
32 0           close $temp_fh; # to avoid Windows file locking # uncoverable statement
33              
34 0           my $output_param = 0; # uncoverable statement
35 0           for my $idx (1..@args-2) { # uncoverable statement
36             # uncoverable branch true
37 0 0 0       if( $args[$idx] eq '-o' # uncoverable statement
38             && $args[$idx+1] eq '-' ) {
39 0           $args[$idx+1] = $temp_fh->filename; # uncoverable statement
40 0           $output_param = 1; # uncoverable statement
41             }
42             }
43              
44             # uncoverable branch true
45 0 0         if( not $output_param ) { # uncoverable statement
46             # redirect into a temp file
47             my $cmd = join " ", # uncoverable statement
48 0 0         map { $_ =~ /\s/ ? "\"$_\"" : $_ } # uncoverable statement
  0            
49             @args; # uncoverable statement
50 0           my $redir = $temp_fh->filename; # uncoverable statement
51 0           @args = ("$cmd > \"$redir\""); # uncoverable statement
52             }
53              
54 0           $log->infof("running mutool: %s", \@args); # uncoverable statement
55 0           system( @args ); # uncoverable statement
56 0           $stdout = path( $temp_fh->filename )->slurp_raw; # uncoverable statement
57 0           $exit = $?; # uncoverable statement
58             } else {
59             # Make sure STDOUT is :raw
60 0 0         open my $dup, ">&=", *STDOUT or die $!;
61 0           local *STDOUT;
62 0           open(STDOUT, ">&=", $dup);
63 0           binmode *STDOUT, ':raw';
64              
65             ($stdout, undef, $exit) = capture {
66 0     0     $log->infof("running mutool: %s", \@args);
67 0           system( @args );
68 0           };
69             }
70              
71 0 0         die "Unexpected mutool exit: $exit" if $exit;
72              
73 0           return $stdout;
74             }
75              
76 0     0 1   fun get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) {
  0            
77 0           my $stdout = _call_mutool(
78             qw(draw),
79             qw( -r ), ($zoom_level * MUPDF_DEFAULT_RESOLUTION), # calculate the resolution
80             qw( -F png ),
81             qw( -o -),
82             $pdf_filename,
83             $pdf_page_no,
84             );
85              
86 0           return $stdout;
87             }
88              
89 0     0 1   fun get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) {
  0            
90 0           my $stdout = _call_mutool(
91             qw(draw),
92             qw(-F stext),
93             qw(-o -),
94             $pdf_filename,
95             $pdf_page_no,
96             );
97              
98 0           return $stdout;
99             }
100              
101 0     0 1   fun get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) {
  0            
102 0           my $stext_xml = get_mutool_text_stext_raw(
103             $pdf_filename,
104             $pdf_page_no,
105             );
106              
107 0           my $stext = XMLin( $stext_xml,
108             KeyAttr => [],
109             ForceArray => [ qw(page block line font char) ] );
110              
111 0           return $stext;
112             }
113              
114 0     0 1   fun get_mutool_page_info_raw($pdf_filename) {
  0            
115 0           my $stdout = _call_mutool(
116             qw(pages),
117             $pdf_filename
118             );
119              
120             # remove the first line
121 0           $stdout =~ s/^[^\n]*\n//s;
122              
123             # wraps the data with a root node
124 0           return "<document>$stdout</document>"
125             }
126              
127 0     0 1   fun get_mutool_page_info_xml($pdf_filename) {
  0            
128 0           my $page_info_xml = get_mutool_page_info_raw( $pdf_filename );
129              
130 0           my $page_info = XMLin( $page_info_xml,
131             KeyAttr => [],
132             ForceArray => [ qw(page) ] );
133              
134 0           my $root_media_box_p = Renard::API::MuPDF::mutool::ObjectParser->new(
135             filename => $pdf_filename,
136             string => Renard::API::MuPDF::mutool::get_mutool_get_object_raw($pdf_filename, 'Root/Pages/MediaBox'),
137             is_toplevel => 0,
138             );
139 0           my $root_media_box;
140 0 0         if( $root_media_box_p->data ) {
141 0           $root_media_box->{l} = $root_media_box_p->data->[0];
142 0           $root_media_box->{b} = $root_media_box_p->data->[1];
143              
144 0           $root_media_box->{r} = $root_media_box_p->data->[2];
145 0           $root_media_box->{t} = $root_media_box_p->data->[3];
146             }
147              
148 0           for my $page_hash (@{ $page_info->{page} }) {
  0            
149 0 0         unless( exists $page_hash->{CropBox} ) {
150 0 0         my $media_box = exists $page_hash->{MediaBox} ? $page_hash->{MediaBox} : $root_media_box;
151 0           $page_hash->{CropBox} = { %$media_box };
152             }
153             }
154              
155 0           return $page_info;
156             }
157              
158 0     0 1   fun get_mutool_outline_simple($pdf_filename) {
  0            
159 0           my $outline_text = _call_mutool(
160             qw(show),
161             $pdf_filename,
162             qw(outline)
163             );
164              
165 0           my @outline_items = ();
166 0           utf8::upgrade($outline_text);
167 0           open my $outline_fh, '<:crlf', \$outline_text;
168 0           while( defined( my $line = <$outline_fh> ) ) {
169 0           $line =~ /^
170             (?<prefix>[+|-])
171             (?<indent>\t*)
172             "(?<text>.*)"
173             \t
174             (?<reference>
175             # #123,20,40
176             ( \# (?<page>\d+)(,(?<dx>-?\d+),(?<dy>-?\d+))? )
177             |
178             # #page=123&zoom=nan,20,40
179             # #page=123&view=Fit
180             ( \# page=(?<page>\d+)(&(view|zoom)=[^&,]+?)*(,(?<dx>-?\d+),(?<dy>-?\d+))? )
181             |
182             \Q(null)\E
183             )
184             $
185             /x;
186 0           my %copy = %+;
187 0           $copy{level} = length($copy{indent}) - 1;
188 0           $copy{text} =~ s/\\x([0-9A-F]{2})/chr(hex($1))/ge;
  0            
189 0           $copy{open} = $copy{prefix} eq '-';
190 0           delete $copy{prefix};
191 0           delete $copy{indent};
192 0           delete $copy{reference};
193             # not storing the offsets yet and not every line has offsets
194 0           delete @copy{qw(dx dy)};
195 0           push @outline_items, \%copy;
196             }
197              
198 0           return \@outline_items;
199             }
200              
201 0     0 1   fun get_mutool_get_trailer_raw($pdf_filename) {
  0            
202 0           my $trailer_text = _call_mutool(
203             qw(show),
204             $pdf_filename,
205             qw(trailer)
206             );
207              
208 0           utf8::upgrade($trailer_text);
209 0           open my $trailer_fh, '<:crlf', \$trailer_text;
210 0           do { local $/ = ''; <$trailer_fh> };
  0            
  0            
211             }
212              
213 0     0 1   fun get_mutool_get_object_raw($pdf_filename, $object_id) {
  0            
214 0           my $object_text = _call_mutool(
215             qw(show),
216             $pdf_filename,
217             $object_id,
218             );
219              
220 0           utf8::upgrade($object_text);
221 0           open my $object_fh, '<:crlf', \$object_text;
222 0           do { local $/ = ''; <$object_fh> };
  0            
  0            
223             }
224              
225 0     0 1   fun get_mutool_get_info_object_parsed( $pdf_filename ) {
  0            
226 0           my $trailer = Renard::API::MuPDF::mutool::ObjectParser->new(
227             filename => $pdf_filename,
228             string => Renard::API::MuPDF::mutool::get_mutool_get_trailer_raw($pdf_filename),
229             );
230              
231 0           my $info = $trailer->resolve_key('Info');
232             }
233              
234              
235             1;
236              
237             __END__
238              
239             =pod
240              
241             =encoding UTF-8
242              
243             =head1 NAME
244              
245             Renard::API::MuPDF::mutool - Retrieve PDF image and text data via MuPDF's mutool
246              
247             =head1 VERSION
248              
249             version 0.006
250              
251             =head1 FUNCTIONS
252              
253             =head2 _call_mutool
254              
255             _call_mutool( @args )
256              
257             Helper function which calls C<mutool> with the contents of the C<@args> array.
258              
259             Returns the captured C<STDOUT> of the call.
260              
261             This function dies if C<mutool> unsuccessfully exits.
262              
263             =head2 get_mutool_pdf_page_as_png
264              
265             get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level)
266              
267             This function returns a PNG stream that renders page number C<$pdf_page_no> of
268             the PDF file C<$pdf_filename> at C<$zoom_level> times the default resolution (72 dpi).
269              
270             =head2 get_mutool_text_stext_raw
271              
272             get_mutool_text_stext_raw($pdf_filename, $pdf_page_no)
273              
274             This function returns an XML string that contains structured text from page
275             number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
276              
277             The XML format is defined by the output of C<mutool> and looks like this (for page
278             23 of the C<pdf_reference_1-7.pdf> file):
279              
280             <?xml version="1.0"?>
281             <document name="(null)">
282             <page height="666" width="531">
283             <block bbox="261.18 616.16397 269.77766 625.2532">
284             <line bbox="261.18 616.16397 269.77766 625.2532" dir="1 0" wmode="0">
285             <font name="MyriadPro-Semibold" size="7.98">
286             <char bbox="261.18 616.16397 265.45729 625.2532" c="2" x="261.18" y="623.2582"/>
287             <char bbox="265.50038 616.16397 269.77766 625.2532" c="3" x="265.50038" y="623.2582"/>
288             </font>
289             </line>
290             </block>
291             <block bbox="225.78 88.20229 305.18159 117.93829">
292             <line bbox="225.78 88.20229 305.18159 117.93829" dir="1 0" wmode="0">
293             <font name="MyriadPro-Bold" size="24">
294             <char bbox="225.78 88.20229 239.724 117.93829" c="P" x="225.78" y="111.93829"/>
295             <char bbox="239.5176 88.20229 248.63759 117.93829" c="r" x="239.5176" y="111.93829"/>
296             <char bbox="248.4552 88.20229 261.1272 117.93829" c="e" x="248.4552" y="111.93829"/>
297             <char bbox="261.1128 88.20229 269.29679 117.93829" c="f" x="261.1128" y="111.93829"/>
298             </font>
299             </line>
300             </block>
301             </page>
302             </document>
303              
304             Simplified, the high-level structure looks like:
305              
306             <page> -> [list of blocks]
307             <block> -> [list of blocks]
308             a block is either:
309             - stext
310             <line> -> [list of lines] (all have same baseline)
311             <font> -> [list of fonts] (horizontal spaces over a line)
312             <char> -> [list of chars]
313             - image
314             # TODO document the image data from mutool
315              
316             =head2 get_mutool_text_stext_xml
317              
318             get_mutool_text_stext_xml($pdf_filename, $pdf_page_no)
319              
320             Returns a HashRef of the structured text from page
321             number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
322              
323             See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for
324             details on the structure of this data.
325              
326             =head2 get_mutool_page_info_raw
327              
328             get_mutool_page_info_raw($pdf_filename)
329              
330             Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>.
331              
332             The data is in the form:
333              
334             <document>
335             <page pagenum="1">
336             <MediaBox l="0" b="0" r="531" t="666" />
337             <CropBox l="0" b="0" r="531" t="666" />
338             <Rotate v="0" />
339             </page>
340             <page pagenum="2">
341             ...
342             </page>
343             </document>
344              
345             =head2 get_mutool_page_info_xml
346              
347             get_mutool_page_info_xml($pdf_filename)
348              
349             Returns a HashRef containing the page bounding boxes of PDF file
350             C<$pdf_filename>.
351              
352             See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for
353             information on the structure of the data.
354              
355             =head2 get_mutool_outline_simple
356              
357             fun get_mutool_outline_simple($pdf_filename)
358              
359             Returns the outline of the PDF file C<$pdf_filename> as an
360             C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of
361             L<Renard::Incunabula::Outline>.
362              
363             =head2 get_mutool_get_trailer_raw
364              
365             fun get_mutool_get_trailer_raw($pdf_filename)
366              
367             Returns the trailer of the PDF file C<$pdf_filename> as a string.
368              
369             =head2 get_mutool_get_object_raw
370              
371             fun get_mutool_get_object_raw($pdf_filename, $object_id)
372              
373             Returns the object given by the ID C<$object_id> for PDF file C<$pdf_filename>
374             as a string.
375              
376             =head2 get_mutool_get_info_object_parsed
377              
378             fun get_mutool_get_info_object_parsed( $pdf_filename )
379              
380             Returns the document information dictionary as a
381             L<Renard::API::MuPDF::mutool::ObjectParser> object.
382              
383             See Table 10.2 on pg. 844 of the I<PDF Reference, version 1.7> to see the
384             entries that are usually used (e.g., Title, Author).
385              
386             =head1 SEE ALSO
387              
388             L<Repository information|http://project-renard.github.io/doc/development/repo/p5-Renard-API-MuPDF-mutool/>
389              
390             =head1 AUTHOR
391              
392             Project Renard
393              
394             =head1 COPYRIGHT AND LICENSE
395              
396             This software is copyright (c) 2017 by Project Renard.
397              
398             This is free software; you can redistribute it and/or modify it under
399             the same terms as the Perl 5 programming language system itself.
400              
401             =cut