File Coverage

blib/lib/Renard/Incunabula/MuPDF/mutool.pm
Criterion Covered Total %
statement 28 102 42.1
branch 0 34 8.8
condition 0 3 0.0
subroutine 10 18 55.5
pod 6 6 100.0
total 44 163 38.0


line stmt bran cond sub pod time code
1 1     1   437685 use Renard::Incunabula::Common::Setup;
  1         3  
  1         8  
2             package Renard::Incunabula::MuPDF::mutool;
3             # ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool
4             $Renard::Incunabula::MuPDF::mutool::VERSION = '0.003';
5 1     1   10929 use Capture::Tiny qw(capture);
  1         48235  
  1         103  
6 1     1   1058 use XML::Simple;
  1         12463  
  1         15  
7 1     1   1008 use Alien::MuPDF 0.007;
  1         17564  
  1         15  
8 1     1   32805 use Path::Tiny;
  1         4  
  1         107  
9 1     1   9 use Function::Parameters;
  1         4  
  1         14  
10              
11 1     1   1577 use Log::Any qw($log);
  1         10998  
  1         9  
12 1     1   3338 use constant MUPDF_DEFAULT_RESOLUTION => 72; # dpi
  1         3  
  1         142  
13              
14             BEGIN {
15 1     1   15 our $MUTOOL_PATH = Alien::MuPDF->mutool_path;
16             }
17              
18 0     0     fun _call_mutool( @mutool_args ) {
  0            
19 0           my @args = ( $Renard::Incunabula::MuPDF::mutool::MUTOOL_PATH, @mutool_args );
20 0           my ($stdout, $exit);
21              
22             # Note: The code below is marked as uncoverable because it only applies
23             # on Windows and we are currently only automatically checking coverage
24             # on Linux via Travis-CI.
25             # uncoverable branch true
26 0 0         if( $^O eq 'MSWin32' ) {
27             # Need to redirect to a file for two reasons:
28             # - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>.
29             # - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>.
30 0           my $temp_fh = File::Temp->new; # uncoverable statement
31 0           close $temp_fh; # to avoid Windows file locking # uncoverable statement
32              
33 0           my $output_param = 0; # uncoverable statement
34 0           for my $idx (1..@args-2) { # uncoverable statement
35             # uncoverable branch true
36 0 0 0       if( $args[$idx] eq '-o' # uncoverable statement
37             && $args[$idx+1] eq '-' ) {
38 0           $args[$idx+1] = $temp_fh->filename; # uncoverable statement
39 0           $output_param = 1; # uncoverable statement
40             }
41             }
42              
43             # uncoverable branch true
44 0 0         if( not $output_param ) { # uncoverable statement
45             # redirect into a temp file
46             my $cmd = join " ", # uncoverable statement
47 0 0         map { $_ =~ /\s/ ? "\"$_\"" : $_ } # uncoverable statement
  0            
48             @args; # uncoverable statement
49 0           my $redir = $temp_fh->filename; # uncoverable statement
50 0           @args = ("$cmd > \"$redir\""); # uncoverable statement
51             }
52              
53 0           $log->infof("running mutool: %s", \@args); # uncoverable statement
54 0           system( @args ); # uncoverable statement
55 0           $stdout = path( $temp_fh->filename )->slurp_raw; # uncoverable statement
56 0           $exit = $?; # uncoverable statement
57             } else {
58             ($stdout, undef, $exit) = capture {
59 0     0     $log->infof("running mutool: %s", \@args);
60 0           system( @args );
61 0           };
62             }
63              
64 0 0         die "Unexpected mutool exit: $exit" if $exit;
65              
66 0           return $stdout;
67             }
68              
69 0 0   0 1   fun get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) {
  0 0          
  0            
  0            
70 0           my $stdout = _call_mutool(
71             qw(draw),
72             qw( -r ), ($zoom_level * MUPDF_DEFAULT_RESOLUTION), # calculate the resolution
73             qw( -F png ),
74             qw( -o -),
75             $pdf_filename,
76             $pdf_page_no,
77             );
78              
79 0           return $stdout;
80             }
81              
82 0 0   0 1   fun get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) {
  0 0          
  0            
  0            
83 0           my $stdout = _call_mutool(
84             qw(draw),
85             qw(-F stext),
86             qw(-o -),
87             $pdf_filename,
88             $pdf_page_no,
89             );
90              
91 0           return $stdout;
92             }
93              
94 0 0   0 1   fun get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) {
  0 0          
  0            
  0            
95 0           my $stext_xml = get_mutool_text_stext_raw(
96             $pdf_filename,
97             $pdf_page_no,
98             );
99              
100 0           my $stext = XMLin( $stext_xml,
101             ForceArray => [ qw(page block line span char) ] );
102              
103 0           return $stext;
104             }
105              
106 0 0   0 1   fun get_mutool_page_info_raw($pdf_filename) {
  0 0          
  0            
  0            
107 0           my $stdout = _call_mutool(
108             qw(pages),
109             $pdf_filename
110             );
111              
112             # remove the first line
113 0           $stdout =~ s/^[^\n]*\n//s;
114              
115             # wraps the data with a root node
116 0           return "<document>$stdout</document>"
117             }
118              
119 0 0   0 1   fun get_mutool_page_info_xml($pdf_filename) {
  0 0          
  0            
  0            
120 0           my $page_info_xml = get_mutool_page_info_raw( $pdf_filename );
121              
122 0           my $page_info = XMLin( $page_info_xml,
123             ForceArray => [ qw(page) ] );
124              
125 0           return $page_info;
126             }
127              
128 0 0   0 1   fun get_mutool_outline_simple($pdf_filename) {
  0 0          
  0            
  0            
129 0           my $outline_text = _call_mutool(
130             qw(show),
131             $pdf_filename,
132             qw(outline)
133             );
134              
135 0           my @outline_items = ();
136 0           open my $outline_fh, '<:encoding(UTF-8):crlf', \$outline_text;
137 0           while( defined( my $line = <$outline_fh> ) ) {
138 0           $line =~ /^(?<indent>\t*)(?<text>.*)\t#(?<page>\d+)(,(?<dx>\d+),(?<dy>\d+))?$/;
139 1     1   3152 my %copy = %+;
  1         516  
  1         147  
  0            
140 0           $copy{level} = length $copy{indent};
141 0           delete $copy{indent};
142             # not storing the offsets yet and not every line has offsets
143 0           delete @copy{qw(dx dy)};
144 0           push @outline_items, \%copy;
145             }
146              
147 0           return \@outline_items;
148             }
149              
150              
151             1;
152              
153             __END__
154              
155             =pod
156              
157             =encoding UTF-8
158              
159             =head1 NAME
160              
161             Renard::Incunabula::MuPDF::mutool - Retrieve PDF image and text data via MuPDF's mutool
162              
163             =head1 VERSION
164              
165             version 0.003
166              
167             =head1 FUNCTIONS
168              
169             =head2 _call_mutool
170              
171             _call_mutool( @args )
172              
173             Helper function which calls C<mutool> with the contents of the C<@args> array.
174              
175             Returns the captured C<STDOUT> of the call.
176              
177             This function dies if C<mutool> exits with a non-zero status.
178              
179             =head2 get_mutool_pdf_page_as_png
180              
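181             get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level)
182              
183             This function returns a PNG stream that renders page number C<$pdf_page_no> of
184             the PDF file C<$pdf_filename>. The rendering resolution is C<$zoom_level> times the default MuPDF resolution of 72 dpi.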
185              
186             =head2 get_mutool_text_stext_raw
187              
188             get_mutool_text_stext_raw($pdf_filename, $pdf_page_no)
189              
190             This function returns an XML string that contains structured text from page
191             number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
192              
193             The XML format is defined by the output of C<mutool> and looks like this (for page
194             23 of the C<pdf_reference_1-7.pdf> file):
195              
196             <document name="test-data/test-data/PDF/Adobe/pdf_reference_1-7.pdf">
197             <page width="531" height="666">
198             <block bbox="261.18 616.16394 269.77765 625.2532">
199             <line bbox="261.18 616.16394 269.77765 625.2532">
200             <span bbox="261.18 616.16394 269.77765 625.2532" font="MyriadPro-Semibold" size="7.98">
201             <char bbox="261.18 616.16394 265.50037 625.2532" x="261.18" y="623.2582" c="2"/>
202             <char bbox="265.50037 616.16394 269.77765 625.2532" x="265.50037" y="623.2582" c="3"/>
203             </span>
204             </line>
205             </block>
206             <block bbox="225.78 88.20229 305.18158 117.93829">
207             <line bbox="225.78 88.20229 305.18158 117.93829">
208             <span bbox="225.78 88.20229 305.18158 117.93829" font="MyriadPro-Bold" size="24">
209             <char bbox="225.78 88.20229 239.5176 117.93829" x="225.78" y="111.93829" c="P"/>
210             <char bbox="239.5176 88.20229 248.4552 117.93829" x="239.5176" y="111.93829" c="r"/>
211             <char bbox="248.4552 88.20229 261.1128 117.93829" x="248.4552" y="111.93829" c="e"/>
212             <char bbox="261.1128 88.20229 269.28238 117.93829" x="261.1128" y="111.93829" c="f"/>
213             <char bbox="269.28238 88.20229 281.93997 117.93829" x="269.28238" y="111.93829" c="a"/>
214             <char bbox="281.93997 88.20229 292.50958 117.93829" x="281.93997" y="111.93829" c="c"/>
215             <char bbox="292.50958 88.20229 305.18158 117.93829" x="292.50958" y="111.93829" c="e"/>
216             </span>
217             </line>
218             </block>
219             </page>
220             </document>
221              
222             Simplified, the high-level structure looks like:
223              
224             <page> -> [list of blocks]
225             <block> -> [list of blocks]
226             a block is either:
227             - stext
228             <line> -> [list of lines] (all have same baseline)
229             <span> -> [list of spans] (horizontal spaces over a line)
230             <char> -> [list of chars]
231             - image
232             # TODO document the image data from mutool
233              
234             =head2 get_mutool_text_stext_xml
235              
236             get_mutool_text_stext_xml($pdf_filename, $pdf_page_no)
237              
238             Returns a HashRef of the structured text from page
239             number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
240              
241             See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for
242             details on the structure of this data.
243              
244             =head2 get_mutool_page_info_raw
245              
246             get_mutool_page_info_raw($pdf_filename)
247              
248             Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>.
249              
250             The data is in the form:
251              
252             <document>
253             <page pagenum="1">
254             <MediaBox l="0" b="0" r="531" t="666" />
255             <CropBox l="0" b="0" r="531" t="666" />
256             <Rotate v="0" />
257             </page>
258             <page pagenum="2">
259             ...
260             </page>
261             </document>
262              
263             =head2 get_mutool_page_info_xml
264              
265             get_mutool_page_info_xml($pdf_filename)
266              
267             Returns a HashRef containing the page bounding boxes of PDF file
268             C<$pdf_filename>.
269              
270             See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for
271             information on the structure of the data.
272              
273             =head2 get_mutool_outline_simple
274              
275             get_mutool_outline_simple($pdf_filename)
276              
277             Returns the outline of the PDF file C<$pdf_filename> as an
278             C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of
279             L<Renard::Incunabula::Outline>.
280              
281             =head1 SEE ALSO
282              
283             L<Repository information|http://project-renard.github.io/doc/development/repo/p5-Renard-Incunabula-MuPDF-mutool/>
284              
285             =head1 AUTHOR
286              
287             Project Renard
288              
289             =head1 COPYRIGHT AND LICENSE
290              
291             This software is copyright (c) 2017 by Project Renard.
292              
293             This is free software; you can redistribute it and/or modify it under
294             the same terms as the Perl 5 programming language system itself.
295              
296             =cut