File Coverage

blib/lib/Renard/Curie/Data/PDF.pm
Criterion Covered Total %
statement 28 102 42.1
branch 0 34 8.8
condition 0 3 0.0
subroutine 10 18 55.5
pod 6 6 100.0
total 44 163 38.0


line stmt bran cond sub pod time code
1 3     3   858 use Renard::Curie::Setup;
  3         7  
  3         28  
2             package Renard::Curie::Data::PDF;
3             # ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool
4             $Renard::Curie::Data::PDF::VERSION = '0.002';
5 3     3   1441 use Capture::Tiny qw(capture);
  3         37685  
  3         164  
6 3     3   5189 use XML::Simple;
  3         21251  
  3         23  
7 3     3   2506 use Alien::MuPDF 0.007;
  3         33302  
  3         26  
8 3     3   55720 use Path::Tiny;
  3         33  
  3         180  
9 3     3   18 use Function::Parameters;
  3         6  
  3         23  
10              
11 3     3   3950 use Log::Any qw($log);
  3         21555  
  3         15  
12 3     3   6870 use constant MUPDF_DEFAULT_RESOLUTION => 72; # dpi
  3         7  
  3         301  
13              
14             BEGIN {
15 3     3   28 our $MUTOOL_PATH = Alien::MuPDF->mutool_path;
16             }
17              
18 0     0     fun _call_mutool( @mutool_args ) {
  0            
19 0           my @args = ( $Renard::Curie::Data::PDF::MUTOOL_PATH, @mutool_args );
20 0           my ($stdout, $exit);
21              
22             # Note: The code below is marked as uncoverable because it only applies
23             # on Windows and we are currently only automatically checking coverage
24             # on Linux via Travis-CI.
25             # uncoverable branch true
26 0 0         if( $^O eq 'MSWin32' ) {
27             # Need to redirect to a file for two reasons:
28             # - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>.
29             # - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>.
30 0           my $temp_fh = File::Temp->new; # uncoverable statement
31 0           close $temp_fh; # to avoid Windows file locking # uncoverable statement
32              
33 0           my $output_param = 0; # uncoverable statement
34 0           for my $idx (1..@args-2) { # uncoverable statement
35             # uncoverable branch true
36 0 0 0       if( $args[$idx] eq '-o' # uncoverable statement
37             && $args[$idx+1] eq '-' ) {
38 0           $args[$idx+1] = $temp_fh->filename; # uncoverable statement
39 0           $output_param = 1; # uncoverable statement
40             }
41             }
42              
43             # uncoverable branch true
44 0 0         if( not $output_param ) { # uncoverable statement
45             # redirect into a temp file
46             my $cmd = join " ", # uncoverable statement
47 0 0         map { $_ =~ /\s/ ? "\"$_\"" : $_ } # uncoverable statement
  0            
48             @args; # uncoverable statement
49 0           my $redir = $temp_fh->filename; # uncoverable statement
50 0           @args = ("$cmd > \"$redir\""); # uncoverable statement
51             }
52              
53 0           $log->infof("running mutool: %s", \@args); # uncoverable statement
54 0           system( @args ); # uncoverable statement
55 0           $stdout = path( $temp_fh->filename )->slurp_raw; # uncoverable statement
56 0           $exit = $?; # uncoverable statement
57             } else {
58             ($stdout, undef, $exit) = capture {
59 0     0     $log->infof("running mutool: %s", \@args);
60 0           system( @args );
61 0           };
62             }
63              
64 0 0         die "Unexpected mutool exit: $exit" if $exit;
65              
66 0           return $stdout;
67             }
68              
69 0 0   0 1   fun get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) {
  0 0          
  0            
  0            
70 0           my $stdout = _call_mutool(
71             qw(draw),
72             qw( -r ), ($zoom_level * MUPDF_DEFAULT_RESOLUTION), # calculate the resolution
73             qw( -F png ),
74             qw( -o -),
75             $pdf_filename,
76             $pdf_page_no,
77             );
78              
79 0           return $stdout;
80             }
81              
82 0 0   0 1   fun get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) {
  0 0          
  0            
  0            
83 0           my $stdout = _call_mutool(
84             qw(draw),
85             qw(-F stext),
86             qw(-o -),
87             $pdf_filename,
88             $pdf_page_no,
89             );
90              
91 0           return $stdout;
92             }
93              
94 0 0   0 1   fun get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) {
  0 0          
  0            
  0            
95 0           my $stext_xml = get_mutool_text_stext_raw(
96             $pdf_filename,
97             $pdf_page_no,
98             );
99             # page -> [list of blocks]
100             # block -> [list of blocks]
101             # block is either:
102             # - stext
103             # line -> [list of lines] (all have same baseline)
104             # span -> [list of spans] (horizontal spaces over a line)
105             # char -> [list of chars]
106             # - image
107             # TODO
108              
109 0           my $stext = XMLin( $stext_xml,
110             ForceArray => [ qw(page block line span char) ] );
111              
112 0           return $stext;
113             }
114              
115 0 0   0 1   fun get_mutool_page_info_raw($pdf_filename) {
  0 0          
  0            
  0            
116 0           my $stdout = _call_mutool(
117             qw(pages),
118             $pdf_filename
119             );
120              
121             # remove the first line
122 0           $stdout =~ s/^[^\n]*\n//s;
123              
124             # wraps the data with a root node
125 0           return "<document>$stdout</document>"
126             }
127              
128 0 0   0 1   fun get_mutool_page_info_xml($pdf_filename) {
  0 0          
  0            
  0            
129 0           my $page_info_xml = get_mutool_page_info_raw( $pdf_filename );
130              
131 0           my $page_info = XMLin( $page_info_xml,
132             ForceArray => [ qw(page) ] );
133              
134 0           return $page_info;
135             }
136              
137 0 0   0 1   fun get_mutool_outline_simple($pdf_filename) {
  0 0          
  0            
  0            
138 0           my $outline_text = _call_mutool(
139             qw(show),
140             $pdf_filename,
141             qw(outline)
142             );
143              
144 0           my @outline_items = ();
145 0           open my $outline_fh, '<:encoding(UTF-8):crlf', \$outline_text;
146 0           while( defined( my $line = <$outline_fh> ) ) {
147 0           $line =~ /^(?<indent>\t*)(?<text>.*)\t#(?<page>\d+)(,(?<dx>\d+),(?<dy>\d+))?$/;
148 3     3   7566 my %copy = %+;
  3         1019  
  3         286  
  0            
149 0           $copy{level} = length $copy{indent};
150 0           delete $copy{indent};
151             # not storing the offsets yet and not every line has offsets
152 0           delete @copy{qw(dx dy)};
153 0           push @outline_items, \%copy;
154             }
155              
156 0           return \@outline_items;
157             }
158              
159              
160             1;
161              
162             __END__
163              
164             =pod
165              
166             =encoding UTF-8
167              
168             =head1 NAME
169              
170             Renard::Curie::Data::PDF - Retrieve PDF image and text data via MuPDF's mutool
171              
172             =head1 VERSION
173              
174             version 0.002
175              
176             =head1 FUNCTIONS
177              
178             =head2 _call_mutool
179              
180             _call_mutool( @args )
181              
182             Helper function which calls C<mutool> with the contents of the C<@args> array.
183              
184             Returns the captured C<STDOUT> of the call.
185              
186             This function dies if C<mutool> unsuccessfully exits.
187              
188             =head2 get_mutool_pdf_page_as_png
189              
190             get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level)
191              
192             This function returns a PNG stream that renders page number C<$pdf_page_no> of
193             the PDF file C<$pdf_filename> at a resolution of C<$zoom_level> times 72 dpi.
194              
195             =head2 get_mutool_text_stext_raw
196              
197             get_mutool_text_stext_raw($pdf_filename, $pdf_page_no)
198              
199             This function returns an XML string that contains structured text from page
200             number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
201              
202             The XML format is defined by the output of C<mutool> and looks like this (for page
203             23 of the C<pdf_reference_1-7.pdf> file):
204              
205             <document name="test-data/test-data/PDF/Adobe/pdf_reference_1-7.pdf">
206             <page width="531" height="666">
207             <block bbox="261.18 616.16394 269.77765 625.2532">
208             <line bbox="261.18 616.16394 269.77765 625.2532">
209             <span bbox="261.18 616.16394 269.77765 625.2532" font="MyriadPro-Semibold" size="7.98">
210             <char bbox="261.18 616.16394 265.50037 625.2532" x="261.18" y="623.2582" c="2"/>
211             <char bbox="265.50037 616.16394 269.77765 625.2532" x="265.50037" y="623.2582" c="3"/>
212             </span>
213             </line>
214             </block>
215             <block bbox="225.78 88.20229 305.18158 117.93829">
216             <line bbox="225.78 88.20229 305.18158 117.93829">
217             <span bbox="225.78 88.20229 305.18158 117.93829" font="MyriadPro-Bold" size="24">
218             <char bbox="225.78 88.20229 239.5176 117.93829" x="225.78" y="111.93829" c="P"/>
219             <char bbox="239.5176 88.20229 248.4552 117.93829" x="239.5176" y="111.93829" c="r"/>
220             <char bbox="248.4552 88.20229 261.1128 117.93829" x="248.4552" y="111.93829" c="e"/>
221             <char bbox="261.1128 88.20229 269.28238 117.93829" x="261.1128" y="111.93829" c="f"/>
222             <char bbox="269.28238 88.20229 281.93997 117.93829" x="269.28238" y="111.93829" c="a"/>
223             <char bbox="281.93997 88.20229 292.50958 117.93829" x="281.93997" y="111.93829" c="c"/>
224             <char bbox="292.50958 88.20229 305.18158 117.93829" x="292.50958" y="111.93829" c="e"/>
225             </span>
226             </line>
227             </block>
228             </page>
229             </document>
230              
231             Simplified, the high-level structure looks like:
232              
233             <page> -> [list of blocks]
234             <block> -> [list of blocks]
235             a block is either:
236             - stext
237             <line> -> [list of lines] (all have same baseline)
238             <span> -> [list of spans] (horizontal spaces over a line)
239             <char> -> [list of chars]
240             - image
241             TODO
242              
243             =head2 get_mutool_text_stext_xml
244              
245             get_mutool_text_stext_xml($pdf_filename, $pdf_page_no)
246              
247             Returns a HashRef of the structured text from page
248             number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
249              
250             See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for
251             details on the structure of this data.
252              
253             =head2 get_mutool_page_info_raw
254              
255             get_mutool_page_info_raw($pdf_filename)
256              
257             Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>.
258              
259             The data is in the form:
260              
261             <document>
262             <page pagenum="1">
263             <MediaBox l="0" b="0" r="531" t="666" />
264             <CropBox l="0" b="0" r="531" t="666" />
265             <Rotate v="0" />
266             </page>
267             <page pagenum="2">
268             ...
269             </page>
270             </document>
271              
272             =head2 get_mutool_page_info_xml
273              
274             get_mutool_page_info_xml($pdf_filename)
275              
276             Returns a HashRef containing the page bounding boxes of PDF file
277             C<$pdf_filename>.
278              
279             See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for
280             information on the structure of the data.
281              
282             =head2 get_mutool_outline_simple
283              
284             get_mutool_outline_simple($pdf_filename)
285              
286             Returns an array of the outline of the PDF file C<$pdf_filename> as an
287             C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of
288             L<Renard::Curie::Model::Outline>.
289              
290             =head1 AUTHOR
291              
292             Project Renard
293              
294             =head1 COPYRIGHT AND LICENSE
295              
296             This software is copyright (c) 2016 by Project Renard.
297              
298             This is free software; you can redistribute it and/or modify it under
299             the same terms as the Perl 5 programming language system itself.
300              
301             =cut