line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
562317
|
use Renard::Incunabula::Common::Setup; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
6
|
|
2
|
|
|
|
|
|
|
package Renard::API::MuPDF::mutool; |
3
|
|
|
|
|
|
|
# ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool |
4
|
|
|
|
|
|
|
$Renard::API::MuPDF::mutool::VERSION = '0.006'; |
5
|
1
|
|
|
1
|
|
8750
|
use Capture::Tiny qw(capture); |
|
1
|
|
|
|
|
16071
|
|
|
1
|
|
|
|
|
64
|
|
6
|
1
|
|
|
1
|
|
809
|
use XML::Simple; |
|
1
|
|
|
|
|
9351
|
|
|
1
|
|
|
|
|
7
|
|
7
|
1
|
|
|
1
|
|
614
|
use Alien::MuPDF 0.007; |
|
1
|
|
|
|
|
8033
|
|
|
1
|
|
|
|
|
8
|
|
8
|
1
|
|
|
1
|
|
22798
|
use Path::Tiny; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
59
|
|
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
540
|
use Log::Any qw($log); |
|
1
|
|
|
|
|
8398
|
|
|
1
|
|
|
|
|
6
|
|
11
|
1
|
|
|
1
|
|
2197
|
use constant MUPDF_DEFAULT_RESOLUTION => 72; # dpi |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
59
|
|
12
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
580
|
use Renard::API::MuPDF::mutool::ObjectParser; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
78
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
BEGIN {
    # Resolve the location of the mutool executable once, at compile time,
    # and publish it as a package variable that _call_mutool() reads.
    $Renard::API::MuPDF::mutool::MUTOOL_PATH = Alien::MuPDF->mutool_path;
}
18
|
|
|
|
|
|
|
|
19
|
0
|
|
|
0
|
|
|
# Runs the mutool executable stored in $MUTOOL_PATH with @mutool_args and
# returns everything it wrote to standard output, as raw bytes.
#
# Dies with "Unexpected mutool exit: <status>" when the exit status is
# non-zero.
fun _call_mutool( @mutool_args ) {
    my @args = ( $Renard::API::MuPDF::mutool::MUTOOL_PATH, @mutool_args );
    my ($stdout, $exit);

    # Note: The code below is marked as uncoverable because it only applies
    # on Windows and we are currently only automatically checking coverage
    # on Linux via Travis-CI.
    # uncoverable branch true
    if( $^O eq 'MSWin32' ) {
        # Need to redirect to a file for two reasons:
        # - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>.
        # - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>.

        # Load File::Temp explicitly instead of relying on another module
        # having loaded it already.
        require File::Temp; # uncoverable statement
        my $temp_fh = File::Temp->new; # uncoverable statement
        close $temp_fh; # to avoid Windows file locking # uncoverable statement

        # If the caller asked for output on stdout ("-o -"), point mutool at
        # the temporary file instead.
        my $output_param = 0; # uncoverable statement
        for my $idx (1..@args-2) { # uncoverable statement
            # uncoverable branch true
            if( $args[$idx] eq '-o' # uncoverable statement
                && $args[$idx+1] eq '-' ) {
                $args[$idx+1] = $temp_fh->filename; # uncoverable statement
                $output_param = 1; # uncoverable statement
            }
        }

        # uncoverable branch true
        if( not $output_param ) { # uncoverable statement
            # redirect into a temp file
            my $cmd = join " ", # uncoverable statement
                map { $_ =~ /\s/ ? "\"$_\"" : $_ } # uncoverable statement
                @args; # uncoverable statement
            my $redir = $temp_fh->filename; # uncoverable statement
            @args = ("$cmd > \"$redir\""); # uncoverable statement
        }

        $log->infof("running mutool: %s", \@args); # uncoverable statement
        system( @args ); # uncoverable statement
        # Record the child status immediately so that nothing executed
        # afterwards can overwrite $? before it is inspected.
        $exit = $?; # uncoverable statement
        $stdout = path( $temp_fh->filename )->slurp_raw; # uncoverable statement
    } else {
        # Make sure STDOUT is :raw
        open my $dup, ">&=", *STDOUT or die $!;
        local *STDOUT;
        open(STDOUT, ">&=", $dup) or die $!;
        binmode *STDOUT, ':raw';

        # The last statement of the captured block is system(), so $exit
        # receives its return value.
        ($stdout, undef, $exit) = capture {
            $log->infof("running mutool: %s", \@args);
            system( @args );
        };
    }

    die "Unexpected mutool exit: $exit" if $exit;

    return $stdout;
}
75
|
|
|
|
|
|
|
|
76
|
0
|
|
|
0
|
1
|
|
# Renders page $pdf_page_no of $pdf_filename through "mutool draw -F png"
# and returns the PNG data that mutool wrote to standard output.
#
# $zoom_level is multiplied by MUPDF_DEFAULT_RESOLUTION (72 dpi) to obtain
# the value passed to mutool's -r option.
fun get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) {
    my $resolution = $zoom_level * MUPDF_DEFAULT_RESOLUTION;

    my @draw_args = (
        'draw',
        '-r', $resolution,
        '-F', 'png',
        '-o', '-',
        $pdf_filename,
        $pdf_page_no,
    );

    my $png_data = _call_mutool( @draw_args );
    return $png_data;
}
88
|
|
|
|
|
|
|
|
89
|
0
|
|
|
0
|
1
|
|
# Runs "mutool draw -F stext" for page $pdf_page_no of $pdf_filename and
# returns the structured-text output exactly as mutool printed it.
fun get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) {
    my @draw_args = (
        'draw',
        '-F', 'stext',
        '-o', '-',
        $pdf_filename,
        $pdf_page_no,
    );

    my $stext_output = _call_mutool( @draw_args );
    return $stext_output;
}
100
|
|
|
|
|
|
|
|
101
|
0
|
|
|
0
|
1
|
|
# Fetches the structured text of page $pdf_page_no of $pdf_filename via
# get_mutool_text_stext_raw() and parses it with XML::Simple.
#
# The page, block, line, font and char elements are always forced into
# arrays, and no attribute is folded into a hash key.
fun get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) {
    my @xml_options = (
        KeyAttr => [],
        ForceArray => [ qw(page block line font char) ],
    );

    my $raw_stext = get_mutool_text_stext_raw( $pdf_filename, $pdf_page_no );
    my $parsed = XMLin( $raw_stext, @xml_options );

    return $parsed;
}
113
|
|
|
|
|
|
|
|
114
|
0
|
|
|
0
|
1
|
|
# Runs "mutool pages" on $pdf_filename and returns its output as a single
# XML string.
fun get_mutool_page_info_raw($pdf_filename) {
    my $pages_output = _call_mutool( 'pages', $pdf_filename );

    # Drop everything up to and including the first newline.
    $pages_output =~ s/^[^\n]*\n//s;

    # Wrap the remaining output in a single root node.
    return '<document>' . $pages_output . '</document>';
}
126
|
|
|
|
|
|
|
|
127
|
0
|
|
|
0
|
1
|
|
# Parses the output of get_mutool_page_info_raw() for $pdf_filename into a
# HashRef whose "page" entry is an ArrayRef with one hash per page.
#
# A page without a CropBox gets a copy of its own MediaBox or, failing
# that, of the MediaBox read from the Root/Pages object. When neither is
# available the page is left without a CropBox and a warning is logged.
fun get_mutool_page_info_xml($pdf_filename) {
    my $page_info_xml = get_mutool_page_info_raw( $pdf_filename );

    my $page_info = XMLin( $page_info_xml,
        KeyAttr => [],
        ForceArray => [ qw(page) ] );

    my $root_media_box_p = Renard::API::MuPDF::mutool::ObjectParser->new(
        filename => $pdf_filename,
        string => Renard::API::MuPDF::mutool::get_mutool_get_object_raw($pdf_filename, 'Root/Pages/MediaBox'),
        is_toplevel => 0,
    );

    # Stays undefined when the parser yields no data for Root/Pages/MediaBox.
    my $root_media_box;
    if( my $root_box_data = $root_media_box_p->data ) {
        $root_media_box = {
            l => $root_box_data->[0],
            b => $root_box_data->[1],
            r => $root_box_data->[2],
            t => $root_box_data->[3],
        };
    }

    for my $page_hash (@{ $page_info->{page} }) {
        next if exists $page_hash->{CropBox};

        my $media_box = exists $page_hash->{MediaBox}
            ? $page_hash->{MediaBox}
            : $root_media_box;

        # Without this guard, dereferencing an undefined $media_box below
        # would die under strict refs.
        unless( defined $media_box ) {
            $log->warnf("no MediaBox available to derive CropBox for page %s",
                $page_hash->{pagenum} // '(unknown)');
            next;
        }

        # Copy, so that CropBox and MediaBox do not share one hash.
        $page_hash->{CropBox} = { %$media_box };
    }

    return $page_info;
}
157
|
|
|
|
|
|
|
|
158
|
0
|
|
|
0
|
1
|
|
# Runs "mutool show <file> outline" and turns every recognised output line
# into a hash with the keys:
#
#   level - number of tabs after the prefix character, minus one
#   text  - the quoted title, with \xHH escapes decoded
#   open  - true when the line starts with '-'
#   page  - page number taken from the reference (absent for "(null)")
#
# Returns an ArrayRef of these hashes in output order. Lines that do not
# match the expected format are skipped and a warning is logged.
fun get_mutool_outline_simple($pdf_filename) {
    my $outline_text = _call_mutool(
        qw(show),
        $pdf_filename,
        qw(outline)
    );

    my $outline_line_re = qr/^
        (?<prefix>[+|-])
        (?<indent>\t*)
        "(?<text>.*)"
        \t
        (?<reference>
            # #123,20,40
            ( \# (?<page>\d+)(,(?<dx>-?\d+),(?<dy>-?\d+))? )
            |
            # #page=123&zoom=nan,20,40
            # #page=123&view=Fit
            ( \# page=(?<page>\d+)(&(view|zoom)=[^&,]+?)*(,(?<dx>-?\d+),(?<dy>-?\d+))? )
            |
            \Q(null)\E
        )
        $
    /x;

    my @outline_items = ();
    utf8::upgrade($outline_text);
    open my $outline_fh, '<:crlf', \$outline_text
        or die "Unable to read mutool outline output: $!";
    while( defined( my $line = <$outline_fh> ) ) {
        # Only build an item from a successful match; after a failed match
        # %+ would not describe the current line.
        unless( $line =~ $outline_line_re ) {
            $log->warnf("skipping unrecognised mutool outline line: %s", $line);
            next;
        }

        my %copy = %+;
        $copy{level} = length($copy{indent}) - 1;
        $copy{text} =~ s/\\x([0-9A-F]{2})/chr(hex($1))/ge;
        $copy{open} = $copy{prefix} eq '-';
        delete $copy{prefix};
        delete $copy{indent};
        delete $copy{reference};
        # not storing the offsets yet and not every line has offsets
        delete @copy{qw(dx dy)};
        push @outline_items, \%copy;
    }

    return \@outline_items;
}
200
|
|
|
|
|
|
|
|
201
|
0
|
|
|
0
|
1
|
|
# Runs "mutool show <file> trailer" and returns the first paragraph of its
# output (everything up to the first run of blank lines) as one string,
# with CRLF line endings translated by the :crlf layer.
#
# Always returns a single scalar, whatever the calling context; the value
# is undef when mutool produced no output.
fun get_mutool_get_trailer_raw($pdf_filename) {
    my $trailer_text = _call_mutool(
        qw(show),
        $pdf_filename,
        qw(trailer)
    );

    utf8::upgrade($trailer_text);
    open my $trailer_fh, '<:crlf', \$trailer_text
        or die "Unable to read mutool trailer output: $!";

    # Assigning to a scalar forces a single paragraph-mode read. Without it
    # a list-context caller (e.g. inside an argument list) would receive
    # every paragraph, or an empty list, and shift its other arguments.
    my $first_paragraph = do { local $/ = ''; <$trailer_fh> };

    return $first_paragraph;
}
212
|
|
|
|
|
|
|
|
213
|
0
|
|
|
0
|
1
|
|
# Runs "mutool show <file> <object_id>" and returns the first paragraph of
# its output (everything up to the first run of blank lines) as one string,
# with CRLF line endings translated by the :crlf layer.
#
# Always returns a single scalar, whatever the calling context; the value
# is undef when mutool produced no output.
fun get_mutool_get_object_raw($pdf_filename, $object_id) {
    my $object_text = _call_mutool(
        qw(show),
        $pdf_filename,
        $object_id,
    );

    utf8::upgrade($object_text);
    open my $object_fh, '<:crlf', \$object_text
        or die "Unable to read mutool object output: $!";

    # Assigning to a scalar forces a single paragraph-mode read. Without it
    # a list-context caller (e.g. inside an argument list) would receive
    # every paragraph, or an empty list, and shift its other arguments.
    my $first_paragraph = do { local $/ = ''; <$object_fh> };

    return $first_paragraph;
}
224
|
|
|
|
|
|
|
|
225
|
0
|
|
|
0
|
1
|
|
# Builds an ObjectParser from the raw trailer of $pdf_filename and returns
# whatever its resolve_key('Info') call yields.
fun get_mutool_get_info_object_parsed( $pdf_filename ) {
    my @parser_args = (
        filename => $pdf_filename,
        string => Renard::API::MuPDF::mutool::get_mutool_get_trailer_raw($pdf_filename),
    );

    my $trailer_parser = Renard::API::MuPDF::mutool::ObjectParser->new( @parser_args );

    my $info_object = $trailer_parser->resolve_key('Info');
    return $info_object;
}
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
1; |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
__END__ |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=pod |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=encoding UTF-8 |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=head1 NAME |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
Renard::API::MuPDF::mutool - Retrieve PDF image and text data via MuPDF's mutool |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head1 VERSION |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
version 0.006 |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head1 FUNCTIONS |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=head2 _call_mutool |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
_call_mutool( @args ) |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
Helper function which calls C<mutool> with the contents of the C<@args> array. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Returns the captured C<STDOUT> of the call. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
This function dies if C<mutool> unsuccessfully exits. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 get_mutool_pdf_page_as_png |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no, $zoom_level) |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
This function returns a PNG stream that renders page number C<$pdf_page_no> of |
268
|
|
|
|
|
|
|
the PDF file C<$pdf_filename>. The rendering resolution is C<$zoom_level> times the default resolution of 72 dpi. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=head2 get_mutool_text_stext_raw |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
get_mutool_text_stext_raw($pdf_filename, $pdf_page_no) |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
This function returns an XML string that contains structured text from page |
275
|
|
|
|
|
|
|
number C<$pdf_page_no> of the PDF file C<$pdf_filename>. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
The XML format is defined by the output of C<mutool> and looks like this (for page |
278
|
|
|
|
|
|
|
23 of the C<pdf_reference_1-7.pdf> file): |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
<?xml version="1.0"?> |
281
|
|
|
|
|
|
|
<document name="(null)"> |
282
|
|
|
|
|
|
|
<page height="666" width="531"> |
283
|
|
|
|
|
|
|
<block bbox="261.18 616.16397 269.77766 625.2532"> |
284
|
|
|
|
|
|
|
<line bbox="261.18 616.16397 269.77766 625.2532" dir="1 0" wmode="0"> |
285
|
|
|
|
|
|
|
<font name="MyriadPro-Semibold" size="7.98"> |
286
|
|
|
|
|
|
|
<char bbox="261.18 616.16397 265.45729 625.2532" c="2" x="261.18" y="623.2582"/> |
287
|
|
|
|
|
|
|
<char bbox="265.50038 616.16397 269.77766 625.2532" c="3" x="265.50038" y="623.2582"/> |
288
|
|
|
|
|
|
|
</font> |
289
|
|
|
|
|
|
|
</line> |
290
|
|
|
|
|
|
|
</block> |
291
|
|
|
|
|
|
|
<block bbox="225.78 88.20229 305.18159 117.93829"> |
292
|
|
|
|
|
|
|
<line bbox="225.78 88.20229 305.18159 117.93829" dir="1 0" wmode="0"> |
293
|
|
|
|
|
|
|
<font name="MyriadPro-Bold" size="24"> |
294
|
|
|
|
|
|
|
<char bbox="225.78 88.20229 239.724 117.93829" c="P" x="225.78" y="111.93829"/> |
295
|
|
|
|
|
|
|
<char bbox="239.5176 88.20229 248.63759 117.93829" c="r" x="239.5176" y="111.93829"/> |
296
|
|
|
|
|
|
|
<char bbox="248.4552 88.20229 261.1272 117.93829" c="e" x="248.4552" y="111.93829"/> |
297
|
|
|
|
|
|
|
<char bbox="261.1128 88.20229 269.29679 117.93829" c="f" x="261.1128" y="111.93829"/> |
298
|
|
|
|
|
|
|
</font> |
299
|
|
|
|
|
|
|
</line> |
300
|
|
|
|
|
|
|
</block> |
301
|
|
|
|
|
|
|
</page> |
302
|
|
|
|
|
|
|
</document> |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Simplified, the high-level structure looks like: |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
<page> -> [list of blocks] |
307
|
|
|
|
|
|
|
<block> -> [list of blocks] |
308
|
|
|
|
|
|
|
a block is either: |
309
|
|
|
|
|
|
|
- stext |
310
|
|
|
|
|
|
|
<line> -> [list of lines] (all have same baseline) |
311
|
|
|
|
|
|
|
<font> -> [list of fonts] (horizontal spaces over a line) |
312
|
|
|
|
|
|
|
<char> -> [list of chars] |
313
|
|
|
|
|
|
|
- image |
314
|
|
|
|
|
|
|
# TODO document the image data from mutool |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
=head2 get_mutool_text_stext_xml |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
get_mutool_text_stext_xml($pdf_filename, $pdf_page_no) |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
Returns a HashRef of the structured text from page |
321
|
|
|
|
|
|
|
number C<$pdf_page_no> of the PDF file C<$pdf_filename>. |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for |
324
|
|
|
|
|
|
|
details on the structure of this data. |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head2 get_mutool_page_info_raw |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
get_mutool_page_info_raw($pdf_filename) |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
The data is in the form: |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
<document> |
335
|
|
|
|
|
|
|
<page pagenum="1"> |
336
|
|
|
|
|
|
|
<MediaBox l="0" b="0" r="531" t="666" /> |
337
|
|
|
|
|
|
|
<CropBox l="0" b="0" r="531" t="666" /> |
338
|
|
|
|
|
|
|
<Rotate v="0" /> |
339
|
|
|
|
|
|
|
</page> |
340
|
|
|
|
|
|
|
<page pagenum="2"> |
341
|
|
|
|
|
|
|
... |
342
|
|
|
|
|
|
|
</page> |
343
|
|
|
|
|
|
|
</document> |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head2 get_mutool_page_info_xml |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
get_mutool_page_info_xml($pdf_filename) |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Returns a HashRef containing the page bounding boxes of PDF file |
350
|
|
|
|
|
|
|
C<$pdf_filename>. |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for |
353
|
|
|
|
|
|
|
information on the structure of the data. |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head2 get_mutool_outline_simple |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
fun get_mutool_outline_simple($pdf_filename) |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
Returns an array of the outline of the PDF file C<$pdf_filename> as an |
360
|
|
|
|
|
|
|
C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of |
361
|
|
|
|
|
|
|
L<Renard::Incunabula::Outline>. |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=head2 get_mutool_get_trailer_raw |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
fun get_mutool_get_trailer_raw($pdf_filename) |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Returns the trailer of the PDF file C<$pdf_filename> as a string. |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=head2 get_mutool_get_object_raw |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
fun get_mutool_get_object_raw($pdf_filename, $object_id) |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Returns the object given by the ID C<$object_id> for PDF file C<$pdf_filename> |
374
|
|
|
|
|
|
|
as a string. |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head2 get_mutool_get_info_object_parsed |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
fun get_mutool_get_info_object_parsed( $pdf_filename ) |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Returns the document information dictionary as a |
381
|
|
|
|
|
|
|
L<Renard::API::MuPDF::mutool::ObjectParser> object. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
See Table 10.2 on pg. 844 of the I<PDF Reference, version 1.7> to see the |
384
|
|
|
|
|
|
|
entries that are usually used (e.g., Title, Author). |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head1 SEE ALSO |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
L<Repository information|http://project-renard.github.io/doc/development/repo/p5-Renard-API-MuPDF-mutool/> |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=head1 AUTHOR |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
Project Renard |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
This software is copyright (c) 2017 by Project Renard. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
399
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=cut |