| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Biblio::Document::Parser::Utils; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
###################################################################### |
|
4
|
|
|
|
|
|
|
# |
|
5
|
|
|
|
|
|
|
# ParaTools::Document::Parser::Utils; |
|
6
|
|
|
|
|
|
|
# |
|
7
|
|
|
|
|
|
|
###################################################################### |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
# This file is part of ParaCite Tools (http://paracite.eprints.org/developers/) |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. |
|
12
|
|
|
|
|
|
|
# |
|
13
|
|
|
|
|
|
|
# ParaTools is free software; you can redistribute it and/or modify |
|
14
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
|
15
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or |
|
16
|
|
|
|
|
|
|
# (at your option) any later version. |
|
17
|
|
|
|
|
|
|
# |
|
18
|
|
|
|
|
|
|
# ParaTools is distributed in the hope that it will be useful, |
|
19
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
20
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
21
|
|
|
|
|
|
|
# GNU General Public License for more details. |
|
22
|
|
|
|
|
|
|
# |
|
23
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
24
|
|
|
|
|
|
|
# along with ParaTools; if not, write to the Free Software |
|
25
|
|
|
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
26
|
|
|
|
|
|
|
# |
|
27
|
|
|
|
|
|
|
###################################################################### |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
1
|
|
|
1
|
|
921
|
use utf8; |
|
|
1
|
|
|
|
|
11
|
|
|
|
1
|
|
|
|
|
5
|
|
|
31
|
1
|
|
|
1
|
|
31
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
38
|
|
|
32
|
|
|
|
|
|
|
require Exporter; |
|
33
|
1
|
|
|
1
|
|
1002
|
use LWP::UserAgent; |
|
|
1
|
|
|
|
|
64510
|
|
|
|
1
|
|
|
|
|
45
|
|
|
34
|
1
|
|
|
1
|
|
3638
|
use File::Temp qw/ tempfile tempdir /; |
|
|
1
|
|
|
|
|
32219
|
|
|
|
1
|
|
|
|
|
168
|
|
|
35
|
1
|
|
|
1
|
|
8
|
use URI; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
36
|
1
|
|
|
1
|
|
5
|
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAG $CHAR_MATCHES %CHAR_TRANSFORMS %CONVERTERS $DEBUG); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
1323
|
|
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
@ISA = qw( Exporter ); |
|
39
|
|
|
|
|
|
|
@EXPORT_OK = qw( &normalise_multichars ); |
|
40
|
|
|
|
|
|
|
@EXPORT = qw( &get_content ); |
|
41
|
|
|
|
|
|
|
$DEBUG = 0; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=pod |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 NAME |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Biblio::Document::Parser::Utils - utility module for handling international characters and document conversion |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Biblio::Document::Parser::Utils provides some utility functions for handling international |
|
52
|
|
|
|
|
|
|
characters and for conversion of documents to plaintext. |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
use Biblio::Document::Parser::Utils qw( normalise_multichars ); |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
print normalise_multichars( $str ); |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head1 METHODS |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=over 4 |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item $str = normalise_multichars( $str ) |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Convert multi-char international characters into single UTF-8 chars, e.g.: |
|
67
|
|
|
|
|
|
|
¨o => ö |
|
68
|
|
|
|
|
|
|
These appear in pdftotext output from PDFs generated by pdflatex. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=cut |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
$CHAR_MATCHES = '[\x{5e}\x{60}\x{a8}\x{b4}\x{7e}][aeounzn]'; |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
%CHAR_TRANSFORMS = ( |
|
75
|
|
|
|
|
|
|
"\x{5e}a"=>"\x{e2}", |
|
76
|
|
|
|
|
|
|
"\x{5e}e"=>"\x{ea}", |
|
77
|
|
|
|
|
|
|
"\x{5e}o"=>"\x{f4}", |
|
78
|
|
|
|
|
|
|
"\x{5e}u"=>"\x{fb}", |
|
79
|
|
|
|
|
|
|
"\x{60}a"=>"\x{e0}", |
|
80
|
|
|
|
|
|
|
"\x{60}e"=>"\x{e8}", |
|
81
|
|
|
|
|
|
|
"\x{60}o"=>"\x{f2}", |
|
82
|
|
|
|
|
|
|
"\x{60}u"=>"\x{f9}", |
|
83
|
|
|
|
|
|
|
"\x{a8}a"=>"\x{e4}", |
|
84
|
|
|
|
|
|
|
"\x{a8}e"=>"\x{eb}", |
|
85
|
|
|
|
|
|
|
"\x{a8}o"=>"\x{f6}", |
|
86
|
|
|
|
|
|
|
"\x{a8}u"=>"\x{fc}", |
|
87
|
|
|
|
|
|
|
"\x{b4}a"=>"\x{e1}", |
|
88
|
|
|
|
|
|
|
"\x{b4}e"=>"\x{e9}", |
|
89
|
|
|
|
|
|
|
"\x{b4}o"=>"\x{f3}", |
|
90
|
|
|
|
|
|
|
"\x{b4}u"=>"\x{fa}", |
|
91
|
|
|
|
|
|
|
"\x{b4}n"=>"\x{144}", |
|
92
|
|
|
|
|
|
|
"\x{b4}z"=>"\x{17a}", |
|
93
|
|
|
|
|
|
|
"\x{7e}n"=>"\x{f1}", |
|
94
|
|
|
|
|
|
|
); |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
%CONVERTERS = |
|
97
|
|
|
|
|
|
|
( |
|
98
|
|
|
|
|
|
|
doc => "wvText _IN_ _OUT_", |
|
99
|
|
|
|
|
|
|
pdf => "pdftotext -raw _IN_ _OUT_", |
|
100
|
|
|
|
|
|
|
ps => "pstotext -output _OUT_ _IN_", |
|
101
|
|
|
|
|
|
|
htm => "links --dump _IN_ > _OUT_", |
|
102
|
|
|
|
|
|
|
html => "links --dump _IN_ > _OUT_", |
|
103
|
|
|
|
|
|
|
); |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
if($DEBUG) { |
|
106
|
|
|
|
|
|
|
binmode(STDOUT,":utf8"); |
|
107
|
|
|
|
|
|
|
for(sort { $a cmp $b } keys %CHAR_TRANSFORMS) { |
|
108
|
|
|
|
|
|
|
print "$_ => $CHAR_TRANSFORMS{$_}\n"; |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub normalise_multichars { |
|
113
|
6
|
|
|
6
|
0
|
9
|
my $str = shift; |
|
114
|
6
|
|
|
|
|
28
|
$str =~ s/($CHAR_MATCHES)/$CHAR_TRANSFORMS{$1}/sgo; |
|
115
|
6
|
|
|
|
|
29
|
$str; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=pod |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=item $content = Biblio::Document::Parser::Utils::get_content($location) |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
This function takes either a filename or a URL as a parameter, and |
|
124
|
|
|
|
|
|
|
aims to return a string containing the lines in the file. A hash of |
|
125
|
|
|
|
|
|
|
converters is provided in Biblio/Document/Parser/Utils.pm, which should be customised |
|
126
|
|
|
|
|
|
|
for your system. |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
For URLs, the file is first downloaded to a temporary directory, then |
|
129
|
|
|
|
|
|
|
converted, whereas local files are copied straight into the temporary |
|
130
|
|
|
|
|
|
|
directory. For this reason, some care should be taken when handling very |
|
131
|
|
|
|
|
|
|
large files. |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=cut |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub get_content |
|
136
|
|
|
|
|
|
|
{ |
|
137
|
1
|
|
|
1
|
1
|
11
|
my($location) = @_; |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Get some temporary files ready. |
|
140
|
1
|
|
|
|
|
9
|
my $dir = tempdir( CLEANUP => 1 ); |
|
141
|
1
|
|
|
|
|
837
|
my (undef, $tofile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".txt"); |
|
142
|
|
|
|
|
|
|
|
|
143
|
1
|
|
|
|
|
518
|
my $type = "txt"; |
|
144
|
1
|
|
|
|
|
3
|
my $converter = ""; |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# Set up the type. |
|
147
|
1
|
50
|
|
|
|
10
|
if ($location =~ /\.(\w+?)$/) |
|
148
|
|
|
|
|
|
|
{ |
|
149
|
1
|
|
|
|
|
4
|
$type = $1; |
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
|
|
152
|
1
|
50
|
|
|
|
6
|
if ($location =~ /^http:\/\//) |
|
153
|
|
|
|
|
|
|
{ |
|
154
|
0
|
0
|
|
|
|
0
|
if (!$type) |
|
155
|
|
|
|
|
|
|
{ |
|
156
|
0
|
|
|
|
|
0
|
print STDERR "Unknown type - assuming HTML\n"; |
|
157
|
0
|
|
|
|
|
0
|
$type = "html"; |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
} |
|
160
|
|
|
|
|
|
|
else |
|
161
|
|
|
|
|
|
|
{ |
|
162
|
1
|
50
|
|
|
|
5
|
if (!$type) |
|
163
|
|
|
|
|
|
|
{ |
|
164
|
0
|
|
|
|
|
0
|
print STDERR "Unknown type - assuming plaintext\n"; |
|
165
|
0
|
|
|
|
|
0
|
$type = "txt"; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
|
|
169
|
1
|
|
|
|
|
5
|
my (undef, $fromfile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".$type"); |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Now we know the type, grab the files. |
|
172
|
1
|
50
|
|
|
|
415
|
if ($location =~ /^http:\/\//) |
|
173
|
|
|
|
|
|
|
{ |
|
174
|
|
|
|
|
|
|
# If it's remote, use the LWP mirror function to grab it. |
|
175
|
0
|
|
|
|
|
0
|
my $ua = new LWP::UserAgent(); |
|
176
|
0
|
|
|
|
|
0
|
$ua->mirror($location, $fromfile); |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
else |
|
179
|
|
|
|
|
|
|
{ |
|
180
|
|
|
|
|
|
|
# If it's local, mirror it straight to the $fromfile. |
|
181
|
1
|
50
|
|
|
|
40
|
open(FIN, $location) or die $!; |
|
182
|
1
|
50
|
|
|
|
65
|
open(FOUT, ">$fromfile") or die $!; |
|
183
|
1
|
|
|
|
|
41
|
foreach(<FIN>) { print FOUT $_; } |
|
|
15
|
|
|
|
|
41
|
|
|
184
|
1
|
50
|
|
|
|
55
|
close FOUT or die $!; |
|
185
|
1
|
50
|
|
|
|
14
|
close FIN or die $!; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
|
|
|
|
|
|
|
|
188
|
1
|
50
|
|
|
|
5
|
if ($type ne "txt") |
|
189
|
|
|
|
|
|
|
{ |
|
190
|
|
|
|
|
|
|
# Convert from the $fromfile to the $tofile. |
|
191
|
0
|
0
|
|
|
|
0
|
if (!$CONVERTERS{$type}) |
|
192
|
|
|
|
|
|
|
{ |
|
193
|
0
|
|
|
|
|
0
|
print STDERR "Sorry, no converters available for type $type\n"; |
|
194
|
0
|
|
|
|
|
0
|
return; |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
else |
|
197
|
|
|
|
|
|
|
{ |
|
198
|
0
|
|
|
|
|
0
|
$converter = $CONVERTERS{$type}; |
|
199
|
0
|
|
|
|
|
0
|
$converter =~ s/_IN_/$fromfile/g; |
|
200
|
0
|
|
|
|
|
0
|
$converter =~ s/_OUT_/$tofile/g; |
|
201
|
|
|
|
|
|
|
} |
|
202
|
0
|
|
|
|
|
0
|
system($converter); |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
else |
|
205
|
|
|
|
|
|
|
{ |
|
206
|
|
|
|
|
|
|
# If we have text, just use the fromfile. |
|
207
|
1
|
|
|
|
|
3
|
$tofile = $fromfile; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
|
|
210
|
1
|
|
|
|
|
2
|
my $content = ""; |
|
211
|
1
|
50
|
|
|
|
47
|
open( INPUT, $tofile ) or return; |
|
212
|
1
|
|
|
|
|
22
|
read( INPUT, $content, -s INPUT ); |
|
213
|
1
|
50
|
|
|
|
31
|
close INPUT or die $!; |
|
214
|
|
|
|
|
|
|
|
|
215
|
1
|
|
|
|
|
6
|
return $content; |
|
216
|
|
|
|
|
|
|
} |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=pod |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item $escaped_url = Biblio::Document::Parser::Utils::url_escape($string) |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
Simple function to convert a string into an encoded |
|
223
|
|
|
|
|
|
|
URL (i.e. spaces to %20, etc). Takes the unencoded |
|
224
|
|
|
|
|
|
|
URL as a parameter, and returns the encoded version. |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=cut |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub url_escape |
|
229
|
|
|
|
|
|
|
{ |
|
230
|
0
|
|
|
0
|
1
|
|
my( $url ) = @_; |
|
231
|
0
|
|
|
|
|
|
$url =~ s/</%3C/g; |
|
232
|
0
|
|
|
|
|
|
$url =~ s/>/%3E/g; |
|
233
|
0
|
|
|
|
|
|
$url =~ s/#/%23/g; |
|
234
|
0
|
|
|
|
|
|
$url =~ s/;/%3B/g; |
|
235
|
0
|
|
|
|
|
|
$url =~ s/&/%26/g; |
|
236
|
0
|
|
|
|
|
|
my $uri = URI->new( $url ); |
|
237
|
0
|
|
|
|
|
|
my $out = $uri->as_string; |
|
238
|
0
|
|
|
|
|
|
return $out; |
|
239
|
|
|
|
|
|
|
} |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
1; |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
__END__ |