File Coverage

blib/lib/Biblio/Document/Parser/Utils.pm
Criterion Covered Total %
statement 45 68 66.1
branch 11 26 42.3
condition n/a
subroutine 8 9 88.8
pod 2 3 66.6
total 66 106 62.2


line stmt bran cond sub pod time code
1             package Biblio::Document::Parser::Utils;
2              
3             ######################################################################
4             #
5             # ParaTools::Document::Parser::Utils;
6             #
7             ######################################################################
8             #
9             # This file is part of ParaCite Tools (http://paracite.eprints.org/developers/)
10             #
11             # Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
12             #
13             # ParaTools is free software; you can redistribute it and/or modify
14             # it under the terms of the GNU General Public License as published by
15             # the Free Software Foundation; either version 2 of the License, or
16             # (at your option) any later version.
17             #
18             # ParaTools is distributed in the hope that it will be useful,
19             # but WITHOUT ANY WARRANTY; without even the implied warranty of
20             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21             # GNU General Public License for more details.
22             #
23             # You should have received a copy of the GNU General Public License
24             # along with ParaTools; if not, write to the Free Software
25             # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26             #
27             ######################################################################
28              
29              
30 1     1   921 use utf8;
  1         11  
  1         5  
31 1     1   31 use strict;
  1         2  
  1         38  
32             require Exporter;
33 1     1   1002 use LWP::UserAgent;
  1         64510  
  1         45  
34 1     1   3638 use File::Temp qw/ tempfile tempdir /;
  1         32219  
  1         168  
35 1     1   8 use URI;
  1         2  
  1         37  
36 1     1   5 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAG $CHAR_MATCHES %CHAR_TRANSFORMS %CONVERTERS $DEBUG);
  1         2  
  1         1323  
37              
38             @ISA = qw( Exporter );
39             @EXPORT_OK = qw( &normalise_multichars );
40             @EXPORT = qw( &get_content );
41             $DEBUG = 0;
42              
43             =pod
44              
45             =head1 NAME
46              
47             Biblio::Document::Parser::Utils - utility module for handling international characters and document conversion
48              
49             =head1 DESCRIPTION
50              
51             Biblio::Document::Parser::Utils provides some utility functions for handling international
52             characters and for conversion of documents to plaintext.
53              
54             =head1 SYNOPSIS
55              
56             use Biblio::Document::Parser::Utils qw( normalise_multichars );
57              
58             print normalise_multichars( $str );
59              
60             =head1 METHODS
61              
62             =over 4
63              
64             =item $str = normalise_multichars( $str )
65              
66             Convert multi-char international characters into single UTF-8 chars, e.g.:
67             ¨o => ö
68             These appear in pdftotext output from PDFs generated by pdflatex.
69              
70             =cut
71              
72             $CHAR_MATCHES = '[\x{5e}\x{60}\x{a8}\x{b4}\x{7e}][aeounzn]';
73              
74             %CHAR_TRANSFORMS = (
75             "\x{5e}a"=>"\x{e2}",
76             "\x{5e}e"=>"\x{ea}",
77             "\x{5e}o"=>"\x{f4}",
78             "\x{5e}u"=>"\x{fb}",
79             "\x{60}a"=>"\x{e0}",
80             "\x{60}e"=>"\x{e8}",
81             "\x{60}o"=>"\x{f2}",
82             "\x{60}u"=>"\x{f9}",
83             "\x{a8}a"=>"\x{e4}",
84             "\x{a8}e"=>"\x{eb}",
85             "\x{a8}o"=>"\x{f6}",
86             "\x{a8}u"=>"\x{fc}",
87             "\x{b4}a"=>"\x{e1}",
88             "\x{b4}e"=>"\x{e9}",
89             "\x{b4}o"=>"\x{f3}",
90             "\x{b4}u"=>"\x{fa}",
91             "\x{b4}n"=>"\x{144}",
92             "\x{b4}z"=>"\x{17a}",
93             "\x{7e}n"=>"\x{f1}",
94             );
95              
96             %CONVERTERS =
97             (
98             doc => "wvText _IN_ _OUT_",
99             pdf => "pdftotext -raw _IN_ _OUT_",
100             ps => "pstotext -output _OUT_ _IN_",
101             htm => "links --dump _IN_ > _OUT_",
102             html => "links --dump _IN_ > _OUT_",
103             );
104              
105             if($DEBUG) {
106             binmode(STDOUT,":utf8");
107             for(sort { $a cmp $b } keys %CHAR_TRANSFORMS) {
108             print "$_ => $CHAR_TRANSFORMS{$_}\n";
109             }
110             }
111              
112             sub normalise_multichars {
113 6     6 0 9 my $str = shift;
114 6         28 $str =~ s/($CHAR_MATCHES)/$CHAR_TRANSFORMS{$1}/sgo;
115 6         29 $str;
116             }
117              
118              
119             =pod
120              
121             =item $content = Biblio::Document::Parser::Utils::get_content($location)
122              
123             This function takes either a filename or a URL as a parameter, and
124             aims to return a string containing the lines in the file. A hash of
125             converters (%CONVERTERS) is provided in this module, which should be customised
126             for your system.
127              
128             For URLs, the file is first downloaded to a temporary directory, then
129             converted, whereas local files are copied straight into the temporary
130             directory. For this reason, some care should be taken when handling very
131             large files.
132              
133             =cut
134              
135             sub get_content
136             {
137 1     1 1 11 my($location) = @_;
138              
139             # Get some temporary files ready.
140 1         9 my $dir = tempdir( CLEANUP => 1 );
141 1         837 my (undef, $tofile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".txt");
142              
143 1         518 my $type = "txt";
144 1         3 my $converter = "";
145              
146             # Set up the type.
147 1 50       10 if ($location =~ /\.(\w+?)$/)
148             {
149 1         4 $type = $1;
150             }
151              
152 1 50       6 if ($location =~ /^http:\/\//)
153             {
154 0 0       0 if (!$type)
155             {
156 0         0 print STDERR "Unknown type - assuming HTML\n";
157 0         0 $type = "html";
158             }
159             }
160             else
161             {
162 1 50       5 if (!$type)
163             {
164 0         0 print STDERR "Unknown type - assuming plaintext\n";
165 0         0 $type = "txt";
166             }
167             }
168              
169 1         5 my (undef, $fromfile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".$type");
170              
171             # Now we know the type, grab the files.
172 1 50       415 if ($location =~ /^http:\/\//)
173             {
174             # If it's remote, use the LWP mirror function to grab it.
175 0         0 my $ua = new LWP::UserAgent();
176 0         0 $ua->mirror($location, $fromfile);
177             }
178             else
179             {
180             # If it's local, mirror it straight to the $fromfile.
181 1 50       40 open(FIN, $location) or die $!;
182 1 50       65 open(FOUT, ">$fromfile") or die $!;
183 1         41 foreach(<FIN>) { print FOUT $_; }
  15         41  
184 1 50       55 close FOUT or die $!;
185 1 50       14 close FIN or die $!;
186             }
187            
188 1 50       5 if ($type ne "txt")
189             {
190             # Convert from the $fromfile to the $tofile.
191 0 0       0 if (!$CONVERTERS{$type})
192             {
193 0         0 print STDERR "Sorry, no converters available for type $type\n";
194 0         0 return;
195             }
196             else
197             {
198 0         0 $converter = $CONVERTERS{$type};
199 0         0 $converter =~ s/_IN_/$fromfile/g;
200 0         0 $converter =~ s/_OUT_/$tofile/g;
201             }
202 0         0 system($converter);
203             }
204             else
205             {
206             # If we have text, just use the fromfile.
207 1         3 $tofile = $fromfile;
208             }
209              
210 1         2 my $content = "";
211 1 50       47 open( INPUT, $tofile ) or return;
212 1         22 read( INPUT, $content, -s INPUT );
213 1 50       31 close INPUT or die $!;
214              
215 1         6 return $content;
216             }
217              
218             =pod
219              
220             =item $escaped_url = Biblio::Document::Parser::Utils::url_escape($string)
221              
222             Simple function to convert a string into an encoded
223             URL (i.e. spaces to %20, etc). Takes the unencoded
224             URL as a parameter, and returns the encoded version.
225              
226             =cut
227              
228             sub url_escape
229             {
230 0     0 1   my( $url ) = @_;
231 0           $url =~ s/</%3C/g;
232 0           $url =~ s/>/%3E/g;
233 0           $url =~ s/#/%23/g;
234 0           $url =~ s/;/%3B/g;
235 0           $url =~ s/&/%26/g;
236 0           my $uri = URI->new( $url );
237 0           my $out = $uri->as_string;
238 0           return $out;
239             }
240              
241             1;
242              
243             __END__