File Coverage

blib/lib/WWW/Babelfish.pm
Criterion Covered Total %
statement 21 152 13.8
branch 0 48 0.0
condition 0 23 0.0
subroutine 7 15 46.6
pod 6 7 85.7
total 34 245 13.8


line stmt bran cond sub pod time code
1             package WWW::Babelfish;
2              
3             require 5.008;
4              
5 1     1   582 use strict;
  1         1  
  1         32  
6 1     1   3 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  1         2  
  1         100  
7              
8             require Exporter;
9             require AutoLoader;
10              
11             @ISA = qw(Exporter AutoLoader);
12             # Items to export into callers namespace by default. Note: do not export
13             # names by default without a very good reason. Use EXPORT_OK instead.
14             # Do not simply export all your public functions/methods/constants.
15             @EXPORT = qw();
16              
17             $VERSION = '0.16';
18              
19             # Preloaded methods go here.
20              
21 1     1   719 use HTTP::Request::Common qw(POST);
  1         55179  
  1         98  
22 1     1   1705 use LWP::UserAgent;
  1         37803  
  1         33  
23 1     1   742 use HTML::TokeParser;
  1         14152  
  1         32  
24 1     1   889 use IO::String;
  1         4650  
  1         29  
25 1     1   3056 use Encode;
  1         12827  
  1         2463  
26              
27             my $MAXCHUNK = 1000; # Maximum number of characters
28             # Bablefish will translate at one time
29              
30             my $MAXRETRIES = 50; # Maximum number of retries for a chunk of text
31             $| = 1;
32              
33             my $Services = {
34             Babelfish => {
35             agent => $0 . ":" . __PACKAGE__ . "/" . $VERSION,
36              
37             languagesrequest => sub {
38             my $req = new HTTP::Request(GET => 'http://babelfish.altavista.com/babelfish/tr?il=en');
39             return $req;
40             },
41              
42             translaterequest => sub {
43             my($langpair, $text) = @_;
44             my $req = POST ( 'http://babelfish.altavista.com/babelfish/tr?il=en',
45             [ 'doit' => 'done', 'urltext' => encode("utf8",$text), 'lp' => $langpair, 'Submit' => 'Translate', 'enc' => 'utf8' ], qw(Accept-Charset utf-8) );
46             return $req;
47             },
48              
49             # Extract the text from the html we get back from babelfish and return
50             # it (keying on the fact that it's the first thing after a
tag,
51             # possibly removing a textarea tag after it).
52              
53             # extract_text => sub {
54             # my($html) = @_;
55             # my $p = HTML::TokeParser->new(\$html);
56             # my $tag;
57             # while ($tag = $p->get_tag('input')) {
58             # $_ = @{$tag}[1]->{value} if @{$tag}[1]->{name} eq 'q';
59             # return decode("utf8",$_);
60             # }
61              
62             extract_text => sub {
63             my($html) = @_;
64             my $p = HTML::TokeParser->new(\$html);
65             while ( my $_tag = $p->get_tag('div') ) {
66             my($tag,$attr,$attrseq) = @$_tag;
67             next unless @$attrseq == 1
68             && $attrseq->[-1] eq 'style'
69             && $attr->{style} eq 'padding:10px;';
70             my($token) = $p->get_token or return;
71             my ( $type, $text, $is_data ) = @$token;
72             next if $type ne 'T';
73             return decode( utf8 => $text );
74             }
75              
76              
77             }
78             },
79              
80             Google => {
81             agent => 'Mozilla/5.0', # Google is finicky
82              
83             languagesrequest => sub {
84             my $req = new HTTP::Request(GET => 'http://www.google.com/language_tools?hl=en');
85             return $req;
86             },
87              
88             translaterequest => sub {
89             my($langpair, $text) = @_;
90             my $req = POST ( 'http://translate.google.com/translate_t',
91             [ 'text' => encode("utf8",$text), 'langpair' => $langpair, hl => 'en', ie => "UTF8", oe => "UTF8",]);
92             return $req;
93             },
94              
95             extract_text => sub {
96             my($html) = @_;
97             my $p = HTML::TokeParser->new(\$html);
98             my $tag;
99             while ($tag = $p->get_tag('div')) {
100             if (@{$tag}[1]->{id} eq 'result_box') {
101             $_ = $p->get_text;
102             return decode("utf8",$_);
103             }
104             }
105             }
106             },
107              
108             Yahoo => {
109             agent => $0 . ":" . __PACKAGE__ . "/" . $VERSION,
110              
111             languagesrequest => sub {
112             my $req = new HTTP::Request(GET => 'http://babelfish.yahoo.com/translate_txt');
113             return $req;
114             },
115              
116             translaterequest => sub {
117             my($langpair, $text) = @_;
118             my $req = POST ( 'http://babelfish.yahoo.com/translate_txt',
119             [ 'ei' => 'UTF-8', 'doit' => 'done', 'tt' => 'urltext', 'trtext' => encode("utf8",$text), 'lp' => $langpair, 'btnTrTxt' => 'Translate', 'intl' => '1' ]);
120             return $req;
121             },
122              
123             # Extract the text from the html we get back from Yahoo
124             extract_text => sub {
125             my($html) = @_;
126             my $p = HTML::TokeParser->new(\$html);
127             my $tag;
128             while ($tag = $p->get_tag('div')) {
129             next if (@{$tag}[1]->{id} ne 'result');
130             $_ = $p->get_text('/div');
131             return decode("utf8",$_);
132             }
133             }
134             },
135            
136             };
137              
138              
139             sub new {
140 0     0 1   my ($this, @args) = @_;
141 0   0       my $class = ref($this) || $this;
142 0           my $self = {};
143 0           bless $self, $class;
144 0 0         return undef unless( $self->initialize(@args) );
145 0           return $self;
146             }
147              
148             sub initialize {
149 0     0 0   my($self, %params) = @_;
150              
151 0   0       $self->{service} = $params{service} || 'Babelfish';
152 0 0         die "No such service: " . $self->{service} unless defined $Services->{ $self->{service} };
153              
154             # Caller can set user agent; we default to "script:WWW::Babelfish/0.01"
155 0   0       $self->{agent} = $params{agent} || $Services->{agent};
156              
157 0 0         $self->{proxy} = $params{proxy} if defined $params{proxy};
158              
159             # Get the page
160 0           my $ua = new LWP::UserAgent;
161 0 0         $ua->proxy('http','http://' . $self->{proxy}) if defined $self->{proxy};
162 0           $ua->agent($self->{agent});
163 0           $self->{ua} = $ua;
164              
165 0           my $req = &{ $Services->{ $self->{service} }->{languagesrequest} };
  0            
166 0           my $res = $ua->request($req);
167 0 0         unless($res->is_success){
168 0           warn(__PACKAGE__ . ":" . $res->status_line);
169 0           return 0;
170             }
171 0           my $page = $res->content;
172              
173             # Extract the language names and the mapping of languages to options to
174             # be passed back, and store them on our object in "Langs" hash of hashes
175             # Incredibly, this works for both Babelfish and Google; it should really
176             # be a method in $Services
177 0           my $p = HTML::TokeParser->new(\$page);
178 0           my $a2b;
179 0 0         if ( $p->get_tag("select") ) {
180 0           while ( $_ = $p->get_tag("option") ) {
181 0           $a2b = $p->get_trimmed_text;
182 0 0         next if $a2b =~ /Select from and to languages/; # This for babelfish
183 0 0         $a2b =~ /(\S+)\sto\s(\S+)/ or next;
184 0           $self->{Langs}{$1}{$2} = $_->[1]{value};
185 0   0       $self->{Langs}{$2} ||= {};
186             }
187             }
188              
189 0           return 1;
190             }
191              
192             sub services {
193 0     0 1   my $self = shift;
194 0 0         if($self){
195 0           return keys %{$self->Services};
  0            
196             }
197             else{
198 0           return keys %{$Services};
  0            
199             }
200             }
201              
202             sub languages {
203 0     0 1   my $self = shift;
204 0           return sort keys %{$self->{Langs}};
  0            
205             }
206              
207             sub languagepairs {
208 0     0 1   my $self = shift;
209 0           return $self->{Langs};
210             }
211              
212             sub translate {
213 0     0 1   my ($self, %params) = @_;
214              
215             # Paragraph separator is "\n\n" by default
216 0   0       local $/ = $params{delimiter} || "\n\n";
217 0           local $_;
218              
219 0 0         $params{delimiter} = "\n\n" if ( ! defined( $params{delimiter} ) );
220              
221 0           undef $self->{error};
222 0 0         unless ( exists($self->{Langs}->{$params{source}}) ) {
223 0           $self->{error} = qq(Language "$params{source}" is not available);
224 0           warn(__PACKAGE__ . ": " . $self->{error} . "\n");
225 0           return undef;
226             }
227              
228             # This "feature" is actually useful as a pass-thru filter.
229             # Babelfish doesn't do same-to-same anyway (though it would be
230             # pretty interesting if it did)
231 0 0         return $params{text} if $params{source} eq $params{destination};
232              
233 0 0         unless ( exists($self->{Langs}->{$params{source}}{$params{destination}}) ) {
234 0           $self->{error} =
235             qq(Cannot translate from "$params{source}" to "$params{destination}");
236 0           warn(__PACKAGE__ . ": " . $self->{error} . "\n");
237 0           return undef;
238             }
239              
240 0           my $langopt = $self->{Langs}{$params{source}}{$params{destination}};
241              
242 0           my $th; # "Text Handle"
243 0 0         if ( ref $params{text} ) { # We've been passed a filehandle
244 0           $th = $params{text};
245             } else { # We've been passed a string
246 0           $th = new IO::String($params{text});
247             }
248              
249 0           my $Text = "";
250 0           my $WANT_STRING_RETURNED = 0;
251 0 0         unless ( defined $params{ofh} ) {
252 0           $params{ofh} = new IO::String($Text);
253 0           $WANT_STRING_RETURNED = 1;
254             }
255              
256             # Variables we use in the next mega-block
257 0           my $para; # paragraph
258 0           my $num_paras = 0; # number of paragraphs
259 0           my $transpara; # translated paragraph
260 0           my $para_start_ws = ""; # initial whitespace in paragraph
261 0           my $chunk; # paragraph piece to feed to babelfish
262             my $req; # LWP request object
263 0           my $ua; # LWP user agent
264 0           my $res; # LWP result
265 0           my $text; # translated chunk
266 0           my $i; # a counter
267 0           while ($para = <$th>) {
268 0           $num_paras++;
269 0           $transpara = "";
270              
271             # Extract any leading whitespace from the start of the paragraph
272             # Babelfish will eat it anyway.
273 0 0         if ($para =~ s/(^\s+)(\S)/$2/) {
274 0   0       $para_start_ws = $1 || "";
275             }
276 0           $para =~ s/$params{delimiter}//; # Remove the para delimiter
277              
278             CHUNK:
279 0           foreach $chunk ( $self->_chunk_text($MAXCHUNK, $para) ) {
280 0           $req = &{ $Services->{ $self->{service} }->{translaterequest} }($langopt, $chunk);
  0            
281 0           $ua = $self->{ua};
282              
283             RETRY:
284 0           for ($i = 0; $i <= $MAXRETRIES; $i++) {
285 0           $res = $ua->request($req);
286              
287 0 0         if ( $res->is_success ) {
288              
289             #$text = $self->_extract_text($res->as_string); #REMOVE
290 0           $text = &{ $Services->{ $self->{service} }->{extract_text} }($res->as_string);
  0            
291 0 0 0       if ( ( ! defined( $text ) ) ||
292             ( $text =~ /^\*\*time-out\*\*/ )
293             ) # in-band signalling; yuck
294             {
295 0           next RETRY;
296              
297             } ## end if
298              
299 0           $text =~ s/\n$//; # Babelfish likes to append newlines
300 0           $transpara .= $text;
301              
302 0           next CHUNK;
303             }
304             }
305 0           $self->{error} = "Request timed out more than $MAXRETRIES times";
306 0           return undef;
307             }
308 0 0         print { $params{ofh} } $/ if $num_paras > 1;
  0            
309 0           print { $params{ofh} } $para_start_ws . $transpara;
  0            
310             }
311              
312 0 0         if ( $WANT_STRING_RETURNED ) {
313 0           return $Text;
314             } else {
315 0           return 1;
316             }
317             }
318              
319             sub error {
320 0     0 1   my $self = shift;
321 0           return $self->{error};
322             }
323              
324             # Given a maximum chunk size and some text, return
325             # an array of pieces of the text chopped up in a
326             # logical way and less than or equal to the chunk size
327             sub _chunk_text {
328 0     0     my($self, $max, $text) = @_;
329              
330 0           my @result;
331              
332             # The trivial case
333 0 0         return($text) if length($text) <= $max;
334              
335             # Hmmm. There are a couple of ways we could do this.
336             # I'm guessing that Babelfish doesn't look at any structure larger than
337             # a sentence; in fact I'm often tempted to guess that it doesn't look
338             # at anything larger than a word, but we'll give it the benefit of the doubt.
339             #
340              
341             # FIXME there are no built-in regexps for matching sentence
342             # breaks; I'm not sure if terminal punctuation will work for all
343             # languages...
344              
345 0           my $total = length($text);
346 0           my $offset = 0;
347 0           my $lastoffset = 0;
348 0           my $test;
349             my $chunk;
350              
351 0           while ( ($total - $lastoffset) > $max) {
352 0           $test = $lastoffset + $max;
353            
354             # Split by terminal punctuation...
355 0           @_ = sort {$b <=> $a} ( rindex($text, '.', $test),
  0            
356             rindex($text, '!', $test),
357             rindex($text, '?', $test),
358             );
359 0           $offset = shift(@_) + 1;
360              
361             # or by clause...
362 0 0 0       if ( $offset == -1 or $offset <= $lastoffset ) {
363 0           @_ = sort {$b <=> $a} ( rindex($text, ',', $test),
  0            
364             rindex($text, ';', $test),
365             rindex($text, ':', $test),
366             );
367 0           $offset = shift(@_) + 1;
368              
369              
370             # or by word
371 0 0 0       if ( $offset == -1 or $offset <= $lastoffset) {
372 0           $offset = rindex($text, " ", $test);
373             }
374              
375             # or give up
376 0 0         return undef if $offset == -1;
377             }
378            
379 0           $chunk = substr($text, $lastoffset, $offset - $lastoffset);
380              
381 0           push( @result, $chunk);
382 0           $lastoffset = $offset;
383             }
384              
385 0           push( @result, substr($text, $lastoffset) );
386 0           return @result;
387             }
388              
389             # This code is now obsoleted by the new result page format, but I'm
390             # leaving it here commented out in case we end up needing the
391             # whitespace hack again.
392             #
393             # my ($tag,$token);
394             # my $text="";
395             # if ($tag = $p->get_tag('br')) {
396             # while ($token = $p->get_token) {
397             # next if shift(@{$token}) ne "T";
398             # $text = shift(@{$token});
399              
400             # #$text =~ s/[\r\n]//g;
401             # # This patch for whitespace handling from Olivier Scherler
402             # $text =~ s/[\r\n]/ /g;
403             # $text =~ s/^\s*//;
404             # $text =~ s/\s+/ /g;
405             # $text =~ s/\s+$//;
406              
407             # last if defined($text) and $text ne "";
408             # }
409             # }
410             # return $text;
411              
412              
413             #}
414              
415              
416             # Autoload methods go after =cut, and are processed by the autosplit program.
417              
418             1;
419             __END__