File Coverage

blib/lib/Lingua/Translate/Babelfish.pm
Criterion Covered Total %
statement 74 108 68.5
branch 21 48 43.7
condition 2 9 22.2
subroutine 12 14 85.7
pod 5 5 100.0
total 114 184 61.9


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             # Copyright (c) 2002, Sam Vilain. All rights reserved. This program
4             # is free software; you may use it under the same terms as Perl
5             # itself.
6             #
7             # Portions taken from WWW::Babelfish, by Daniel J. Urist
8             #
9              
10             package Lingua::Translate::Babelfish;
11              
12 3     3   916 use strict;
  3         6  
  3         111  
13 3     3   15 use Carp;
  3         5  
  3         223  
14 3     3   3152 use LWP::UserAgent;
  3         160777  
  3         110  
15 3     3   2905 use HTTP::Request::Common qw(GET POST);
  3         7397  
  3         335  
16              
17             # package globals:
18             # %config is default values to use for new objects
19             # %valid_langs is a hash from a babelfish URI to a hash of "XX_YY"
20             # language pair tags to true values.
21 3     3   26 use vars qw($VERSION %config %valid_langs);
  3         5  
  3         264  
22              
23             # WARNING: Some constants have their default values extracted from the
24             # POD. See the bottom of the file for the code that does this.
25              
26             =head1 NAME
27              
28             Lingua::Translate::Babelfish - Translation back-end for Altavista's
29             Babelfish, version 0.01
30              
31             =head1 SYNOPSIS
32              
33             use Lingua::Translate;
34              
35             Lingua::Translate::config
36             (
37             backend => "Babelfish",
38             babelfish_uri =>
39             'http://babelfish.yahoo.com/translate_txt',
40             ua => LWP::UserAgent->new(),
41             );
42              
43             my $xl8r = Lingua::Translate->new(src => "de", dest => "en");
44              
45             # prints "My hovercraft is full of eels"
46             print $xl8r->translate("Mein Luftkissenfahrzeug ist voll von den Aalen");
47              
48             =head1 DESCRIPTION
49              
50             Lingua::Translate::Babelfish is a translation back-end for
51             Lingua::Translate that contacts babelfish.altavisa.com to do the real
52             work.
53              
54             It is normally invoked by Lingua::Translate; there should be no need
55             to call it directly. If you do call it directly, you will lose the
56             ability to easily switch your programs over to alternate back-ends
57             that are later produced.
58              
59             =head1 CONSTRUCTOR
60              
61             =head2 new(src => $lang, dest => lang, option => $value)
62              
63             Creates a new translation handle. This method contacts Babelfish to
64             determine whether the requested language pair is available.
65              
66             =over
67              
68             =item src
69              
70             Source language, in RFC-3066 form. See L for a
71             discussion of RFC-3066 language tags.
72              
73             =item dest
74              
75             Destination Language
76              
77             =item ua
78              
79             =back
80              
81             Other options that may be passed to the config() function (see below)
82             may also be passed as arguments to this constructor.
83              
84             =cut
85              
86 3     3   1272 use I18N::LangTags qw(is_language_tag);
  3         2834  
  3         1916  
87              
88             sub new {
89 2     2 1 496 my ($class, %options) = (@_);
90              
91 2         17 my $self = bless { %config }, $class;
92              
93 2 50 33     21 croak "Must supply source and destination language"
94             unless (defined $options{src} and defined $options{dest});
95              
96 2 50       23 is_language_tag($self->{src} = delete $options{src})
97             or croak "$self->{src} is not a valid RFC3066 language tag";
98              
99 2 50       35 is_language_tag($self->{dest} = delete $options{dest})
100             or croak "$self->{dest} is not a valid RFC3066 language tag";
101              
102 2         28 $self->config(%options);
103              
104 2         7 return $self;
105             }
106              
107             =head1 METHODS
108              
109             The following methods may be called on Lingua::Translate::Babelfish
110             objects.
111              
112             =head2 translate($text) : $translated
113              
114             Translates the given text. die's on any kind of error.
115              
116             If too large a block of text is given to translate, it is broken up to
117             the nearest sentence, which may fail if you have extremely long
118             sentences that wouldn't normally be found in normal language, unless
119             you were either sending in some text that has no punctuation at all in
120             it or for some reason some person was rambling on and on about totally
121             irrelevant things such as cheese, one of the finest foods produced by
122             mankind, or simply was the sort of person who don't like ending
123             sentences but instead merely keep going with one big sentence with the
124             hope that it keeps readers going, though its usual effect is merely to
125             confuse.
126              
127             The previous paragraph gets translated by Babelfish OK.
128              
129              
130             =cut
131              
132             sub translate {
133 4     4 1 339 my $self = shift;
134 4 50       27 UNIVERSAL::isa($self, __PACKAGE__)
135             or croak __PACKAGE__."::translate() called as function";
136              
137 4         8 my $text = shift;
138              
139             # chunkify the text. Knowing how to do this properly is really
140             # the job of an expert system, so I'll keep it simple and break on
141             # English sentence terminators (which might be completely the
142             # wrong thing to do for other languages. Oh well)
143 4         129 my @chunks = ($text =~ m/\G\s* # strip excess white space
144             (
145             # some non-whitespace, then some data
146             \S.{0,$self->{chunk_size}}
147              
148             # either a full stop or the end of
149             # string
150             (?:[\.;:!\?]|$)
151             )
152             /xsg);
153 4 50 33     25 die "Could not break up given text into chunks"
154             if (pos($text) and pos($text) < length($text));
155              
156             # the translated text
157 4         8 my ($translated, $error);
158              
159             CHUNK:
160 4         9 for my $chunk ( @chunks ) {
161             # make a new request object
162 4         48 my $req = POST ($self->{babelfish_uri},
163             [
164             'doit' => 'done',
165             'intl' => '1',
166             'tt' => 'urltext',
167             'trtext' => $chunk,
168 4         12 'lp' => join("_", @{$self}{qw(src dest)}),
169             'Submit' => 'Translate',
170             'ei' => 'UTF-8',
171             'fr' => 'bf-res',
172             ]);
173              
174 4         20936 $req->header("Accept-Charset", "utf-8");
175              
176             RETRY:
177             # try several times to reach babelfish
178 4         186 for my $attempt ( 1 .. $self->{retries} + 1 ) {
179              
180             # go go gadget LWP::UserAgent
181 12         220 my $res = $self->agent->request($req);
182              
183 12 50       2812324 if( $res->is_success ){
184              
185 0         0 my $output = $self->_extract_text($res->as_string, $res->header("Content-Type"));
186              
187             # Reject outputs that are too useless.
188 0 0       0 next RETRY if $output =~ /^\*\*time-out\*\*|^ $/;
189              
190             # babelfish errors (will these always be reported in
191             # English?)
192 0 0       0 next RETRY if $output =~ /We were unable to process your? request within the available time/;
193              
194             # Babelfish likes to append newlines
195 0         0 $output =~ s/\n$//;
196              
197 0         0 $translated .= $output;
198              
199 0         0 next CHUNK;
200             } else {
201 12         210 $error .= "Request $attempt:".$res->status_line."; ";
202             }
203             }
204              
205             # give up
206 4         373 die "Request timed out more than $self->{retries} times ($error)";
207              
208             } # for my $chunk ...
209              
210 0         0 return $translated;
211              
212             }
213              
214 3     3   1132 use Unicode::MapUTF8 qw(to_utf8);
  3         353944  
  3         3656  
215              
216             # Extract the text from the html we get back from babelfish and return
217             # it. It seems that SysTrans are really trying to make this hard to
218             # screen scrape, oh well.
219             sub _extract_text {
220 0     0   0 my($self, $html, $contenttype) = @_;
221              
222 0 0       0 my ($translated) =
223             ($html =~ m{
224
            (?:
)?
225             ([^<]*)
226             or die "Babelfish response unparsable, brain needed";
227              
228 0         0 my ($encoding) = ($contenttype =~ m/charset=(\S*)/);
229              
230 0 0       0 if ( $encoding =~ /^utf-?8$/i ) {
231 0         0 return $translated
232             } else {
233 0         0 return to_utf8({ -string => $translated, -charset => $encoding });
234             }
235             }
236              
237             =head2 available() : @list
238              
239             Returns a list of available language pairs, in the form of "XX_YY",
240             where XX is the source language and YY is the destination. If you
241             want the english name of a language tag, call
242             I18N::LangTags::List::name() on it. See L.
243              
244             =cut
245              
246             sub available {
247              
248 0     0 1 0 my $self = shift;
249 0 0       0 UNIVERSAL::isa($self, __PACKAGE__)
250             or croak __PACKAGE__."::available() called as function";
251              
252             # return a cached result
253 0 0       0 if ( my $ok_langs = $valid_langs{$self->{babelfish_uri}} ) {
254 0         0 return keys %$ok_langs;
255             }
256              
257             # create a new request
258 0         0 my $req = GET $self->{babelfish_uri};
259              
260             # go get it
261 0         0 my $res = $self->agent->request($req);
262 0 0       0 die "Babelfish fetch failed; ".$res->status_line
263             unless $res->is_success;
264              
265             # extract out the languages
266 0         0 my $page = $res->content;
267              
268             # OK, so this only works for languages with two letter language
269             # codes. But can YOU see babelfish supporting any language that
270             # don't have a two letter code in the near future?
271 0         0 my @list;
272 0         0 for my $pair ($page =~ m/option value="([a-z][a-z]_[a-z][a-z])"/g) {
273              
274             # check that the pair is really a language
275 0         0 my ($src, $dest) = split /_/, $pair;
276 0 0 0     0 unless ( is_language_tag($src) and is_language_tag($dest) ) {
277 0         0 warn "Don't recognise `$pair' as a valid language pair";
278 0         0 next;
279             };
280              
281 0         0 push @list, $pair;
282             }
283              
284             # save the result
285 0         0 $valid_langs{$self->{babelfish_uri}} = map { $_ => 1 } @list;
  0         0  
286              
287 0         0 return @list;
288             }
289              
290             =head2 agent() : LWP::UserAgent
291              
292             Returns the LWP::UserAgent object used to contact Babelfish.
293              
294             =cut
295              
296             sub agent {
297              
298 12     12 1 22 my $self;
299 12 50       81 if ( UNIVERSAL::isa($_[0], __PACKAGE__) ) {
300 12         22 $self = shift;
301             } else {
302 0         0 $self = \%config;
303             }
304              
305 12 100       59 unless ( $self->{ua} ) {
306 2         25 $self->{ua} = LWP::UserAgent->new();
307 2         6758 $self->{ua}->agent($self->{agent});
308 2         161 $self->{ua}->env_proxy();
309             }
310              
311 12         11532 $self->{ua};
312             }
313              
314              
315             =head1 CONFIGURATION FUNCTIONS
316              
317             The following are functions, and not method calls. You will probably
318             not need them with normal use of the module.
319              
320             =head2 config(option => $value)
321              
322             This function sets defaults for use when constructing objects.
323              
324             =cut
325              
326             sub config {
327              
328 14     14 1 16 my $self;
329 14 100       94 if ( UNIVERSAL::isa($_[0], __PACKAGE__) ) {
330 2         7 $self = shift;
331             } else {
332 12         22 $self = \%config;
333             }
334              
335 14         61 while ( my ($option, $value) = splice @_, 0, 2 ) {
336              
337 12 100       64 if ( $option eq "babelfish_uri" ) {
    50          
    100          
    100          
    50          
338              
339             # set the Babelfish URI
340 3 50       39 ($self->{babelfish_uri} = $value) =~ m/\?(.*&)?$/
341             or croak "Babelfish URI `$value' not a query URI";
342              
343             } elsif ( $option eq "ua" ) {
344 0         0 $self->{ua} = $value;
345              
346             } elsif ( $option eq "agent" ) {
347              
348             # set the user-agent
349 3         8 $self->{agent} = $value;
350 3 50       27 $self->{ua}->agent($value) if $self->{ua};
351              
352             } elsif ( $option eq "chunk_size" ) {
353              
354 3         19 $self->{chunk_size} = $value;
355              
356             } elsif ( $option eq "retries" ) {
357              
358 3         23 $self->{retries} = $value;
359              
360             } else {
361              
362 0           croak "Unknown configuration option $option";
363             }
364             }
365             }
366              
367             =over
368              
369             =item babelfish_uri
370              
371             The uri to use when contacting Babelfish.
372              
373             The default value is
374             "http://babelfish.yahoo.com/translate_txt?"
375              
376             =item agent
377              
378             The User-Agent to pretend to be when contacting Babelfish.
379              
380             The default value is "Lingua::Translate::Babelfish/", plus the version
381             number of the package.
382              
383             =item chunk_size
384              
385             The size to break chunks into before handing them off to Babelfish.
386             The default value is "1000" (bytes).
387              
388             =item retries
389              
390             The number of times to retry contacting Babelfish if the first attempt
391             fails. The default value is "2".
392              
393             =back
394              
395             =cut
396              
397             # extract configuration options from the POD
398             use Pod::Constants
399 3         2310 'NAME' => sub { ($VERSION) = (m/(\d+\.\d+)/); },
400             'CONFIGURATION FUNCTIONS' => sub {
401             Pod::Constants::add_hook
402             ('*item' => sub {
403 12         2998 my ($varname) = m/(\w+)/;
404             #my ($default) = m/The default value is\s+"(.*)"\./s;
405 12         56 my ($default) = m/The default value is\s+"(.*)"/s;
406 12         34 config($varname => $default);
407             }
408 3         9557 );
409             Pod::Constants::add_hook
410             (
411             '*back' => sub {
412              
413             # an ugly hack?
414 3         729 $config{agent} .= $VERSION;
415              
416 3         14 Pod::Constants::delete_hook('*item');
417 3         54 Pod::Constants::delete_hook('*back');
418             }
419 3         103 );
420 3     3   1264 };
  3         28780  
  3         51  
421              
422             1;
423              
424             __END__