File Coverage

blib/lib/Lingua/Ident.pm
Criterion Covered Total %
statement 68 68 100.0
branch 10 12 83.3
condition n/a
subroutine 6 6 100.0
pod 3 3 100.0
total 87 89 97.7


line stmt bran cond sub pod time code
1             # Time-stamp: <2010-05-14T16:53:47 mxp>
2             # Copyright © 2000 Michael Piotrowski. All Rights Reserved.
3              
4             =head1 NAME
5              
6             Lingua::Ident - Statistical language identification
7              
8             =head1 SYNOPSIS
9              
10             use Lingua::Ident;
11             $classifier = new Lingua::Ident("filename 1", ..., "filename n");
12             $lang = $classifier->identify("text to classify");
13             $probabilities = $classifier->calculate("text to classify");
14              
15             =head1 DESCRIPTION
16              
17             This module implements a statistical language identifier based on the
18             approach Ted Dunning described in his 1994 report I<Statistical
19             Identification of Language>.
20              
21             =head1 METHODS
22              
23             =cut
24              
25             ###############################################################################
26              
27             package Lingua::Ident;
28              
29             $VERSION='1.7';
30              
31 3     3   1777 use Carp;
  3         5  
  3         284  
32 3     3   2790 use bytes;
  3         24  
  3         16  
33 3     3   74 use strict;
  3         8  
  3         2510  
34              
35             =head2 Lingua::Ident->new($filename, ...)
36              
37             Construct a new classifier. The filename arguments to the constructor
38             must refer to files containing tables of n-gram probabilites for
39             languages (language models). These tables can be generated using the
40             trainlid(1) utility program.
41              
42             =cut
43              
44             sub new
45             {
46 2     2 1 70 my $class = shift;
47 2         6 my @files = @_;
48 2         6 my $self = {};
49              
50 2         5 my ($filename, $matrix, @matrices, @languages, %bigrams, @bigrams, @n_alph);
51              
52 2         5 foreach $filename (@files)
53             {
54 14 50       788 open MATRIX, "<$filename" or croak "$!";
55              
56 14         38 $matrix = {};
57 14         45 my %bigrams = ();
58              
59 14         306 while (<MATRIX>)
60             {
61 51087         52343 chomp;
62              
63 51087 100       119723 if (/:/)
    50          
64             {
65 38855         78375 (my $key, my $val) = split(/:/);
66 38855         156393 $matrix->{$key} = $val;
67             }
68             elsif (/;/)
69             {
70 12232         26427 (my $key, my $val) = split(/;/);
71 12232         51857 $bigrams{$key} = $val;
72             }
73             }
74              
75 14         47 push @matrices, $matrix;
76 14         49 push @languages, $matrix->{'_LANG'};
77 14         31 push @bigrams, \%bigrams;
78 14         48 push @n_alph, $matrix->{'#ALPH'};
79              
80 14         418 close MATRIX;
81             }
82              
83 2         9 $self->{MATRICES} = \@matrices;
84 2         5 $self->{LANGUAGES} = \@languages;
85 2         6 $self->{BIGRAMS} = \@bigrams;
86              
87             # Calculate the average alphabet size over all loaded language models
88 2         6 my $s;
89 2         7 map { $s += $_ } @n_alph;
  14         31  
90 2         11 $self->{AVG_ALPH} = $s / @n_alph;
91              
92 2         30 return bless $self, $class;
93             }
94              
95             =head2 $classifier->identify($string)
96              
97             Identify the language of a text given in $string. The identify()
98             method returns the value specified in the B<_LANG> field of the
99             probabilities table of the language in which the text is most likely
100             written (see L<"WARNINGS"> below).
101              
102             Internally, the identify() method calls the calculate() method.
103              
104             =cut
105              
106             sub identify
107             {
108 61     61 1 2237 my $self = shift;
109 61         90 my $text = shift;
110              
111 61         69 return ${$self->calculate($text)}[0]->[0];
  61         120  
112             }
113              
114             =head2 $classifier->calculate($string)
115              
116             Calculate the probabilities for a text to be in the languages known to
117             the classifier. This method returns a reference to an array. The
118             array represents a table of languages and the probabiliy for each
119             language. Each array element is a reference to an array containing
120             two elements: The language name and the associated probability. For
121             example, you may get something like this:
122              
123             [['de.iso-8859-1', -317.980835274509],
124             ['en.iso-8859-1', -450.804230119916], ...]
125              
126             The elements are sorted in descending order by probability. You can
127             use this data to assess the reliability of the categorization and make
128             your own decision using application-specific metrics.
129              
130             When neither a trigram nor a bigram is found, the calculation deviates
131             slightly from the formula given by Dunning (1994). According to
132             Dunning's formula, one would estimate the probability as:
133              
134             p = log(1/#alph)
135              
136             where #alph is the size of the alphabet of a particular language.
137             This penalizes different language models with different values because
138             the alphabet sizes of the languages differ.
139              
140             However, the size of the alphabet is much larger for Asian languages
141             than for European languages. For example, for the sample data in the
142             Lingua::Ident distribution trainlid(1) reports #alph = 127 for zh.big5
143             vs. #alph = 31 for de.iso-8859-1. This means that Asian languages are
144             penalized much harder than European languages when an estimation must
145             be made.
146              
147             To use the I<same> penalty for all languages, calculate() now uses the
148             average of all alphabet sizes instead.
149              
150             B<Note:> This has only been lightly tested yet--feedback is welcome.
151              
152             =cut
153              
154             sub calculate
155             {
156 61     61 1 81 my $self = shift;
157 61         62 my $text = shift;
158              
159 61         64 my @matrices = @{$self->{MATRICES}};
  61         189  
160 61         76 my @bigrams = @{$self->{BIGRAMS}};
  61         135  
161 61         164 my @prob = (0) x @matrices;
162 61         86 my ($c, $i, @chars, $trigram);
163              
164             # for ($i = 0; $i <= $#matrices; $i++) {
165             # print "bigram3 size: " . keys(%{$bigrams[$i]}) . "\n";
166             # }
167              
168 61         633 foreach $c (split //, $text)
169             {
170 3862         5484 push @chars, $c;
171 3862 100       6808 if (@chars == 3)
172             {
173 3740         6668 $trigram = lc(join("", @chars));
174             # $trigram = join("", @chars);
175             # $trigram =~ s/[\d\W]/ /og;
176 3740         5482 $trigram =~ s/[\x00-\x1f\x21-\x40\x7b-\x7f]/ /og;
177              
178 3740         7645 for ($i = 0; $i <= $#matrices; $i++)
179             {
180 29850 100       57286 if (exists $matrices[$i]->{$trigram})
181             {
182 11373         34789 $prob[$i] += log $matrices[$i]->{$trigram};
183             }
184             else
185             {
186             # $prob[$i] += log $matrices[$i]->{'_NULL'};
187 18477 100       34538 if (exists $bigrams[$i]->{substr($trigram, 0, 2)})
188             {
189 6586         21291 $prob[$i] +=
190             log (1 / $bigrams[$i]->{substr($trigram, 0, 2)});
191             }
192             else
193             {
194             # When neither a trigram nor a bigram is found,
195             # according to Dunning's formula, we would now
196             # calculate:
197              
198             # $prob[$i] += log (1 / $matrices[$i]->{'#ALPH'});
199              
200             # Thus, we penalize different language models with
201             # different values because of the language's
202             # alphabet size.
203              
204             # However, the size of the alphabet (#ALPH) for
205             # Asian languages is much larger than for European
206             # languages, e.g., with the sample data we get 127
207             # for zh.big5 vs. 31 for de.iso-8859-1. This means
208             # that these languages are penalized much harder
209             # than European languages. (This was pointed out by
210             # James Shaw .)
211              
212             # To use the same penalty for all languages, we use
213             # the average of the alphabet sizes instead.
214              
215             # NOTE: This has only been lightly tested yet.
216              
217 11891         31080 $prob[$i] += log (1 / $self->{AVG_ALPH});
218             }
219             }
220             }
221 3740         6036 shift @chars;
222             }
223             }
224              
225             # Assemble the results into an array of arrays. Each array
226             # contains two elements: The language name and the associated
227             # probability, e.g., @results may look like this:
228              
229             # (['de.iso-8859-1', '-317.980835274509'],
230             # ['en.iso-8859-1', '-450.804230119916'], ...)
231              
232 61         381 my @results;
233              
234 61         86 for ($i = 0; $i < @{$self->{'LANGUAGES'}}; $i++)
  547         1091  
235             {
236 486         1159 push @results, [$self->{'LANGUAGES'}->[$i], $prob[$i]];
237             }
238              
239             # Sort results in descending order by probability
240 61         207 my @sorted = sort { $b->[1] <=> $a->[1] } @results;
  850         1097  
241              
242 61         427 return \@sorted;
243             }
244              
245             =head1 WARNINGS
246              
247             Since Lingua::Ident is based on statistics it cannot be 100% accurate.
248             More precisely, Dunning (see below) reports his implementation to
249             achieve 92% accuracy with 50 KB of training text for 20-character
250             strings discriminating between English and Spanish. This
251             implementation should be as accurate as Dunning's. However, not only
252             the size but also the quality of the training text plays a role.
253              
254             The current implementation doesn't use a threshold to determine if the
255             most probable language has a high enough probability; if you're trying
256             to classify a text in a language for which there is no probability
257             table, this results in getting an incorrect language.
258              
259             =head1 AUTHOR
260              
261             Lingua::Ident was developed by Michael Piotrowski.
262              
263             =head1 LICENSE
264              
265             This program is free software; you may redistribute it and/or modify
266             it under the same terms as Perl itself.
267              
268             =head1 SEE ALSO
269              
270             Dunning, Ted (1994). I<Statistical Identification of Language>.
271             Technical report CRL MCCS-94-273. Computing Research Lab, New Mexico
272             State University.
273              
274             =cut
275              
276             1;