# Time-stamp: <2010-05-14T16:53:47 mxp>
# Copyright © 2000 Michael Piotrowski. All Rights Reserved.

=head1 NAME

Lingua::Ident - Statistical language identification

=head1 SYNOPSIS

  use Lingua::Ident;
  $classifier = new Lingua::Ident("filename 1", ..., "filename n");
  $lang = $classifier->identify("text to classify");
  $probabilities = $classifier->calculate("text to classify");

=head1 DESCRIPTION

This module implements a statistical language identifier based on the
approach Ted Dunning described in his 1994 report I<Statistical
Identification of Language>.
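
(The identification works on character trigram statistics: for
example, the string C<"the "> contains the trigrams C<"the"> and
C<"he ">, and the relative frequencies of such trigrams differ
characteristically between languages.)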

=head1 METHODS

=cut

###############################################################################

package Lingua::Ident;

$VERSION = '1.7';

use Carp;
use bytes;
use strict;

=head2 Lingua::Ident->new($filename, ...)

Construct a new classifier. The filename arguments to the constructor
must refer to files containing tables of n-gram probabilities for
languages (language models). These tables can be generated using the
trainlid(1) utility program.
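
For example, assuming two model files created with trainlid(1) (the
file names below are placeholders):

  use Lingua::Ident;

  my $classifier = Lingua::Ident->new('de.iso-8859-1', 'en.iso-8859-1');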

=cut

sub new
{
   my $class = shift;
   my @files = @_;
   my $self = {};

   my ($filename, $matrix, @matrices, @languages, %bigrams, @bigrams, @n_alph);

   foreach $filename (@files)
   {
      open MATRIX, "<$filename" or croak "$!";

      $matrix = {};
      my %bigrams = ();

      # Model files contain trigram probabilities as "key:value"
      # lines and bigram data as "key;value" lines
      while (<MATRIX>)
      {
         chomp;

         if (/:/)
         {
            (my $key, my $val) = split(/:/);
            $matrix->{$key} = $val;
         }
         elsif (/;/)
         {
            (my $key, my $val) = split(/;/);
            $bigrams{$key} = $val;
         }
      }

      push @matrices, $matrix;
      push @languages, $matrix->{'_LANG'};
      push @bigrams, \%bigrams;
      push @n_alph, $matrix->{'#ALPH'};

      close MATRIX;
   }

   $self->{MATRICES} = \@matrices;
   $self->{LANGUAGES} = \@languages;
   $self->{BIGRAMS} = \@bigrams;

   # Calculate the average alphabet size over all loaded language models
   my $s;
   map { $s += $_ } @n_alph;
   $self->{AVG_ALPH} = $s / @n_alph;

   return bless $self, $class;
}

=head2 $classifier->identify($string)

Identify the language of a text given in $string. The identify()
method returns the value specified in the B<_LANG> field of the
probabilities table of the language in which the text is most likely
written (see L<"WARNINGS"> below).

Internally, the identify() method calls the calculate() method.
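
For example, assuming a German model is among the loaded language
models:

  my $lang = $classifier->identify('Guten Tag, wie geht es Ihnen?');
  # $lang now holds the model's _LANG value, e.g., 'de.iso-8859-1'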

=cut
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
sub identify |
107
|
|
|
|
|
|
|
{ |
108
|
61
|
|
|
61
|
1
|
2237
|
my $self = shift; |
109
|
61
|
|
|
|
|
90
|
my $text = shift; |
110
|
|
|
|
|
|
|
|
111
|
61
|
|
|
|
|
69
|
return ${$self->calculate($text)}[0]->[0]; |
|
61
|
|
|
|
|
120
|
|
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head2 $classifier->calculate($string) |

Calculate the probabilities for a text to be in the languages known to
the classifier. This method returns a reference to an array. The
array represents a table of languages and the probability for each
language. Each array element is a reference to an array containing
two elements: the language name and the associated probability. For
example, you may get something like this:

  [['de.iso-8859-1', -317.980835274509],
   ['en.iso-8859-1', -450.804230119916], ...]

The elements are sorted in descending order by probability. You can
use this data to assess the reliability of the categorization and make
your own decision using application-specific metrics.
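
For example, a minimal reliability check might require a margin
between the two best candidates (the threshold of 5 is an arbitrary
illustration, not a recommended value):

  my ($best, $second) = @{$classifier->calculate($text)};
  if ($best->[1] - $second->[1] < 5) {
      warn "unreliable result: $best->[0] vs. $second->[0]\n";
  }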
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
When neither a trigram nor a bigram is found, the calculation deviates |
131
|
|
|
|
|
|
|
slightly from the formula given by Dunning (1994). According to |
132
|
|
|
|
|
|
|
Dunning's formula, one would estimate the probability as: |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
p = log(1/#alph) |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
where #alph is the size of the alphabet of a particular language. |
This penalizes different language models with different values because
the alphabet sizes of the languages differ.

However, the size of the alphabet is much larger for Asian languages
than for European languages. For example, for the sample data in the
Lingua::Ident distribution, trainlid(1) reports #alph = 127 for zh.big5
vs. #alph = 31 for de.iso-8859-1. This means that Asian languages are
penalized much harder than European languages when an estimation must
be made.
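
For instance, with natural logarithms this estimate comes to
log(1/127), about -4.84, for zh.big5, but only log(1/31), about
-3.43, for de.iso-8859-1, i.e., the Chinese model loses roughly 1.4
more than the German one for every such estimate.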

To use the I<same> penalty for all languages, calculate() now uses the
average of all alphabet sizes instead.

B<NOTE:> This has only been lightly tested so far; feedback is welcome.

=cut

sub calculate
{
   my $self = shift;
   my $text = shift;

   my @matrices = @{$self->{MATRICES}};
   my @bigrams = @{$self->{BIGRAMS}};
   my @prob = (0) x @matrices;
   my ($c, $i, @chars, $trigram);

   # for ($i = 0; $i <= $#matrices; $i++) {
   #    print "bigram3 size: " . keys(%{$bigrams[$i]}) . "\n";
   # }

   # Slide a window of three characters over the text
   foreach $c (split //, $text)
   {
      push @chars, $c;
      if (@chars == 3)
      {
         $trigram = lc(join("", @chars));
         # $trigram = join("", @chars);
         # $trigram =~ s/[\d\W]/ /og;

         # Map control characters, digits, and most ASCII punctuation
         # to spaces
         $trigram =~ s/[\x00-\x1f\x21-\x40\x7b-\x7f]/ /og;

         for ($i = 0; $i <= $#matrices; $i++)
         {
            if (exists $matrices[$i]->{$trigram})
            {
               $prob[$i] += log $matrices[$i]->{$trigram};
            }
            else
            {
               # $prob[$i] += log $matrices[$i]->{'_NULL'};

               # Unknown trigram: fall back to the bigram formed by
               # its first two characters
               if (exists $bigrams[$i]->{substr($trigram, 0, 2)})
               {
                  $prob[$i] +=
                     log (1 / $bigrams[$i]->{substr($trigram, 0, 2)});
               }
               else
               {
                  # When neither a trigram nor a bigram is found,
                  # according to Dunning's formula, we would now
                  # calculate:

                  # $prob[$i] += log (1 / $matrices[$i]->{'#ALPH'});

                  # Thus, we penalize different language models with
                  # different values because of the language's
                  # alphabet size.

                  # However, the size of the alphabet (#ALPH) for
                  # Asian languages is much larger than for European
                  # languages, e.g., with the sample data we get 127
                  # for zh.big5 vs. 31 for de.iso-8859-1. This means
                  # that these languages are penalized much harder
                  # than European languages. (This was pointed out by
                  # James Shaw.)

                  # To use the same penalty for all languages, we use
                  # the average of the alphabet sizes instead.

                  # NOTE: This has only been lightly tested so far.
                  $prob[$i] += log (1 / $self->{AVG_ALPH});
               }
            }
         }
         shift @chars;
      }
   }

   # Assemble the results into an array of arrays. Each array
   # contains two elements: the language name and the associated
   # probability, e.g., @results may look like this:

   # (['de.iso-8859-1', '-317.980835274509'],
   #  ['en.iso-8859-1', '-450.804230119916'], ...)

   my @results;

   for ($i = 0; $i < @{$self->{'LANGUAGES'}}; $i++)
   {
      push @results, [$self->{'LANGUAGES'}->[$i], $prob[$i]];
   }

   # Sort results in descending order by probability
   my @sorted = sort { $b->[1] <=> $a->[1] } @results;

   return \@sorted;
}
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 WARNINGS |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Since Lingua::Ident is based on statistics it cannot be 100% accurate. |
248
|
|
|
|
|
|
|
More precisely, Dunning (see below) reports his implementation to |
249
|
|
|
|
|
|
|
achieve 92% accuracy with 50 KB of training text for 20-character |
250
|
|
|
|
|
|
|
strings discriminating between English and Spanish. This |
251
|
|
|
|
|
|
|
implementation should be as accurate as Dunning's. However, not only |
252
|
|
|
|
|
|
|
the size but also the quality of the training text plays a role. |
253
|
|
|
|
|
|
|
|

The current implementation doesn't use a threshold to determine
whether the most probable language has a high enough probability; if
you try to classify a text in a language for which there is no
probability table, you will therefore get an incorrect language.

=head1 AUTHOR

Lingua::Ident was developed by Michael Piotrowski.

=head1 LICENSE

This program is free software; you may redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

Dunning, Ted (1994). I<Statistical Identification of Language>.
Technical report CRL MCCS-94-273. Computing Research Lab, New Mexico
State University.

=cut

1;