# Time-stamp: <2010-05-14T16:53:47 mxp>
# Copyright © 2000 Michael Piotrowski. All Rights Reserved.

=head1 NAME

Lingua::Ident - Statistical language identification

=head1 SYNOPSIS

  use Lingua::Ident;
  $classifier = new Lingua::Ident("filename 1", ..., "filename n");
  $lang = $classifier->identify("text to classify");
  $probabilities = $classifier->calculate("text to classify");

=head1 DESCRIPTION

This module implements a statistical language identifier based on the
approach Ted Dunning described in his 1994 report I<Statistical
Identification of Language>.
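
(The identification works on character trigram statistics: for
example, the string C<"the "> contains the trigrams C<"the"> and
C<"he ">, and the relative frequencies of such trigrams differ
characteristically between languages.)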

=head1 METHODS

=cut

###############################################################################

package Lingua::Ident;

$VERSION = '1.7';

use Carp;
use bytes;
use strict;

=head2 Lingua::Ident->new($filename, ...)

Construct a new classifier. The filename arguments to the constructor
must refer to files containing tables of n-gram probabilities for
languages (language models). These tables can be generated using the
trainlid(1) utility program.
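
For example, assuming two model files created with trainlid(1) (the
file names below are placeholders):

  use Lingua::Ident;

  my $classifier = Lingua::Ident->new('de.iso-8859-1', 'en.iso-8859-1');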

=cut

sub new
{
   my $class = shift;
   my @files = @_;
   my $self = {};

   my ($filename, $matrix, @matrices, @languages, %bigrams, @bigrams, @n_alph);

   foreach $filename (@files)
   {
      open MATRIX, "<$filename" or croak "$!";

      $matrix = {};
      my %bigrams = ();

      # Model files contain trigram probabilities as "key:value"
      # lines and bigram data as "key;value" lines
      while (<MATRIX>)
      {
         chomp;

         if (/:/)
         {
            (my $key, my $val) = split(/:/);
            $matrix->{$key} = $val;
         }
         elsif (/;/)
         {
            (my $key, my $val) = split(/;/);
            $bigrams{$key} = $val;
         }
      }

      push @matrices, $matrix;
      push @languages, $matrix->{'_LANG'};
      push @bigrams, \%bigrams;
      push @n_alph, $matrix->{'#ALPH'};

      close MATRIX;
   }

   $self->{MATRICES} = \@matrices;
   $self->{LANGUAGES} = \@languages;
   $self->{BIGRAMS} = \@bigrams;

   # Calculate the average alphabet size over all loaded language models
   my $s;
   map { $s += $_ } @n_alph;
   $self->{AVG_ALPH} = $s / @n_alph;

   return bless $self, $class;
}

=head2 $classifier->identify($string)

Identify the language of a text given in $string. The identify()
method returns the value specified in the B<_LANG> field of the
probabilities table of the language in which the text is most likely
written (see L<"WARNINGS"> below).

Internally, the identify() method calls the calculate() method.
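
For example, assuming a German model is among the loaded language
models:

  my $lang = $classifier->identify('Guten Tag, wie geht es Ihnen?');
  # $lang now holds the model's _LANG value, e.g., 'de.iso-8859-1'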

=cut
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
sub identify |
107
|
|
|
|
|
|
|
{ |
108
|
61
|
|
|
61
|
1
|
2237
|
my $self = shift; |
109
|
61
|
|
|
|
|
90
|
my $text = shift; |
110
|
|
|
|
|
|
|
|
111
|
61
|
|
|
|
|
69
|
return ${$self->calculate($text)}[0]->[0]; |
|
61
|
|
|
|
|
120
|
|
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head2 $classifier->calculate($string) |

Calculate the probabilities for a text to be in the languages known to
the classifier. This method returns a reference to an array. The
array represents a table of languages and the probability for each
language. Each array element is a reference to an array containing
two elements: the language name and the associated probability. For
example, you may get something like this:

  [['de.iso-8859-1', -317.980835274509],
   ['en.iso-8859-1', -450.804230119916], ...]

The elements are sorted in descending order by probability. You can
use this data to assess the reliability of the categorization and make
your own decision using application-specific metrics.
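
For example, a minimal reliability check might require a margin
between the two best candidates (the threshold of 5 is an arbitrary
illustration, not a recommended value):

  my ($best, $second) = @{$classifier->calculate($text)};
  if ($best->[1] - $second->[1] < 5) {
      warn "unreliable result: $best->[0] vs. $second->[0]\n";
  }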
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
When neither a trigram nor a bigram is found, the calculation deviates |
131
|
|
|
|
|
|
|
slightly from the formula given by Dunning (1994). According to |
132
|
|
|
|
|
|
|
Dunning's formula, one would estimate the probability as: |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
p = log(1/#alph) |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
where #alph is the size of the alphabet of a particular language. |
This penalizes different language models with different values because
the alphabet sizes of the languages differ.

However, the size of the alphabet is much larger for Asian languages
than for European languages. For example, for the sample data in the
Lingua::Ident distribution, trainlid(1) reports #alph = 127 for zh.big5
vs. #alph = 31 for de.iso-8859-1. This means that Asian languages are
penalized much harder than European languages when an estimation must
be made.
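
For instance, with natural logarithms this estimate comes to
log(1/127), about -4.84, for zh.big5, but only log(1/31), about
-3.43, for de.iso-8859-1, i.e., the Chinese model loses roughly 1.4
more than the German one for every such estimate.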

To use the I<same> penalty for all languages, calculate() now uses the
average of all alphabet sizes instead.

B<NOTE:> This has only been lightly tested so far; feedback is welcome.

=cut

sub calculate
{
   my $self = shift;
   my $text = shift;

   my @matrices = @{$self->{MATRICES}};
   my @bigrams = @{$self->{BIGRAMS}};
   my @prob = (0) x @matrices;
   my ($c, $i, @chars, $trigram);

   # for ($i = 0; $i <= $#matrices; $i++) {
   #    print "bigram3 size: " . keys(%{$bigrams[$i]}) . "\n";
   # }

   # Slide a window of three characters over the text
   foreach $c (split //, $text)
   {
      push @chars, $c;
      if (@chars == 3)
      {
         $trigram = lc(join("", @chars));
         # $trigram = join("", @chars);
         # $trigram =~ s/[\d\W]/ /og;

         # Map control characters, digits, and most ASCII punctuation
         # to spaces
         $trigram =~ s/[\x00-\x1f\x21-\x40\x7b-\x7f]/ /og;

         for ($i = 0; $i <= $#matrices; $i++)
         {
            if (exists $matrices[$i]->{$trigram})
            {
               $prob[$i] += log $matrices[$i]->{$trigram};
            }
            else
            {
               # $prob[$i] += log $matrices[$i]->{'_NULL'};

               # Unknown trigram: fall back to the bigram formed by
               # its first two characters
               if (exists $bigrams[$i]->{substr($trigram, 0, 2)})
               {
                  $prob[$i] +=
                     log (1 / $bigrams[$i]->{substr($trigram, 0, 2)});
               }
               else
               {
                  # When neither a trigram nor a bigram is found,
                  # according to Dunning's formula, we would now
                  # calculate:

                  # $prob[$i] += log (1 / $matrices[$i]->{'#ALPH'});

                  # Thus, we penalize different language models with
                  # different values because of the language's
                  # alphabet size.

                  # However, the size of the alphabet (#ALPH) for
                  # Asian languages is much larger than for European
                  # languages, e.g., with the sample data we get 127
                  # for zh.big5 vs. 31 for de.iso-8859-1. This means
                  # that these languages are penalized much harder
                  # than European languages. (This was pointed out by
                  # James Shaw.)

                  # To use the same penalty for all languages, we use
                  # the average of the alphabet sizes instead.

                  # NOTE: This has only been lightly tested so far.
                  $prob[$i] += log (1 / $self->{AVG_ALPH});
               }
            }
         }
         shift @chars;
      }
   }

   # Assemble the results into an array of arrays. Each array
   # contains two elements: the language name and the associated
   # probability, e.g., @results may look like this:

   # (['de.iso-8859-1', '-317.980835274509'],
   #  ['en.iso-8859-1', '-450.804230119916'], ...)

   my @results;

   for ($i = 0; $i < @{$self->{'LANGUAGES'}}; $i++)
   {
      push @results, [$self->{'LANGUAGES'}->[$i], $prob[$i]];
   }

   # Sort results in descending order by probability
   my @sorted = sort { $b->[1] <=> $a->[1] } @results;

   return \@sorted;
}
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 WARNINGS |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Since Lingua::Ident is based on statistics it cannot be 100% accurate. |
248
|
|
|
|
|
|
|
More precisely, Dunning (see below) reports his implementation to |
249
|
|
|
|
|
|
|
achieve 92% accuracy with 50 KB of training text for 20-character |
250
|
|
|
|
|
|
|
strings discriminating between English and Spanish. This |
251
|
|
|
|
|
|
|
implementation should be as accurate as Dunning's. However, not only |
252
|
|
|
|
|
|
|
the size but also the quality of the training text plays a role. |
253
|
|
|
|
|
|
|
|

The current implementation doesn't use a threshold to determine
whether the most probable language has a high enough probability; if
you try to classify a text in a language for which there is no
probability table, you will therefore get an incorrect language.

=head1 AUTHOR

Lingua::Ident was developed by Michael Piotrowski.

=head1 LICENSE

This program is free software; you may redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

Dunning, Ted (1994). I<Statistical Identification of Language>.
Technical report CRL MCCS-94-273. Computing Research Lab, New Mexico
State University.

=cut

1;