line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::YALI::Identifier; |
2
|
|
|
|
|
|
|
# ABSTRACT: Module for language identification with custom models. |
3
|
|
|
|
|
|
|
|
4
|
19
|
|
|
19
|
|
164708
|
use strict; |
|
19
|
|
|
|
|
28
|
|
|
19
|
|
|
|
|
580
|
|
5
|
19
|
|
|
19
|
|
71
|
use warnings; |
|
19
|
|
|
|
|
26
|
|
|
19
|
|
|
|
|
430
|
|
6
|
19
|
|
|
19
|
|
3805
|
use Moose; |
|
19
|
|
|
|
|
2755133
|
|
|
19
|
|
|
|
|
99
|
|
7
|
19
|
|
|
19
|
|
86836
|
use Carp; |
|
19
|
|
|
|
|
29
|
|
|
19
|
|
|
|
|
1297
|
|
8
|
19
|
|
|
19
|
|
9630
|
use PerlIO::gzip; |
|
19
|
|
|
|
|
10149
|
|
|
19
|
|
|
|
|
562
|
|
9
|
19
|
|
|
19
|
|
5729
|
use Lingua::YALI; |
|
19
|
|
|
|
|
98
|
|
|
19
|
|
|
|
|
8949
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.014_01'; # VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# hash with paths to models |
14
|
|
|
|
|
|
|
# format: { 'class1' => 'file1', 'class2' => 'file2' } |
15
|
|
|
|
|
|
|
has '_model_file' => ( |
16
|
|
|
|
|
|
|
is => 'rw', |
17
|
|
|
|
|
|
|
isa => 'HashRef' |
18
|
|
|
|
|
|
|
); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# hash with n-gram frequencies retrieved from models |
21
|
|
|
|
|
|
|
# format: { 'ngram1' => { 'class1' => 0.5, 'class2' => 0.3}, 'ngram2' => ...} |
22
|
|
|
|
|
|
|
has '_frequency' => ( |
23
|
|
|
|
|
|
|
is => 'rw', |
24
|
|
|
|
|
|
|
isa => 'HashRef' |
25
|
|
|
|
|
|
|
); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# n-gram size |
28
|
|
|
|
|
|
|
has '_ngram' => ( |
29
|
|
|
|
|
|
|
is => 'rw', |
30
|
|
|
|
|
|
|
isa => 'Int' |
31
|
|
|
|
|
|
|
); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# list of identified classes |
34
|
|
|
|
|
|
|
has '_classes' => ( |
35
|
|
|
|
|
|
|
is => 'rw', |
36
|
|
|
|
|
|
|
isa => 'ArrayRef' |
37
|
|
|
|
|
|
|
); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub BUILD |
42
|
|
|
|
|
|
|
{ |
43
|
16
|
|
|
16
|
1
|
20074
|
my $self = shift; |
44
|
16
|
|
|
|
|
43
|
my %frequency = (); |
45
|
16
|
|
|
|
|
37
|
my @classes = (); |
46
|
16
|
|
|
|
|
60
|
$self->{_frequency} = \%frequency; |
47
|
16
|
|
|
|
|
32
|
$self->{_classes} = \@classes; |
48
|
|
|
|
|
|
|
|
49
|
16
|
|
|
|
|
45
|
return; |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub add_class |
54
|
|
|
|
|
|
|
{ |
55
|
45
|
|
|
45
|
1
|
5216
|
my ( $self, $class, $file ) = @_; |
56
|
|
|
|
|
|
|
|
57
|
45
|
100
|
|
|
|
159
|
if ( defined( $self->{_model_file}->{$class} ) ) { |
58
|
5
|
|
|
|
|
15
|
return 0; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# parameter check |
62
|
40
|
100
|
|
|
|
112
|
if ( ! defined($file) ) { |
63
|
1
|
|
|
|
|
17
|
croak("Model has to be specified."); |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
39
|
100
|
|
|
|
1233
|
if ( ! -r $file ) { |
67
|
1
|
|
|
|
|
15
|
croak("Model $file is not readable."); |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
38
|
|
|
|
|
240
|
$self->_load_model($class, $file); |
71
|
|
|
|
|
|
|
|
72
|
37
|
|
|
|
|
224
|
return 1; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub remove_class |
77
|
|
|
|
|
|
|
{ |
78
|
17
|
|
|
17
|
1
|
4027
|
my ( $self, $class ) = @_; |
79
|
|
|
|
|
|
|
|
80
|
17
|
100
|
|
|
|
53
|
if ( defined( $self->{_model_file}->{$class} ) ) { |
81
|
11
|
|
|
|
|
41
|
$self->_unload_model($class); |
82
|
|
|
|
|
|
|
|
83
|
11
|
|
|
|
|
38
|
return 1; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
6
|
|
|
|
|
18
|
return 0; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub get_classes |
91
|
|
|
|
|
|
|
{ |
92
|
42
|
|
|
42
|
1
|
90
|
my $self = shift; |
93
|
42
|
|
|
|
|
147
|
return $self->{_classes}; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub identify_file |
98
|
|
|
|
|
|
|
{ |
99
|
10
|
|
|
10
|
1
|
1814
|
my ( $self, $file ) = @_; |
100
|
|
|
|
|
|
|
|
101
|
10
|
50
|
|
|
|
28
|
if ( ! defined($file) ) { |
102
|
0
|
|
|
|
|
0
|
return; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
10
|
|
|
|
|
44
|
my $fh = Lingua::YALI::_open($file); |
106
|
|
|
|
|
|
|
|
107
|
10
|
|
|
|
|
72
|
return $self->identify_handle($fh); |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub identify_string |
112
|
|
|
|
|
|
|
{ |
113
|
5
|
|
|
5
|
1
|
3349
|
my ( $self, $string ) = @_; |
114
|
5
|
50
|
|
3
|
|
190
|
open(my $fh, "<", \$string) or croak $!; |
|
3
|
|
|
|
|
23
|
|
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
19
|
|
115
|
|
|
|
|
|
|
|
116
|
5
|
50
|
|
|
|
2862
|
if ( ! defined($string) ) { |
117
|
0
|
|
|
|
|
0
|
return; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
5
|
|
|
|
|
33
|
my $result = $self->identify_handle($fh); |
121
|
|
|
|
|
|
|
|
122
|
5
|
|
|
|
|
52
|
close($fh); |
123
|
|
|
|
|
|
|
|
124
|
5
|
|
|
|
|
39
|
return $result; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub identify_handle |
129
|
|
|
|
|
|
|
{ |
130
|
19
|
|
|
19
|
1
|
1493
|
my ($self, $fh) = @_; |
131
|
19
|
|
|
|
|
44
|
my %actRes = (); |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# parameter check |
134
|
19
|
|
|
|
|
76
|
my $ngram = $self->{_ngram}; |
135
|
19
|
100
|
|
|
|
68
|
if ( ! defined($ngram) ) { |
136
|
1
|
|
|
|
|
36
|
croak("At least one class must be specified."); |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
18
|
50
|
|
|
|
104
|
if ( ! defined($fh) ) { |
|
|
50
|
|
|
|
|
|
140
|
0
|
|
|
|
|
0
|
return; |
141
|
|
|
|
|
|
|
} elsif ( ref $fh ne "GLOB" ) { |
142
|
0
|
|
|
|
|
0
|
croak("Expected file handler but " . (ref $fh) . " was used."); |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# read input file |
146
|
18
|
|
|
|
|
193
|
while ( <$fh> ) { |
147
|
93
|
|
|
|
|
131
|
chomp; |
148
|
93
|
|
|
|
|
2163
|
s/ +/ /g; |
149
|
93
|
|
|
|
|
108
|
s/^ +//g; |
150
|
93
|
|
|
|
|
620
|
s/ +$//g; |
151
|
93
|
50
|
|
|
|
165
|
if ( ! $_ ) { |
152
|
0
|
|
|
|
|
0
|
next; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# $_ = $padding . $_ . $padding; |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
{ |
158
|
19
|
|
|
19
|
|
9192
|
use bytes; |
|
19
|
|
|
|
|
150
|
|
|
19
|
|
|
|
|
72
|
|
|
93
|
|
|
|
|
91
|
|
159
|
93
|
|
|
|
|
290
|
for my $i (0 .. bytes::length($_) - $ngram) { |
160
|
36783
|
|
|
|
|
31306
|
my $w = substr($_, $i, $ngram); |
161
|
|
|
|
|
|
|
|
162
|
36783
|
100
|
|
|
|
48775
|
if ( defined($self->{_frequency}->{$w}) ) { |
163
|
18277
|
|
|
|
|
10184
|
for my $lang (keys %{$self->{_frequency}->{$w}}) { |
|
18277
|
|
|
|
|
21257
|
|
164
|
|
|
|
|
|
|
# print STDERR "$w - $lang - $frequency{$w}{$lang}\n"; |
165
|
19304
|
|
|
|
|
22091
|
$actRes{$lang} += $self->{_frequency}->{$w}{$lang}; |
166
|
|
|
|
|
|
|
# print STDERR "Lang: $lang - $actRes{$lang}\n"; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# sum scores of all classifiers |
174
|
18
|
|
|
|
|
35
|
my @allLanguages = @ { $self->get_classes() }; |
|
18
|
|
|
|
|
147
|
|
175
|
|
|
|
|
|
|
|
176
|
18
|
|
|
|
|
37
|
my $sum = 0; |
177
|
18
|
|
|
|
|
37
|
for my $l (@allLanguages) { |
178
|
33
|
|
|
|
|
34
|
my $score = 0; |
179
|
33
|
100
|
|
|
|
83
|
if ( defined($actRes{$l}) ) { |
180
|
27
|
|
|
|
|
37
|
$score = $actRes{$l}; |
181
|
|
|
|
|
|
|
} |
182
|
33
|
|
|
|
|
54
|
$sum += $score; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# normalize results |
186
|
18
|
|
|
|
|
31
|
my @res = (); |
187
|
|
|
|
|
|
|
|
188
|
18
|
|
|
|
|
34
|
for my $l (@allLanguages) { |
189
|
33
|
|
|
|
|
38
|
my $score = 0; |
190
|
33
|
100
|
|
|
|
78
|
if ( defined($actRes{$l}) ) { |
191
|
27
|
|
|
|
|
118
|
$score = $actRes{$l} / $sum; |
192
|
|
|
|
|
|
|
} |
193
|
33
|
|
|
|
|
71
|
my @pair = ($l, $score); |
194
|
33
|
|
|
|
|
125
|
push(@res, \@pair); |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# sort according to score |
198
|
18
|
|
|
|
|
135
|
my @sortedRes = sort { $b->[1] <=> $a->[1] } @res; |
|
15
|
|
|
|
|
95
|
|
199
|
|
|
|
|
|
|
|
200
|
18
|
|
|
|
|
203
|
return \@sortedRes; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# recompute classes after manipulation with classes |
204
|
|
|
|
|
|
|
sub _compute_classes |
205
|
|
|
|
|
|
|
{ |
206
|
48
|
|
|
48
|
|
83
|
my $self = shift; |
207
|
48
|
|
|
|
|
59
|
my @classes = keys %{ $self->{_model_file} }; |
|
48
|
|
|
|
|
238
|
|
208
|
|
|
|
|
|
|
|
209
|
48
|
|
|
|
|
110
|
$self->{_classes} = \@classes; |
210
|
|
|
|
|
|
|
|
211
|
48
|
|
|
|
|
132
|
return; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# load model |
215
|
|
|
|
|
|
|
sub _load_model |
216
|
|
|
|
|
|
|
{ |
217
|
38
|
|
|
38
|
|
66
|
my ($self, $class, $file) = @_; |
218
|
|
|
|
|
|
|
|
219
|
38
|
50
|
|
|
|
194
|
if ( $self->{_model_file}->{$class} ) { |
220
|
0
|
|
|
|
|
0
|
return; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
38
|
50
|
|
|
|
2172
|
open(my $fh, "<:gzip:bytes", $file) or croak($!); |
224
|
38
|
|
|
|
|
2416
|
my $ngram = <$fh>; |
225
|
38
|
|
|
|
|
73
|
my $total_line = <$fh>; |
226
|
|
|
|
|
|
|
|
227
|
38
|
100
|
|
|
|
145
|
if ( ! defined($self->{_ngram}) ) { |
228
|
18
|
|
|
|
|
51
|
$self->{_ngram} = $ngram; |
229
|
|
|
|
|
|
|
} else { |
230
|
20
|
100
|
|
|
|
103
|
if ( $ngram != $self->{_ngram} ) { |
231
|
1
|
|
|
|
|
22
|
croak("Incompatible model for '$class'. Expected $self->{_ngram}-grams, but was $ngram-gram."); |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
37
|
|
|
|
|
64
|
my $sum = 0; |
236
|
37
|
|
|
|
|
127
|
while ( <$fh> ) { |
237
|
28825
|
|
|
|
|
17409
|
chomp; |
238
|
28825
|
|
|
|
|
27100
|
my @p = split(/\t/, $_); |
239
|
28825
|
|
|
|
|
18065
|
my $word = $p[0]; |
240
|
28825
|
|
|
|
|
48681
|
$self->{_frequency}->{$word}{$class} = $p[1]; |
241
|
28825
|
|
|
|
|
51516
|
$sum += $p[1]; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
37
|
|
|
|
|
57
|
for my $word (keys %{$self->{_frequency}}) { |
|
37
|
|
|
|
|
7816
|
|
245
|
46079
|
100
|
|
|
|
61305
|
if ( defined($self->{_frequency}->{$word}{$class}) ) { |
246
|
28828
|
|
|
|
|
29500
|
$self->{_frequency}->{$word}{$class} /= $sum; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
37
|
|
|
|
|
3472
|
close($fh); |
251
|
|
|
|
|
|
|
|
252
|
37
|
|
|
|
|
172
|
$self->{_model_file}->{$class} = $file; |
253
|
37
|
|
|
|
|
281
|
$self->_compute_classes(); |
254
|
|
|
|
|
|
|
|
255
|
37
|
|
|
|
|
155
|
return; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# unload model |
259
|
|
|
|
|
|
|
sub _unload_model |
260
|
|
|
|
|
|
|
{ |
261
|
11
|
|
|
11
|
|
17
|
my ($self, $class) = @_; |
262
|
|
|
|
|
|
|
|
263
|
11
|
50
|
|
|
|
32
|
if ( ! $self->{_model_file}->{$class} ) { |
264
|
0
|
|
|
|
|
0
|
return; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
11
|
|
|
|
|
22
|
delete( $self->{_model_file}->{$class} ); |
268
|
11
|
|
|
|
|
27
|
$self->_compute_classes(); |
269
|
|
|
|
|
|
|
|
270
|
11
|
|
|
|
|
27
|
my $classes = $self->get_classes(); |
271
|
|
|
|
|
|
|
# print STDERR "\nX=removing $class\n" . (join("\t", @$classes)) . "\n" . (scalar @$classes) . "\nX\n"; |
272
|
11
|
100
|
|
|
|
36
|
if ( scalar @$classes == 0 ) { |
273
|
4
|
|
|
|
|
11
|
delete($self->{_ngram}); |
274
|
4
|
|
|
|
|
7
|
$self->{_ngram} = undef; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
280
|
11
|
|
|
|
|
16
|
return; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
1; |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
__END__ |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=pod |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=encoding UTF-8 |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head1 NAME |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Lingua::YALI::Identifier - Module for language identification with custom models. |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=head1 VERSION |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
version 0.014_01 |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=head1 SYNOPSIS |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
This modul identify languages with moduls provided by the user. If you want to use pretrained models use L<Lingua::YALI::LanguageIdentifier|Lingua::YALI::LanguageIdentifier>. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Models trained on texts from specific domain outperforms the general ones. |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
use Lingua::YALI::Builder; |
307
|
|
|
|
|
|
|
use Lingua::YALI::Identifier; |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# create models |
310
|
|
|
|
|
|
|
my $builder_a = Lingua::YALI::Builder->new(ngrams=>[2]); |
311
|
|
|
|
|
|
|
$builder_a->train_string("aaaaa aaaa aaa aaa aaa aaaaa aa"); |
312
|
|
|
|
|
|
|
$builder_a->store("model_a.2_all.gz", 2); |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
my $builder_b = Lingua::YALI::Builder->new(ngrams=>[2]); |
315
|
|
|
|
|
|
|
$builder_b->train_string("bbbbbb bbbb bbbb bbb bbbb bbbb bbb"); |
316
|
|
|
|
|
|
|
$builder_b->store("model_b.2_all.gz", 2); |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# create identifier and load models |
319
|
|
|
|
|
|
|
my $identifier = Lingua::YALI::Identifier->new(); |
320
|
|
|
|
|
|
|
$identifier->add_class("a", "model_a.2_all.gz"); |
321
|
|
|
|
|
|
|
$identifier->add_class("b", "model_b.2_all.gz"); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# identify strings |
324
|
|
|
|
|
|
|
my $result1 = $identifier->identify_string("aaaaaaaaaaaaaaaaaaa"); |
325
|
|
|
|
|
|
|
print $result1->[0]->[0] . "\t" . $result1->[0]->[1]; |
326
|
|
|
|
|
|
|
# prints out a 1 |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
my $result2 = $identifier->identify_string("bbbbbbbbbbbbbbbbbbb"); |
329
|
|
|
|
|
|
|
print $result2->[0]->[0] . "\t" . $result2->[0]->[1]; |
330
|
|
|
|
|
|
|
# prints out b 1 |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
More examples is presented in L<Lingua::YALI::Examples|Lingua::YALI::Examples>. |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=head1 METHODS |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=head2 BUILD |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
Initializes internal variables. |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# create identifier |
341
|
|
|
|
|
|
|
my $identifier = Lingua::YALI::Identifier->new(); |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 add_class |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
$added = $identifier->add_class($class, $model) |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Adds model stored in file C<$model> with class C<$class> and |
348
|
|
|
|
|
|
|
returns whether it was added or not. |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
print $identifier->add_class("a", "model.a1.gz") . "\n"; |
351
|
|
|
|
|
|
|
# prints out 1 |
352
|
|
|
|
|
|
|
print $identifier->add_class("a", "model.a2.gz") . "\n"; |
353
|
|
|
|
|
|
|
# prints out 0 - class a was already added |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head2 remove_class |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
my $removed = $identifier->remove_class($class); |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
Removes model for class C<$class>. |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
$identifier->add_class("a", "model.a1.gz"); |
362
|
|
|
|
|
|
|
print $identifier->remove_class("a") . "\n"; |
363
|
|
|
|
|
|
|
# prints out 1 |
364
|
|
|
|
|
|
|
print $identifier->remove_class("a") . "\n"; |
365
|
|
|
|
|
|
|
# prints out 0 - class a was already removed |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=head2 get_classes |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
my \@classes = $identifier->get_classes(); |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
Returns all registered classes. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=head2 identify_file |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
my $result = $identifier->identify_file($file) |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
Identifies class for file C<$file>. |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
=over |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=item * It returns undef if C<$file> is undef. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=item * It croaks if the file C<$file> does not exist or is not readable. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=item * Otherwise look for more details at method L</identify_handle>. |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=back |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=head2 identify_string |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
my $result = $identifier->identify_string($string) |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
Identifies class for string C<$string>. |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=over |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=item * It returns undef if C<$string> is undef. |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=item * Otherwise look for more details at method L</identify_handle>. |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=back |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head2 identify_handle |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
my $result = $identifier->identify_handle($fh) |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Identifies class for file handle C<$fh> and returns: |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=over |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=item * It returns undef if C<$fh> is undef. |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=item * It croaks if the C<$fh> is not file handle. |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=item * It returns array reference in format [ ['class1', score1], ['class2', score2], ...] sorted |
416
|
|
|
|
|
|
|
according to score descendently, so the most probable class is the first. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=back |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head1 SEE ALSO |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=over |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item * Identifier with pretrained models for language identification is L<Lingua::YALI::LanguageIdentifier|Lingua::YALI::LanguageIdentifier>. |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=item * Builder for these models is L<Lingua::YALI::Builder|Lingua::YALI::Builder>. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item * There is also command line tool L<yali-identifier|Lingua::YALI::yali-identifier> with similar functionality. |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item * Source codes are available at L<https://github.com/martin-majlis/YALI>. |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=back |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=head1 AUTHOR |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Martin Majlis <martin@majlis.cz> |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
This software is Copyright (c) 2012 by Martin Majlis. |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
This is free software, licensed under: |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
The (three-clause) BSD License |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 AUTHOR |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Martin Majlis <martin@majlis.cz> |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
This software is Copyright (c) 2012 by Martin Majlis. |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
This is free software, licensed under: |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
The (three-clause) BSD License |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |