line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Email::Store::Language;
|
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
33566
|
use strict;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
45
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = '0.03';
|
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
6
|
use base qw( Email::Store::DBI );
|
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
877
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $OPTIONS = {};
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 NAME
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Email::Store::Language - Add language identification to emails and lists
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Remember to create the database table:
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
% make install
|
21
|
|
|
|
|
|
|
% perl -MEmail::Store="..." -e 'Email::Store->setup'
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
And now:
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
for( $mail->languages ) {
|
26
|
|
|
|
|
|
|
print $_->language . "\n";
|
27
|
|
|
|
|
|
|
}
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
or
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
for( $list->languages ) {
|
32
|
|
|
|
|
|
|
print $_->language . "\n";
|
33
|
|
|
|
|
|
|
}
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
This module will help you auto-identify the language of
|
38
|
|
|
|
|
|
|
your messages and lists. There are some options you can use
|
39
|
|
|
|
|
|
|
to help refine the process.
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 set_active_languages
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
This is a method from L that will let you
|
44
|
|
|
|
|
|
|
limit what languages your messages should be checked against.
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# limit to english and french
|
47
|
|
|
|
|
|
|
use Lingua::Identify qw( set_active_languages );
|
48
|
|
|
|
|
|
|
set_active_languages( qw( en fr ) );
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 $EMAIL::Store::Language::OPTIONS
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
This is a hashref of options that will be passed as the
|
53
|
|
|
|
|
|
|
first argument to C. There is one exception:
|
54
|
|
|
|
|
|
|
the C option. C should be a number
|
55
|
|
|
|
|
|
|
(percentage) between 0 and 1. The default is 0.5.
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
$Email::Store::Language::OPTIONS = { threshold => 0.35 };
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
In the above example, a threshold of 0.35 means that, for mail
|
60
|
|
|
|
|
|
|
language identification, if L claims to be 35%
|
61
|
|
|
|
|
|
|
sure that the message is a given language it will store that language.
|
62
|
|
|
|
|
|
|
If no languages are above the threshold, then the language of most
|
63
|
|
|
|
|
|
|
confidence will be used.
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
For list identification, it means that if 35% of the messages are
|
66
|
|
|
|
|
|
|
identified as being a given language, then it will store that language.
|
67
|
|
|
|
|
|
|
If no languages are above the threshold, then the language of most
|
68
|
|
|
|
|
|
|
confidence will be used.
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 SEE ALSO
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=over 4
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item * Email::Store
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=item * Lingua::Identify
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=back
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 AUTHOR
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=over 4
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=item * Brian Cassidy Ebricas@cpan.orgE
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=back
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Copyright 2006 by Brian Cassidy
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify
|
93
|
|
|
|
|
|
|
it under the same terms as Perl itself.
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=cut
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
package Email::Store::Mail::Language;
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
use base qw( Email::Store::DBI );
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
use strict;
|
102
|
|
|
|
|
|
|
use warnings;
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
use Email::Store::Mail;
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
__PACKAGE__->table( 'mail_language' );
|
107
|
|
|
|
|
|
|
__PACKAGE__->columns( All => qw( id mail language ) );
|
108
|
|
|
|
|
|
|
__PACKAGE__->columns( Primary => qw( id ) );
|
109
|
|
|
|
|
|
|
__PACKAGE__->has_a( mail => 'Email::Store::Mail' );
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Email::Store::Mail->has_many( languages => 'Email::Store::Mail::Language' );
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub on_store_order { 81 }
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub on_store {
|
116
|
|
|
|
|
|
|
my( $self, $mail ) = @_;
|
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
$mail->calculate_language;
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
for my $list ( $mail->lists ) {
|
121
|
|
|
|
|
|
|
my $probability = 1 / scalar( $list->posts );
|
122
|
|
|
|
|
|
|
$list->calculate_language if rand( 1 ) <= $probability;
|
123
|
|
|
|
|
|
|
}
|
124
|
|
|
|
|
|
|
}
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
package Email::Store::Mail;
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
use Lingua::Identify qw( langof );
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub calculate_language {
|
131
|
|
|
|
|
|
|
my $self = shift;
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
my %options = %{ $Email::Store::Language::OPTIONS };
|
134
|
|
|
|
|
|
|
my $thresh = delete $options{ threshold } || '0.5';
|
135
|
|
|
|
|
|
|
my %languages = langof( \%options, $self->simple->body );
|
136
|
|
|
|
|
|
|
my @langs = sort { $languages{ $b } <=> $languages{ $a } } keys %languages;
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
push @langs, 'en' unless @langs;
|
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
$_->delete for $self->languages;
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
my $count = 0;
|
143
|
|
|
|
|
|
|
for( keys %languages ) {
|
144
|
|
|
|
|
|
|
next unless $languages{ $_ } >= $thresh;
|
145
|
|
|
|
|
|
|
$count++;
|
146
|
|
|
|
|
|
|
$self->add_to_languages( { language => $_ } );
|
147
|
|
|
|
|
|
|
}
|
148
|
|
|
|
|
|
|
unless( $count ) {
|
149
|
|
|
|
|
|
|
$self->add_to_languages( { language => $langs[ 0 ] } );
|
150
|
|
|
|
|
|
|
}
|
151
|
|
|
|
|
|
|
}
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
package Email::Store::List;
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
use strict;
|
156
|
|
|
|
|
|
|
use warnings;
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub calculate_language {
|
159
|
|
|
|
|
|
|
my $self = shift;
|
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
my %languages;
|
162
|
|
|
|
|
|
|
my $total = 0;
|
163
|
|
|
|
|
|
|
for my $post ( $self->posts ) {
|
164
|
|
|
|
|
|
|
my @languages = $post->languages;
|
165
|
|
|
|
|
|
|
next unless @languages;
|
166
|
|
|
|
|
|
|
$languages{ $_->language }++ for @languages;
|
167
|
|
|
|
|
|
|
$total++;
|
168
|
|
|
|
|
|
|
}
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
$_->delete for $self->languages;
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
my $thresh = $Email::Store::Language::OPTIONS->{ threshold } || 0.5;
|
173
|
|
|
|
|
|
|
my @langs = sort { $languages{ $b } <=> $languages{ $a } } keys %languages;
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
my $count = 0;
|
176
|
|
|
|
|
|
|
for( @langs ) {
|
177
|
|
|
|
|
|
|
next unless $languages { $_ } / $total >= $thresh;
|
178
|
|
|
|
|
|
|
$count++;
|
179
|
|
|
|
|
|
|
$self->add_to_languages( { language => $_ } );
|
180
|
|
|
|
|
|
|
}
|
181
|
|
|
|
|
|
|
unless( $count ) {
|
182
|
|
|
|
|
|
|
$self->add_to_languages( { language => $langs[ 0 ] } );
|
183
|
|
|
|
|
|
|
}
|
184
|
|
|
|
|
|
|
}
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
package Email::Store::List::Language;
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
use strict;
|
189
|
|
|
|
|
|
|
use warnings;
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
use base 'Email::Store::DBI';
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
use Email::Store::List;
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
__PACKAGE__->table( 'list_language' );
|
196
|
|
|
|
|
|
|
__PACKAGE__->columns( All => qw( id list language ) );
|
197
|
|
|
|
|
|
|
__PACKAGE__->columns( Primary => qw( id ) );
|
198
|
|
|
|
|
|
|
__PACKAGE__->has_a( list => 'Email::Store::List' );
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Email::Store::List->has_many( languages => 'Email::Store::List::Language' );
|
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
package Email::Store::Language;
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
1;
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
__DATA__
|