| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package AI::Classifier::Text::FileLearner; |
|
2
|
|
|
|
|
|
|
{ |
|
3
|
|
|
|
|
|
|
$AI::Classifier::Text::FileLearner::VERSION = '0.03'; |
|
4
|
|
|
|
|
|
|
} |
|
5
|
1
|
|
|
1
|
|
25422
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
6
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
44
|
|
|
7
|
1
|
|
|
1
|
|
21
|
use 5.010; |
|
|
1
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
37
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
1510
|
use Moose; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use File::Find::Rule; |
|
11
|
|
|
|
|
|
|
use File::Spec; |
|
12
|
|
|
|
|
|
|
use List::Util 'max'; |
|
13
|
|
|
|
|
|
|
use Carp 'croak'; |
|
14
|
|
|
|
|
|
|
use AI::NaiveBayes::Learner; |
|
15
|
|
|
|
|
|
|
use AI::Classifier::Text; |
|
16
|
|
|
|
|
|
|
use AI::Classifier::Text::Analyzer; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Optional term-weighting scheme consulted by weight_terms(): 'n' or 'b'.
# Leaving it unset disables weighting; any other value makes weight_terms croak.
has term_weighting => (
    is  => 'ro',
    isa => 'Str',
);

# Object whose analyze() method turns file content into a feature hash.
has analyzer => (
    is      => 'ro',
    default => sub { AI::Classifier::Text::Analyzer->new() },
);

# Learner that collects the examples and later hands out the classifier.
has learner => (
    is      => 'ro',
    default => sub { AI::NaiveBayes::Learner->new() },
);

# Root directory of the training set (mandatory constructor argument).
has training_dir => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# File iterator over training_dir; built on demand by _build_iterator.
has iterator => (
    is         => 'ro',
    lazy_build => 1,
);
|
23
|
|
|
|
|
|
|
# Builder for the lazy 'iterator' attribute.
#
# Returns a File::Find::Rule object that has been started on training_dir and
# is restricted to plain files whose names do not match '*.data'.
sub _build_iterator {
    my ($self) = @_;

    my $finder = File::Find::Rule->new()->file->not_name('*.data');
    $finder->start( $self->training_dir );

    return $finder;
}
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Derive the category of a training file from its location.
#
# $file is made relative to training_dir; the first path component of that
# relative path is returned as the category name.
sub get_category {
    my ( $self, $file ) = @_;

    my $relative_path = File::Spec->abs2rel( $file, $self->training_dir );
    my ($category)    = File::Spec->splitdir($relative_path);

    return $category;
}
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Read the next training example from the directory iterator.
#
# Returns nothing once the iterator is exhausted. Otherwise returns a hashref:
#   file       => path of the training file, as produced by the iterator
#   features   => result of the analyzer's analyze() for the file content
#   categories => arrayref holding the single category from get_category()
#
# When a companion "<file>.data" file exists it is evaluated as Perl code and
# must return a hashref; its 'initial_features' entry (default: {}) is handed
# to the analyzer as seed features.
#
# Croaks if the training file cannot be opened or if the companion file
# cannot be loaded.
sub next {
    my ($self) = @_;

    my $file = $self->iterator->match;
    return if !defined $file;

    my $category = $self->get_category($file);

    open( my $fh, '<:encoding(UTF-8)', $file )
        or croak("Unable to read the specified training file: $file: $!\n");
    my $content = do { local $/; <$fh> };    # slurp the whole file
    close $fh;
    $content = '' if !defined $content;

    my $initial_features = {};
    my $data_file        = "$file.data";
    if ( -f $data_file ) {

        # NOTE(review): the companion file is executed as Perl code, so the
        # training directory must come from a trusted source.
        #
        # do FILE searches @INC for relative paths, and '.' is no longer in
        # @INC since Perl 5.26, so always hand it an absolute path.
        my $absolute_path = File::Spec->rel2abs($data_file);
        my $data          = do $absolute_path;

        if ( ref($data) ne 'HASH' ) {
            my $reason
                = defined $data
                ? 'it did not return a hash reference'
                : ( $@ || $! );
            croak("Unable to load training data file $data_file: $reason");
        }

        $initial_features = $data->{initial_features}
            if defined $data->{initial_features};
    }

    my $features = $self->analyzer->analyze( $content, $initial_features );

    return {
        file       => $file,
        features   => $features,
        categories => [$category],
    };
}
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Feed every training example to the learner.
#
# Pulls examples from next() until it returns a false value; each example's
# features are normalized, weighted via weight_terms() and then added to the
# learner together with the example's categories.
sub teach_it {
    my ($self) = @_;

    my $learner = $self->learner;

    EXAMPLE:
    while (1) {
        my $example = $self->next or last EXAMPLE;

        normalize( $example->{features} );
        $self->weight_terms($example);

        $learner->add_example(
            attributes => $example->{features},
            labels     => $example->{categories},
        );
    }
}
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Train on the whole training directory and return the resulting classifier.
#
# Runs teach_it(), then wraps the learner's classifier and this object's
# analyzer in a new AI::Classifier::Text instance.
sub classifier {
    my ($self) = @_;

    $self->teach_it;

    my %constructor_args = (
        classifier => $self->learner->classifier,
        analyzer   => $self->analyzer,
    );

    return AI::Classifier::Text->new(%constructor_args);
}
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# Apply the configured term weighting to a document's features, in place.
#
# $doc is a hashref whose 'features' entry maps terms to numeric weights.
# Behaviour depends on the term_weighting attribute:
#   undef - features are left untouched
#   'n'   - each value becomes 0.5 + 0.5 * value / (largest value)
#   'b'   - each value becomes 1 if true, 0 otherwise
# Any other value croaks with 'Unknown weighting type: ...'.
#
# Written with plain if/elsif because given/when (smartmatch) is deprecated.
sub weight_terms {
    my ( $self, $doc ) = @_;

    my $weighting = $self->term_weighting;
    return if !defined $weighting;    # no weighting requested

    my $f = $doc->{features};

    if ( $weighting eq 'n' ) {
        my $max_tf = max values %$f;
        for my $tf ( values %$f ) {

            # A zero maximum would divide by zero; such a vector only holds
            # zeros, so every term gets the 0.5 baseline.
            $tf = $max_tf ? 0.5 + 0.5 * $tf / $max_tf : 0.5;
        }
    }
    elsif ( $weighting eq 'b' ) {
        for my $tf ( values %$f ) {
            $tf = $tf ? 1 : 0;
        }
    }
    else {
        croak 'Unknown weighting type: ' . $weighting;
    }

    return;
}
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# this doesn't quite fit the current model (it requires the entire collection |
|
110
|
|
|
|
|
|
|
# of documents to be in memory at once), but it may be useful to someone, someday |
|
111
|
|
|
|
|
|
|
# so let's just leave it here |
|
112
|
|
|
|
|
|
|
# Scale every document's attribute weights by an inverse-document-frequency
# style factor, in place.
#
# Arguments: a list of document hashrefs (each with an 'attributes' hashref
# mapping terms to weights), optionally followed by a plain number
# $subtrahend (default 0).
#
# Each weight is multiplied by
#   log( num_docs / document_frequency(term) - subtrahend )
# where document_frequency is the number of documents containing the term.
# Perl's log() dies if that argument is not positive.
sub collection_weighting {
    my @documents = @_;

    # A list assignment to (@documents, $subtrahend) would let the array
    # swallow everything, so take a trailing non-reference argument off the
    # end explicitly.
    my $subtrahend;
    $subtrahend = pop @documents
        if @documents && !ref $documents[-1];
    $subtrahend //= 0;

    my $num_docs = scalar @documents;

    # Number of documents each term occurs in.
    my %frequency;
    for my $doc (@documents) {
        $frequency{$_}++ for keys %{ $doc->{attributes} };
    }

    for my $doc (@documents) {
        my $f = $doc->{attributes};
        for my $term ( keys %$f ) {

            # Every term seen here was counted above, so its frequency is >= 1.
            $f->{$term} *= log( $num_docs / $frequency{$term} - $subtrahend );
        }
    }

    return;
}
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Euclidean (L2) length of a feature vector.
#
# Takes a hashref of term => weight and returns the square root of the sum of
# the squared weights.
sub euclidean_length {
    my ($vector) = @_;

    my $sum_of_squares = 0;
    $sum_of_squares += $_ ** 2 for values %$vector;

    return sqrt $sum_of_squares;
}
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# Multiply every weight of a feature vector by a scalar, in place.
#
# Takes a hashref of term => weight and the factor; returns the same hashref.
sub scale {
    my ( $vector, $factor ) = @_;

    for my $weight ( values %$vector ) {
        $weight *= $factor;
    }

    return $vector;
}
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# Scale a feature vector to unit Euclidean length, in place.
#
# Takes a hashref of term => weight and returns it. A vector whose length is
# zero is returned unchanged.
sub normalize {
    my ($attrs) = @_;

    my $norm = euclidean_length($attrs);
    return $attrs if !$norm;

    return scale( $attrs, 1 / $norm );
}
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
1; |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=pod |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 NAME |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head1 VERSION |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
version 0.03 |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
use AI::Classifier::Text::FileLearner; |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' ); |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
my $classifier = $learner->classifier; |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
This is a trainer of text classifiers. It traverses the training directory, |
|
183
|
|
|
|
|
|
|
interprets the subdirectories in it as category names, reads all files in them and adds them |
|
184
|
|
|
|
|
|
|
as examples for the classifier being trained. |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 METHODS |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=over 4 |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=item next |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Internal method for traversing the training data directory. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=item classifier |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Returns a trained classifier. |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=back |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head1 AUTHOR |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com> |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
This software is copyright (c) 2012 by Opera Software ASA. |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
|
209
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=cut |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
__END__ |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# ABSTRACT: Training data reader for AI::NaiveBayes |
|
216
|
|
|
|
|
|
|
|