| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Data::Classifier::NaiveBayes; |
|
2
|
2
|
|
|
2
|
|
79902
|
use Moose; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
use MooseX::Types::LoadableClass qw(LoadableClass); |
|
4
|
|
|
|
|
|
|
use List::Util qw(reduce sum); |
|
5
|
|
|
|
|
|
|
use 5.008008; |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Per-category counters: { $category => $count }. train() bumps the
# counter of the trained category once per call (via _inc_cat).
has categories => (
    is      => 'rw',
    default => sub { return {} },
);

# Need to implement
# Per-category multipliers consulted by classify(); a missing or false
# entry is treated there as 1.0.
has thresholds => (
    is      => 'rw',
    default => sub { return {} },
);

# Tokenizer object. lazy_build defers construction to _build_tokenizer
# until the attribute is first read.
has tokenizer => (
    is         => 'rw',
    lazy_build => 1,
);

# Name of the class that _build_tokenizer instantiates.
has tokenizer_class => (
    is      => 'ro',
    isa     => LoadableClass,
    default => 'Data::Classifier::NaiveBayes::Tokenizer',
    coerce  => 1,
);

# Word occurrence counters: { $word => { $category => $count } },
# maintained by _inc_word.
has words => (
    is      => 'rw',
    default => sub { return {} },
);
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Builder for the lazily-built tokenizer attribute: instantiates the
# class named by tokenizer_class, passing no constructor arguments.
sub _build_tokenizer {
    my ($self) = @_;
    my $class = $self->tokenizer_class;
    return $class->new;
}
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Looks up the counter stored for $category in the categories hash.
# Returns undef when the category has no entry.
sub _cat_count {
    my ($self, $category) = @_;
    my $count_for = $self->categories;
    return $count_for->{$category};
}
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Scores $text against every known category.
#
# Returns a list of [ $category, $score ] pairs sorted by ascending
# score, where each score comes from _text_prop.
sub _cat_scores {
    my ($self, $text) = @_;

    # One score per category currently present in the categories hash.
    my %score_for = map { ( $_ => scalar $self->_text_prop($_, $text) ) }
        keys %{ $self->categories };

    return sort { $a->[1] <=> $b->[1] }
        map { [ $_, $score_for{$_} ] } keys %score_for;
}
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Multiplies together the weighted-average probability of every token
# of $text for category $cat.
#
# The tokenizer is handed a callback that maps one word to its weight;
# whatever tokenizer->words returns is dereferenced as an array.
# NOTE(review): presumably that array holds the callback's return
# values, one per token -- confirm against the tokenizer class.
#
# Returns undef when the array is empty (List::Util::reduce on an empty
# list).
sub _doc_prob {
    my ($self, $text, $cat) = @_;

    my $weigh_word = sub {
        my ($word) = @_;
        return $self->_word_weighted_average($word, $cat);
    };
    my $weights = $self->tokenizer->words($text, $weigh_word);

    return reduce { $a * $b } @{$weights};
}
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Adds one to the counter of category $cat, starting from zero when the
# category has no (or a false) counter yet. Returns the new count.
sub _inc_cat {
    my ($self, $cat) = @_;
    my $count_for = $self->categories;
    return $count_for->{$cat} = ( $count_for->{$cat} || 0 ) + 1;
}
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Adds one to the counter of $word within category $cat, creating the
# per-word hash and the counter on first sight. Returns the new count.
sub _inc_word {
    my ($self, $word, $cat) = @_;
    my $count_for = ( $self->words->{$word} ||= {} );
    return $count_for->{$cat} = ( $count_for->{$cat} || 0 ) + 1;
}
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Score of $text for category $cat: the category's share of all counted
# categories (_cat_count / _total_count) multiplied by the document
# probability from _doc_prob.
sub _text_prop {
    my ($self, $cat, $text) = @_;

    my $prior      = $self->_cat_count($cat) / $self->_total_count;
    my $likelihood = $self->_doc_prob($text, $cat);

    return $prior * $likelihood;
}
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# Sum of all category counters.
#
# The sum is seeded with 0 so that an empty categories hash yields 0
# instead of the undef that List::Util::sum returns for an empty list.
sub _total_count {
    my ($self) = @_;
    return sum( 0, values %{ $self->categories } );
}
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Number of times $word has been counted for $category.
#
# Returns 0 when there is no (or a false) counter, otherwise the counter
# formatted with two decimal places ("%.2f").
sub _word_count {
    my ($self, $word, $category) = @_;

    my $count_for = $self->words->{$word};
    my $seen      = $count_for && $count_for->{$category};

    return $seen ? sprintf( "%.2f", $seen ) : 0.0;
}
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Fraction of category $cat's count that is attributable to $word:
# _word_count($word, $cat) / _cat_count($cat).
#
# Returns 0 when the category has no count (zero or missing entry).
#
# The quotient is returned at full precision. Rounding it to two
# decimals would turn every ratio below 0.005 into 0.00, erasing the
# evidence of rare words once a category holds a few hundred documents.
sub _word_prob {
    my ($self, $word, $cat) = @_;

    my $cat_count = $self->_cat_count($cat);
    return 0.0 unless $cat_count;

    return $self->_word_count($word, $cat) / $cat_count;
}
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# Weighted average of an assumed probability (0.5, carrying weight 1.0)
# and the observed probability of $word in $cat, where the observed
# value is weighted by how often the word was counted across all
# categories:
#
#   (weight * assumed + total * observed) / (weight + total)
sub _word_weighted_average {
    my ($self, $word, $cat) = @_;

    my ($weight, $assumed_prob) = (1.0, 0.5);

    # probability observed so far for this word in this category
    my $observed_prob = $self->_word_prob($word, $cat);

    # occurrences of the word summed over every known category
    my @per_category = map { $self->_word_count($word, $_) }
        keys %{ $self->categories };
    my $seen_total = sum @per_category;

    my $numerator   = $weight * $assumed_prob + $seen_total * $observed_prob;
    my $denominator = $weight + $seen_total;

    return $numerator / $denominator;
}
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# Classifies $text.
#
# Parameters:
#   $text    - the phrase to classify
#   $default - optional value returned when no category qualifies
#              (undef when omitted)
#
# Returns the category with the highest score from _cat_scores, unless
#   * no category scores above zero, or
#   * some other category's score multiplied by the winner's threshold
#     (thresholds->{$best}, 1.0 when unset or false) exceeds the winning
#     score,
# in which case $default is returned.
sub classify {
    my ($self, $text, $default) = @_;

    my @scores = $self->_cat_scores($text);

    # Find the best-scoring category; a score of zero never wins.
    my $max_prob = 0.0;
    my $best;
    for my $score (@scores) {
        my ( $cat, $prob ) = @{$score};
        next unless $prob > $max_prob;
        ( $best, $max_prob ) = ( $cat, $prob );
    }

    # Test definedness, not truth: "0" and "" are legitimate category
    # names and must not be mistaken for "nothing matched".
    return $default unless defined $best;

    my $threshold = $self->thresholds->{$best} || 1.0;

    # The winner must beat every competitor by the threshold factor.
    for my $score (@scores) {
        my ( $cat, $prob ) = @{$score};
        next if $cat eq $best;
        return $default if $prob * $threshold > $max_prob;
    }

    return $best;
}
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Trains the classifier with $string as an example of category $cat.
#
# Every word the tokenizer reports for $string has its counter for $cat
# incremented (_inc_word); afterwards the category's own counter is
# incremented once (_inc_cat), and that call's result is returned.
sub train {
    my ( $self, $cat, $string ) = @_;

    my $count_word = sub {
        my ($word) = @_;
        return $self->_inc_word( $word, $cat );
    };
    $self->tokenizer->words( $string, $count_word );

    return $self->_inc_cat($cat);
}
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
1; |
|
150
|
|
|
|
|
|
|
=head1 NAME |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Data::Classifier::NaiveBayes |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
my $classifier = Data::Classifier::NaiveBayes->new; |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
$classifier->train('token', "Some text to train with"); |
|
159
|
|
|
|
|
|
|
print $classifier->classify("Some text to find a match"); |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
This is a Naive Bayes classifier. The code for this project is largely and |
|
164
|
|
|
|
|
|
|
shamelessly based off of the work done by alexandru's stuff-classifier |
|
165
|
|
|
|
|
|
|
originally written in Ruby. |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
https://github.com/alexandru/stuff-classifier |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
The code was ported over to Perl and L<Moose>. |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
For more information please see the following: |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
http://bionicspirit.com/blog/2012/02/09/howto-build-naive-bayes-classifier.html |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head2 tokenizer |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
The tokenizer object; by default an instance of L<Data::Classifier::NaiveBayes::Tokenizer>. |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=head2 tokenizer_class |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
The name of the tokenizer class, as a string. |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head2 words($hash_ref) |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
A hash reference of word counts, keyed by word and then by category. |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 categories($hash_ref) |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
A hash reference of category counts, keyed by category. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head1 METHODS |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head2 classify($phrase) |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
This will return the most probable category for the phrase. An optional second argument is returned instead (C<undef> if omitted) when no category qualifies. |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head2 train($category, $phrase) |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This will perform a word count and associate words with a category to later be |
|
203
|
|
|
|
|
|
|
classified. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
L<Moose> |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head1 AUTHOR |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Logan Bell, C<< <logie@cpan.org> >> |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
Copyright 2012, Logan Bell |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
|
218
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=cut |