File Coverage

blib/lib/Data/Classifier/NaiveBayes.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				package Data::Classifier::NaiveBayes;
2	2	2	79902	use Moose;
	0
	0
3				use MooseX::Types::LoadableClass qw(LoadableClass);
4				use List::Util qw(reduce sum);
5				use 5.008008;
6
7				has categories => (
8				is => 'rw',
9				default => sub { {} });
10
11				# Need to implement
12				has thresholds => (
13				is => 'rw',
14				default => sub { {} });
15
16				has tokenizer => (
17				is => 'rw',
18				lazy_build => 1);
19
20				has tokenizer_class => (
21				is => 'ro',
22				isa => LoadableClass,
23				default => 'Data::Classifier::NaiveBayes::Tokenizer',
24				coerce => 1);
25
26				has words => (
27				is => 'rw',
28				default => sub { {} });
29
30				sub _build_tokenizer { $_[0]->tokenizer_class->new }
31
32				sub _cat_count {
33				my ($self, $category) = @_;
34				$self->categories->{$category};
35				}
36
37				sub _cat_scores {
38				my ($self, $text) = @_;
39
40				my $probs = {};
41
42				for my $cat (keys %{$self->categories}) {
43				$probs->{$cat} = $self->_text_prop($cat, $text);
44				}
45
46				return sort { $a->[1] <=> $b->[1] } map { [$_, $probs->{$_} ] } keys %{$probs};
47				}
48
49				sub _doc_prob {
50				my ($self, $text, $cat) = @_;
51
52				return reduce { $a * $b } @{$self->tokenizer->words($text, sub{
53				my $word = shift;
54				return $self->_word_weighted_average($word, $cat);
55				})};
56				}
57
58				sub _inc_cat {
59				my ($self, $cat) = @_;
60				$self->categories->{$cat} \|\|= 0;
61				$self->categories->{$cat} += 1;
62				}
63
64				sub _inc_word {
65				my ($self, $word, $cat) = @_;
66				$self->words->{$word} \|\|= {};
67				$self->words->{$word}->{$cat} \|\|= 0;
68				$self->words->{$word}->{$cat} += 1;
69				}
70
71				sub _text_prop {
72				my ($self, $cat, $text) = @_;
73				my $cat_prob = ($self->_cat_count($cat) / $self->_total_count);
74				my $doc_prob = $self->_doc_prob($text, $cat);
75				return $cat_prob * $doc_prob;
76				}
77
78				sub _total_count {
79				my ($self) = @_;
80				return sum values %{$self->categories};
81				}
82
83				sub _word_count {
84				my ($self, $word, $category) = @_;
85				return 0.0 unless $self->words->{$word} && $self->words->{$word}->{$category};
86				return sprintf("%.2f", $self->words->{$word}->{$category});
87				}
88
89				sub _word_prob {
90				my ($self, $word, $cat ) = @_;
91				return 0.0 if $self->_cat_count($cat) == 0;
92				return sprintf("%.2f", $self->_word_count($word, $cat) / $self->_cat_count($cat));
93				}
94
95				sub _word_weighted_average {
96				my ($self, $word, $cat ) = @_;
97
98				my $weight = 1.0;
99				my $assumed_prob = 0.5;
100
101				# calculate current probability
102				my $basic_prob = $self->_word_prob($word, $cat);
103
104				# count the number of times this word has appeared in all
105				# categories
106				my $totals = sum map { $self->_word_count($word, $_) } keys %{$self->categories};
107
108				# the final weighted average
109				return ($weight * $assumed_prob + $totals * $basic_prob) / ($weight + $totals);
110				}
111
112				sub classify {
113				my ($self, $text, $default) = @_;
114
115				my $max_prob = 0.0;
116				my $best = undef;
117
118				my @scores = $self->_cat_scores($text);
119
120				for my $score ( @scores) {
121				my ( $cat, $prob ) = @{$score};
122				if ( $prob > $max_prob ) {
123				$max_prob = $prob;
124				$best = $cat;
125				}
126				}
127
128				return $default unless $best;
129				my $threshold = $self->thresholds->{$best} \|\| 1.0;
130
131				for my $score ( @scores ) {
132				my ( $cat, $prob ) = @{$score};
133
134				next if $cat eq $best;
135				return $default if $prob * $threshold > $max_prob;
136				}
137
138				return $best;
139				}
140
141				sub train {
142				my ( $self, $cat, $string ) = @_;
143				$self->tokenizer->words($string, sub{
144				$self->_inc_word(shift, $cat);
145				});
146				$self->_inc_cat($cat);
147				}
148
149				1;
150				=head1 NAME
151
152				Data::Classifier::NaiveBayes
153
154				=head1 SYNOPSIS
155
156				my $classifier = Data::Classifier::NaiveBayes->new;
157
158				$classifier->train('token', "Some text to train with");
159				print $classifier->classify("Some text to find a match");
160
161				=head1 DESCRIPTION
162
163				This a Naive Bayes classifer. The code for this project is largely and
164				shamelessly based off of the work done by alexandru's stuff-classifier
165				originally written in Ruby.
166
167				https://github.com/alexandru/stuff-classifier
168
169				The code was ported over to Perl and L<Moose>.
170
171				For more information please see the following:
172
173				http://bionicspirit.com/blog/2012/02/09/howto-build-naive-bayes-classifier.html
174
175
176				=head1 ATTRIBUTES
177
178				=head2 tokenizer
179
180				An access to L<Data::Classifier::NaiveBayes::Tokenizer>.
181
182				=head2 tokenizer_class
183
184				A string to the tokenizer class name.
185
186				=head2 words($hash_ref)
187
188				A key value pair of word counts by categories
189
190				=head2 categories($hash_ref)
191
192				A key value pair of catogory counts.
193
194				=head1 METHODS
195
196				=head2 classify($phrase)
197
198				This will return the highest probable category associated with the phrase.
199
200				=head2 train($category, $phrase)
201
202				This will perform a word count and associate words with a category to later be
203				classified.
204
205				=head1 SEE ALSO
206
207				L<Moose>
208
209				=head1 AUTHOR
210
211				Logan Bell, C<< <logie@cpan.org> >>
212
213				=head1 COPYRIGHT & LICENSE
214
215				Copyright 2012, Logan Bell
216
217				This program is free software; you can redistribute it and/or modify
218				it under the same terms as Perl itself.
219
220				=cut