File Coverage

blib/lib/AI/Classifier/Text/FileLearner.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package AI::Classifier::Text::FileLearner;
2				{
3				$AI::Classifier::Text::FileLearner::VERSION = '0.03';
4				}
5	1	1	25422	use strict;
	1		2
	1		36
6	1	1	5	use warnings;
	1		2
	1		44
7	1	1	21	use 5.010;
	1		7
	1		37
8
9	1	1	1510	use Moose;
	0
	0
10				use File::Find::Rule;
11				use File::Spec;
12				use List::Util 'max';
13				use Carp 'croak';
14				use AI::NaiveBayes::Learner;
15				use AI::Classifier::Text;
16				use AI::Classifier::Text::Analyzer;
17
18				has term_weighting => (is => 'ro', isa => 'Str');
19				has analyzer => ( is => 'ro', default => sub{ AI::Classifier::Text::Analyzer->new() } );
20				has learner => ( is => 'ro', default => sub{ AI::NaiveBayes::Learner->new() } );
21				has training_dir => ( is => 'ro', isa => 'Str', required => 1 );
22				has iterator => ( is => 'ro', lazy_build => 1 );
23				sub _build_iterator {
24				my $self = shift;
25				my $rule = File::Find::Rule->new( );
26				$rule->file;
27				$rule->not_name('*.data');
28				$rule->start( $self->training_dir );
29				return $rule;
30				}
31
32				sub get_category {
33				my( $self, $file ) = @_;
34				my $training_dir = $self->training_dir;
35				my $rest = File::Spec->abs2rel( $file, $training_dir );
36				my @dirs = File::Spec->splitdir( $rest );
37				return $dirs[0]
38				}
39
40
41				sub next {
42				my $self = shift;
43
44				my $file = $self->iterator->match;
45				return if !defined($file);
46				my $category = $self->get_category( $file );
47				open(my $fh, "<:encoding(UTF-8)", $file )
48				\|\| Carp::croak(
49				"Unable to read the specified training file: $file\n");
50				my $content = join('', <$fh>);
51				close $fh;
52				my $initial_features = {};
53				if( -f "$file.data" ){
54				my $data = do "$file.data";
55				$initial_features = $data->{initial_features}
56				}
57				my $features = $self->analyzer->analyze( $content, $initial_features );
58
59				return {
60				file => $file,
61				features => $features,
62				categories => [ $category ],
63				};
64				}
65
66				sub teach_it {
67				my $self = shift;
68				my $learner = $self->learner;
69				while ( my $data = $self->next ) {
70				normalize( $data->{features} );
71				$self->weight_terms($data);
72				$learner->add_example(
73				attributes => $data->{features},
74				labels => $data->{categories}
75				);
76				}
77				}
78
79
80				sub classifier {
81				my $self = shift;
82				$self->teach_it;
83				return AI::Classifier::Text->new(
84				classifier => $self->learner->classifier,
85				analyzer => $self->analyzer,
86				);
87				}
88
89
90				sub weight_terms {
91				my ( $self, $doc ) = @_;
92				my $f = $doc->{features};
93				given ($self->term_weighting) {
94				when ('n') {
95				my $max_tf = max values %$f;
96				$_ = 0.5 + 0.5 * $_ / $max_tf for values %$f;
97				}
98				when ('b') {
99				$_ = $_ ? 1 : 0 for values %$f;
100				}
101				when (undef){
102				}
103				default {
104				croak 'Unknown weighting type: '.$self->term_weighting;
105				}
106				}
107				}
108
109				# this doesn't quite fit the current model (it requires the entire collection
110				# of documents to be in memory at once), but it may be useful to someone, someday
111				# so let's just leave it here
112				sub collection_weighting {
113				my (@documents, $subtrahend) = @_;
114				$subtrahend //= 0;
115
116				my $num_docs = +@documents;
117
118				my %frequency;
119				for my $doc (@documents) {
120				for my $k (keys %{$doc->{attributes}}) {
121				$frequency{$k}++;
122				}
123				}
124
125				foreach my $doc (@documents) {
126				my $f = $doc->{attributes};
127				for (keys %$f) {
128				$f->{$_} *= log($num_docs / ($frequency{$_} // 0) - $subtrahend);
129				}
130				}
131				}
132
133				sub euclidean_length {
134				my $f = shift;
135
136				my $total = 0;
137				foreach (values %$f) {
138				$total += $_**2;
139				}
140
141				return sqrt($total);
142				}
143
144				sub scale {
145				my ($f, $scalar) = @_;
146
147				$_ *= $scalar foreach values %$f;
148
149				return $f;
150				}
151
152				sub normalize {
153				my $attrs = shift;
154
155				my $length = euclidean_length($attrs);
156
157				return $length ? scale($attrs, 1/$length) : $attrs;
158				}
159
160				1;
161
162				=pod
163
164				=head1 NAME
165
166				AI::Classifier::Text::FileLearner - Training data reader for AI::NaiveBayes
167
168				=head1 VERSION
169
170				version 0.03
171
172				=head1 SYNOPSIS
173
174				use AI::Classifier::Text::FileLearner;
175
176				my $learner = AI::Classifier::Text::FileLearner->new( training_dir => 't/data/training_set_ordered/' );
177
178				my $classifier = $learner->classifier;
179
180				=head1 DESCRIPTION
181
182				This is a trainer of text classifiers. It traverses a directory filled,
183				interprets the subdirectories in it as category names, reads all files in them and adds them
184				as examples for the classifier being trained.
185
186				head1 METHODS
187
188				=over 4
189
190				=item next
191
192				Internal method for traversing the training data directory.
193
194				=item classifier
195
196				Returns a trained classifier.
197
198				=back
199
200				=head1 AUTHOR
201
202				Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz SoÅ›nierz <tsosnierz@opera.com>
203
204				=head1 COPYRIGHT AND LICENSE
205
206				This software is copyright (c) 2012 by Opera Software ASA.
207
208				This is free software; you can redistribute it and/or modify it under
209				the same terms as the Perl 5 programming language system itself.
210
211				=cut
212
213				__END__
214
215				# ABSTRACT: Training data reader for AI::NaiveBayes
216