File Coverage

blib/lib/Lingua/TFIDF.pm
Criterion Covered Total %
statement 57 57 100.0
branch 7 10 70.0
condition n/a
subroutine 12 12 100.0
pod 4 6 66.6
total 80 85 94.1


line stmt bran cond sub pod time code
1             package Lingua::TFIDF;
2              
3             # ABSTRACT: Language-independent TF-IDF calculator.
4              
5 1     1   846 use strict;
  1         2  
  1         39  
6 1     1   5 use warnings;
  1         2  
  1         34  
7 1     1   626 use Lingua::TFIDF::Types;
  1         3  
  1         31  
8 1     1   1837 use List::MoreUtils qw/uniq/;
  1         1376  
  1         92  
9 1     1   7 use List::Util qw/sum/;
  1         2  
  1         58  
10 1     1   5 use Smart::Args;
  1         2  
  1         687  
11              
12             our $VERSION = 0.01;
13              
14             sub new {
15 1     1 1 40 args
16             my $class => 'ClassName',
17             my $word_counter => +{ isa => 'Lingua::TFIDF::WordCounter', optional => 1 },
18             my $word_segmenter => 'Lingua::TFIDF::WordSegmenter';
19              
20 1 50       117 unless (defined $word_counter) {
21 1         656 require Lingua::TFIDF::WordCounter::Simple;
22 1         7 $word_counter = Lingua::TFIDF::WordCounter::Simple->new;
23             }
24              
25             bless +{
26 1         5 word_counter => $word_counter,
27             word_segmenter => $word_segmenter,
28             } => $class;
29             }
30              
31             sub idf {
32 3     3 1 2309 args
33             my $self,
34             my $documents => 'ArrayRef[Lingua::TFIDF::TermFrequency] | ArrayRef[Str]';
35              
36 3 50       637 return +{} if @$documents == 0;
37              
38 2         7 my @tfs = ref $documents->[0]
39 3 100       15 ? @$documents : map { $self->tf(document => \$_) } @$documents;
40 3         4 my %idf;
41 3         6 for my $word (uniq map { keys %$_ } @tfs) {
  6         138  
42 141         131 my $num_documents_including_word = grep { exists $_->{$word} } @tfs;
  282         443  
43 141         333 $idf{$word} = log(@tfs / $num_documents_including_word);
44             }
45 3         38 return \%idf;
46             }
47              
48             sub tf {
49 8     8 1 1818 args
50             my $self,
51             my $document => 'Ref | Str',
52             my $normalize => +{ isa => 'Bool', default => 0 };
53              
54 8         1141 $self->word_counter->clear;
55              
56 8         35 my $iter = $self->word_segmenter->segment($document);
57 8         21 my $counter = $self->word_counter;
58 8         23 while (defined (my $word = $iter->())) { $counter->add_count($word) }
  419         1241  
59              
60 8         25 my $tf = $counter->frequencies;
61 8 100       64 return $tf unless $normalize;
62              
63 3         50 my $total_words = sum values %$tf;
64 3         29 +{ map { ($_ => $tf->{$_} / $total_words) } keys %$tf };
  96         258  
65             }
66              
67             sub tf_idf {
68 2     2 1 4064 args
69             my $self,
70             my $documents => 'ArrayRef[Str]',
71             my $normalize => +{ isa => 'Bool', default => 0 };
72              
73 2 50       186 return +{} if @$documents == 0;
74              
75 4         13 my @tfs =
76 2         3 map { $self->tf(document => \$_, normalize => $normalize) } @$documents;
77 2         10 my $idf = $self->idf(documents => \@tfs);
78 2         3 my @tf_idf;
79 2         4 for my $tf (@tfs) {
80 4         19 push @tf_idf, +{ map { ($_ => $tf->{$_} * $idf->{$_}) } keys %$tf };
  130         269  
81             }
82 2         23 return \@tf_idf;
83             }
84              
85 16     16 0 79 sub word_counter { $_[0]->{word_counter} }
86              
87 8     8 0 37 sub word_segmenter { $_[0]->{word_segmenter} }
88              
89             1;
90              
91             __END__
92              
93             =pod
94              
95             =encoding UTF-8
96              
97             =head1 NAME
98              
99             Lingua::TFIDF - Language-independent TF-IDF calculator.
100              
101             =head1 VERSION
102              
103             version 0.01
104              
105             =head1 SYNOPSIS
106              
107             use Lingua::TFIDF;
108             use Lingua::TFIDF::WordSegmenter::SplitBySpace;
109            
110             my $tf_idf_calc = Lingua::TFIDF->new(
111             # Pick a word segmenter that suits the language of your documents.
112             word_segmenter => Lingua::TFIDF::WordSegmenter::SplitBySpace->new,
113             );
114            
115             my $document1 = 'Humpty Dumpty sat on a wall...';
116             my $document2 = 'Remember, remember, the fifth of November...';
117            
118             my $tf = $tf_idf_calc->tf(document => $document1);
119             # TF of word "Dumpty" in $document1.
120             say $tf->{'Dumpty'}; # 2, if you are referring to the same text as I am.
121            
122             my $idf = $tf_idf_calc->idf(documents => [$document1, $document2]);
123             say $idf->{'Dumpty'}; # log(2/1) ≒ 0.693147
124            
125             my $tf_idfs = $tf_idf_calc->tf_idf(documents => [$document1, $document2]);
126             # TF-IDF of word "Dumpty" in $document1.
127             say $tf_idfs->[0]{'Dumpty'}; # 2 log(2/1) ≒ 1.386294
128             # Ditto. But in $document2.
129             say $tf_idfs->[1]{'Dumpty'}; # 0
130              
131             =head1 DESCRIPTION
132              
133             Quoting L<Wikipedia|http://en.wikipedia.org/wiki/Tf%E2%80%93idf>:
134              
135             tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining.
136              
137             This module provides methods for calculating TF, IDF and TF-IDF.
138              
139             =head2 MOTIVATION
140              
141             There are several TF-IDF calculator modules on CPAN already, for example L<Text::TFIDF> and L<Lingua::JA::TFIDF>. So why reinvent the wheel? The reason is language dependency: C<Text::TFIDF> assumes that words in a sentence are separated by spaces. This assumption does not hold for most East Asian languages. And C<Lingua::JA::TFIDF> works only on Japanese text.
142              
143             C<Lingua::TFIDF> solves this problem by separating the word segmentation process from word frequency counting. You can process documents written in any language by providing an appropriate word segmenter (see L</CUSTOM WORD SEGMENTER> below).
144              
145             =head1 METHODS
146              
147             =head2 new(word_segmenter => $segmenter)
148              
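149             Constructor. Takes 1 mandatory parameter C<word_segmenter>, and 1 optional parameter C<word_counter>. If C<word_counter> is omitted, a L<Lingua::TFIDF::WordCounter::Simple> instance is created and used.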
150              
151             =head3 CUSTOM WORD SEGMENTER
152              
153             Although this distribution bundles some language-independent word segmenters, like L<Lingua::TFIDF::WordSegmenter::SplitBySpace>, sometimes language-specific word segmenters are more appropriate. You can pass a custom word segmenter object to the calculator.
154              
155             The word segmenter is a plain Perl object that implements a C<segment> method. The method takes 1 positional argument C<$document>, which is a string or a B<reference> to a string. It is expected to return a word iterator as a CodeRef. Each call to the iterator returns the next word, and it must return C<undef> once there are no more words.
156              
157             Roughly speaking, the given custom word segmenter will be used like this:
158              
159             my $document = 'foo bar baz';
160            
161             # Can be called with a reference, like |->segment(\$document)|.
162             # Detecting data type is callee's responsibility.
163             my $iter = $word_segmenter->segment($document);
164            
165             while (defined(my $word = $iter->())) {
166             ...
167             }
168              
169             =head2 idf(documents => \@documents)
170              
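171             Calculates IDFs. C<@documents> may contain either document strings or TF HashRefs previously returned by C<tf>. The result is returned as a HashRef whose keys and values are words and their corresponding IDFs, respectively. The IDF of a word is C<log(N / n)>, where C<N> is the number of documents and C<n> is the number of documents that contain the word.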
172              
173             =head2 tf(document => $document | \$document [, normalize => 0])
174              
175             Calculates TFs. The result is returned as a HashRef whose keys and values are words and their corresponding TFs, respectively.
176              
177             If the optional parameter C<normalize> is set to a true value, the TFs are divided by the total number of words in the C<$document>. This is useful when comparing TFs across documents.
178              
179             =head2 tf_idf(documents => \@documents [, normalize => 0])
180              
181             Calculates TF-IDFs. The result is returned as an ArrayRef of HashRefs. Each HashRef contains the TF-IDF values for the corresponding document.
182              
183             =head1 SEE ALSO
184              
185             =over 2
186              
187             =item L<Lingua::TFIDF::WordSegmenter::LetterNgram>
188              
189             =item L<Lingua::TFIDF::WordSegmenter::SplitBySpace>
190              
191             =item L<Lingua::TFIDF::WordSegmenter::JA::MeCab>
192              
193             =back
194              
195             =head1 AUTHOR
196              
197             Koichi SATOH <sekia@cpan.org>
198              
199             =head1 COPYRIGHT AND LICENSE
200              
201             This software is Copyright (c) 2014 by Koichi SATOH.
202              
203             This is free software, licensed under:
204              
205             The MIT (X11) License
206              
207             =cut