File Coverage

blib/lib/Text/Similarity/Overlaps.pm
Criterion Covered Total %
statement 112 137 81.7
branch 21 32 65.6
condition n/a
subroutine 14 15 93.3
pod 0 5 0.0
total 147 189 77.7


line stmt bran cond sub pod time code
1             package Text::Similarity::Overlaps;
2              
3 3     3   12540 use 5.006;
  3         10  
  3         109  
4 3     3   14 use strict;
  3         4  
  3         84  
5 3     3   13 use warnings;
  3         4  
  3         69  
6              
7 3     3   18 use Text::Similarity;
  3         17  
  3         128  
8 3     3   1502 use Text::OverlapFinder;
  3         7  
  3         332  
9              
10             our @ISA = qw(Text::Similarity);
11              
12             # Items to export into callers namespace by default. Note: do not export
13             # names by default without a very good reason. Use EXPORT_OK instead.
14             # Do not simply export all your public functions/methods/constants.
15              
16             # This allows declaration use Text::Similarity::Overlaps ':all';
17             # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
18             # will save memory.
19              
20             our %EXPORT_TAGS = ( 'all' => [ qw() ] );
21             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
22              
23             our @EXPORT = qw();
24              
25             our $VERSION = '0.05';
26              
27             # information about granularity is not used now
28             # the would be that you could figure out similarity
29             # unit by unit, that is sentence by sentence,
30             # paragraph by paragraph, etc. however, at this
31             # point similarity is only computed document by
32             # document
33              
34 3     3   20 use constant WORD => 0;
  3         4  
  3         144  
35 3     3   17 use constant SENTENCE => 1;
  3         5  
  3         137  
36 3     3   16 use constant PARAGRAPH => 2;
  3         5  
  3         126  
37 3     3   15 use constant DOCUMENT => 3;
  3         5  
  3         4084  
38              
39             my %finder;
40              
41             sub new
42             {
43 4     4 0 472 my $class = shift;
44 4         44 my $self = $class->SUPER::new (@_);
45 4 50       29 if ($self->stoplist) {
46 0         0 $finder{$self} = Text::OverlapFinder->new(stoplist => $self->stoplist);
47             }
48             else {
49 4         31 $finder{$self} = Text::OverlapFinder->new;
50             }
51              
52             #string_compare_initialize (0, undef);
53 4         24 return $self;
54             }
55              
56              
57             sub finder : lvalue
58             {
59 42     42 0 52 my $self = shift;
60 42         252 $finder{$self};
61             }
62              
63             sub DESTROY
64             {
65 4     4   2156 my $self = shift;
66 4         31 delete $finder{$self};
67             }
68              
69             # this method requires that the input be provided in files.
70             # this is now just a front end to getSimilarityStrings that
71             # does file handling. Actual similarity measurements are
72             # performed between strings in getSimilarityStrings.
73              
74             # this seems like it might want to move up into Similarity.pm
75              
76             sub getSimilarity
77             {
78 28     28 0 29462 my $self = shift;
79              
80             # we created a separate method for getSimilarityStrings since overloading
81             # to accept both strings and file names as input parameters to
82             # getSimilarity would have required that we treat any file name that does
83             # not have a corresponding file to be treated as a string, thus making it
84             # impossible to really deal with missing file errors, and probably resulting
85             # in quite a bit of user annoyance as 'textt1.txt' is measured for similarity
86             # with the contents of 'text2.txt'
87              
88 28         59 my $file1 = shift;
89 28         35 my $file2 = shift;
90              
91             # granularity is not currently supported
92              
93 28         35 my $granularity = shift;
94 28 50       93 $granularity = DOCUMENT unless defined $granularity;
95              
96 28 50       563 unless (-e $file1) {
97 0         0 $self->error ("The file '$file1' does not exist");
98 0         0 return undef;
99             }
100 28 50       310 unless (-e $file2) {
101 0         0 $self->error ("The file '$file2' does not exist");
102 0         0 return undef;
103             }
104              
105 28 50       916 unless (open (FH1, '<', $file1)) {
106 0         0 $self->error ("Cannot open $file1: $!");
107 0         0 return undef;
108             }
109 28 50       1141 unless (open (FH2, '<', $file2)) {
110 0         0 $self->error ("Cannot open $file2: $!");
111 0         0 return undef;
112             }
113            
114 28         38 my $str1;
115             my $str2;
116 28         471 while () {
117 100         278 $str1 .= $self->sanitizeString ($_);
118             }
119 28         538 $str1 =~ s/\s+/ /g;
120 28         519 while () {
121 66         159 $str2 .= $self->sanitizeString ($_);
122             }
123 28         597 $str2 =~ s/\s+/ /g;
124              
125             # find compounds, not working right now
126              
127 28         102 $str1 = $self->compoundify ($str1);
128 28         72 $str2 = $self->compoundify ($str2);
129              
130 28         403 close FH1;
131 28         236 close FH2;
132              
133             # add this call, to expose this method to users too (in case they want to
134             # just measure the similarity of two strings. So we have our files converted
135             # into strings, and now measure their similarity.
136              
137 28         91 my ($score, %allScores) = $self -> getSimilarityStrings ($str1,$str2);
138              
139             # end getSimilarity here, making sure to return similarity value from
140             # get SimilarityStrings
141              
142 28 50       215 return wantarray ? ($score, %allScores) : $score;
143             }
144              
145             # this method measures the similarity between two strings. If a string is empty
146             # or missing then we generate an error and return undefined
147              
148             sub getSimilarityStrings {
149              
150 44     44 0 10345 my $self = shift;
151              
152 44         66 my $input1 = shift;
153 44         51 my $input2 = shift;
154              
155             # check to make sure you have a string! empty file or string should be rejected
156              
157 44 100       114 if (!defined($input1)) {
158 2         19 $self->error ("first input string is undefined: $!");
159 2         6 return undef;
160             }
161 42 50       79 if (!defined($input2)) {
162 0         0 $self->error ("second input string is undefined: $!");
163 0         0 return undef;
164             }
165              
166             # clean the strings, maybe make this optional
167              
168 42         122 my $str1 .= $self->sanitizeString ($input1);
169 42         127 my $str2 .= $self->sanitizeString ($input2);
170              
171             # find compounds, not working right now
172              
173 42         139 $str1 = $self->compoundify ($str1);
174 42         110 $str2 = $self->compoundify ($str2);
175              
176             # this is where we find our overlaps....
177             # then we score them using both the standard overlaps measure
178             # as well as based on the lesk.pm module from WordNet-Similarity
179              
180 42         114 my ($overlaps, $wc1, $wc2) = $self->finder->getOverlaps ($str1, $str2);
181              
182 42         71 my $score = 0;
183 42         84 my $raw_lesk = 0;
184 42         163 my %allScores = ('wc1' => $wc1, 'wc2' => $wc2);
185            
186 42 50       201 if ($self->verbose) {
187 0         0 print STDERR "keys: ", scalar keys %$overlaps, "\n";
188             }
189              
190 42         199 foreach my $key (sort keys %$overlaps) {
191              
192 40         390 my @words = split /\s+/, $key;
193              
194 40 50       142 if ($self->verbose) {
195 0         0 print STDERR "-->'$key' len(", scalar @words, ") cnt(", $overlaps->{$key}, ")\n";
196             }
197              
198             # find out how many words match, add 1 for each match
199              
200             #$score += scalar @words * scalar @words * ${$overlaps}{$key};
201 40         141 $score += scalar @words * $overlaps->{$key};
202 40         81 $allScores{'raw'} = $score;
203             # find the length of the key, square it, multiply with its
204             # value to get the lesk score for this particular match
205              
206 40         81 my $value = ($#words + 1) * ($#words + 1) * ${$overlaps}{$key};
  40         74  
207 40         176 $raw_lesk += $value;
208              
209             }
210              
211             # fix for divide by zero error, will short circuit when score is 0
212             # provided by cernst at esoft.com
213             # who reported via rt.cpan.org ticket 29902
214              
215 42 100       107 if ($score == 0){
216 10         18 $allScores{'raw'} = 0;
217 10         16 $allScores{'precision'} = 0;
218 10         19 $allScores{'recall'} = 0;
219 10         16 $allScores{'F'} = 0;
220 10         14 $allScores{'dice'} = 0;
221 10         16 $allScores{'E'} = 0;
222 10         15 $allScores{'cosine'} = 0;
223 10         20 $allScores{'raw_lesk'} = 0;
224 10         17 $allScores{'lesk'} = 0;
225            
226 10 100       92 return wantarray ? ($score, %allScores) : $score;
227             }
228              
229             # end of fix
230              
231 32         58 my $prec = $score / $wc2;
232 32         43 my $recall = $score / $wc1;
233 32         78 my $f = 2 * $prec * $recall / ($prec + $recall);
234            
235 32         56 my $dice = 2 * $score / ($wc1 + $wc2) ;
236 32         48 my $e = 1 - $f;
237 32         86 my $cos = $score / sqrt ($wc1 * $wc2);
238 32         45 my $lesk = $raw_lesk/ ($wc1 * $wc2);
239            
240 32         72 $allScores{'precision'} = $prec;
241 32         48 $allScores{'recall'} = $recall;
242 32         61 $allScores{'F'} = $f;
243 32         50 $allScores{'dice'} = $dice;
244 32         59 $allScores{'E'} = $e;
245 32         54 $allScores{'cosine'} = $cos;
246 32         49 $allScores{'raw_lesk'} = $raw_lesk;
247 32         59 $allScores{'lesk'} = $lesk;
248            
249             # display them, if requested
250 32 50       96 if ($self->verbose) {
251 0         0 print STDERR "wc 1: $wc1\n";
252 0         0 print STDERR "wc 2: $wc2\n";
253 0         0 print STDERR " Raw score: $score\n";
254 0         0 print STDERR " Precision: $prec\n";
255 0         0 print STDERR " Recall : $recall\n";
256 0         0 print STDERR " F-measure: $f\n";
257 0         0 print STDERR " Dice : $dice\n";
258 0         0 print STDERR " E-measure: $e\n";
259 0         0 print STDERR " Cosine : $cos\n";
260 0         0 print STDERR " Raw lesk : $raw_lesk\n";
261 0         0 print STDERR " Lesk : $lesk\n";
262            
263             #
264             # include might be the jaccard coefficient ....
265             # jaccard is 2*raw_score / union of string1 and string2
266             #
267             }
268            
269 32 100       106 if ($self->normalize) {
270 16         24 $score = $f;
271             }
272            
273 32 100       426 return wantarray ? ($score, %allScores) : $score;
274             }
275              
276 0     0 0   sub doStop {0}
277              
278             1;
279              
280             __END__