File Coverage

blib/lib/String/Compare.pm
Criterion Covered Total %
statement 77 78 98.7
branch 18 26 69.2
condition 2 2 100.0
subroutine 8 8 100.0
pod 6 6 100.0
total 111 120 92.5


line stmt bran cond sub pod time code
1             package String::Compare;
2              
3 1     1   16878 use base qw(Exporter);
  1         3  
  1         608  
4             @EXPORT = qw(compare);
5 1     1   8 use strict;
  1         3  
  1         2919  
6              
7             =head1 NAME
8              
9             String::Compare - Compare two strings and return how much they are alike
10              
11             =head1 SYNOPSIS
12              
13             use String::Compare;
14             my $str1 = "J R Company";
15             my $str2 = "J. R. Company";
16             my $str3 = "J R Associates";
17             my $points12 = compare($str1,$str2);
18             my $points13 = compare($str1,$str3);
19             if ($points12 > $points13) {
20             print $str1." refers to ".$str2;
21             } else {
22             print $str1." refers to ".$str3;
23             }
24              
25             =head1 DESCRIPTION
26              
27             This module was created when I needed to merge the information
28             between two databases, and I had to find who were who in each database,
29             but the names weren't always equals, sometimes there were differences.
30              
31             The problem was that I need to choose the right person, so I must see how
32             much the different names are alike. I've tried testing char by char, but situations
33             like the described in the synopsis showed me that wasn't enough. So I created a
34             set of tests to give a more accurate pontuation of how much the names are alike.
35              
36             The result is in percentage. If the strings are exactly equal, it would return 1,
37             if they have nothing in common, it would return 0.
38              
39             =head1 METHODS
40              
41             =over
42              
43             =item compare($str1,$str2,%tests)
44              
45             This method receives the two strings and optionally the names and weights
46             of each test. The default behavior is to use all the tests with the weigth 1.
47             This method lowercases both strings, since case doesn't change the meaning
48             of the content. But each test is case sensitive, so if you like you must lc the strings.
49              
50             The current tests are (you can use the tests individually if you like:
51              
52             P.S.: You can use custom tests, because the tests are executed using eval,
53             so if you want a custom test, just use the full name of a method.
54              
55             P.S.2: If you created a test, please share it, sending me by email and I will be
56             glad to include it into the default set.
57              
58             =back
59              
60             =cut
61              
62             my %default_options =
63             (
64             char_by_char => 1,
65             consonants => 1,
66             vowels => 1,
67             word_by_word => 1,
68             chars_only => 1
69             );
70              
71             sub compare {
72 2     2 1 77 my $str1 = shift;
73 2         4 my $str2 = shift;
74              
75 2         5 $str1 = lc($str1);
76 2         5 $str2 = lc($str2);
77             # skip any tests if they are the same
78 2 50       6 return 1 if $str1 eq $str2;
79              
80 2         5 my %user_opt = @_;
81 2         13 my %opt = (%default_options, %user_opt);
82 2         4 my %results;
83 2         3 my $totalPoints = 0;
84 2         3 my $score = 0;
85 2         19 foreach my $test (keys %opt) {
86 10         15 $totalPoints += $opt{$test};
87             }
88 2         7 foreach my $test (keys %opt) {
89 10 50       26 next if $opt{$test} == 0;
90 10   100     58 my $result = __PACKAGE__->can($test)->($str1,$str2) || 0;
91 10         27 $score += $result * $opt{$test}/$totalPoints;
92             }
93 2         10 return $score;
94             }
95              
96             =over
97              
98             =item char_by_char($str1,$str2)
99              
100             Tests character by character
101              
102             =back
103              
104             =cut
105              
106             sub char_by_char {
107 26     26 1 31 my $str1 = shift;
108 26         27 my $str2 = shift;
109 26         34 my $size1 = length $str1;
110 26         35 my $size2 = length $str2;
111 26         24 my $score = 0;
112 26 100       45 my $size = $size1>$size2?$size1:$size2;
113 26         124 for (my $i = 0;$i < $size; $i++) {
114 121 100       285 if (length $str1 < $i) {
115 9         12 last;
116             }
117 112 100       171 if (length $str2 < $i) {
118 4         6 last;
119             }
120 108         126 my $c1 = substr $str1, $i, 1;
121 108         120 my $c2 = substr $str2, $i, 1;
122 108 100       326 if ($c1 eq $c2) {
123 38         90 $score += 1/$size;
124             }
125             }
126 26         68 return $score;
127             }
128              
129             =over
130              
131             =item consonants($str1,$str2)
132              
133             Test char_by_char only in the consonants.
134              
135             =back
136              
137             =cut
138              
139             *consoants = *consonants;
140             sub consonants {
141 2     2 1 3 my $str1 = shift;
142 2         26 my $str2 = shift;
143 2         12 $str1 =~ s/[^bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ]//g;
144 2         12 $str2 =~ s/[^bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ]//g;
145 2         6 return char_by_char($str1,$str2);
146             }
147              
148             =over
149              
150             =item vowels($str1,$str2)
151              
152             Test char_by_char only in the vowels.
153              
154             =back
155              
156             =cut
157              
158             sub vowels {
159 2     2 1 4 my $str1 = shift;
160 2         3 my $str2 = shift;
161 2         20 $str1 =~ s/[^aeiouyAEIOUY]//g;
162 2         60 $str2 =~ s/[^aeiouyAEIOUY]//g;
163 2         6 return char_by_char($str1,$str2);
164             }
165              
166             =over
167              
168             =item word_by_word($str1, $str2)
169              
170             Test char_by_char each word, giving points according to the
171             size of the word.
172              
173             =back
174              
175             =cut
176              
177             sub word_by_word {
178 2     2 1 4 my $str1 = shift;
179 2         3 my $str2 = shift;
180 2         7 my @words1 = split(/\s+/,$str1);
181 2         7 my @words2 = split(/\s+/,$str2);
182 2         3 my $size1 = scalar @words1;
183 2         4 my $size2 = scalar @words2;
184 2 50       5 my $size = $size1>$size2?$size1:$size2;
185 2         3 my $score;
186             my $totalChars;
187 0         0 my @totalCharsPerWord;
188 2         7 for (my $i = 0; $i < $size; $i++) {
189 6 50       13 my $subsize1 = $i < $size1 ? length($words1[$i]) : 0;
190 6 50       14 my $subsize2 = $i < $size2 ? length($words2[$i]) : 0;
191 6 50       10 my $subsize = $subsize1 > $subsize2?$subsize1:$subsize2;
192 6         7 $totalChars += $subsize;
193 6         16 push @totalCharsPerWord, $subsize;
194             }
195 2         6 for (my $i = 0; $i < $size; $i++) {
196 6 50       12 last if $i >= $size1;
197 6         7 my $bestScore = 0;
198 6         14 for (my $j = 0; $j < $size; $j++) {
199 18 50       120 last if $j >= $size2;
200 18         37 my $result = char_by_char($words1[$i],$words2[$j]);
201 18 100       56 $bestScore = $result if $result > $bestScore;
202             }
203 6         19 $score += $bestScore * $totalCharsPerWord[$i]/$totalChars;
204             }
205 2         10 return $score;
206             }
207              
208             =over
209              
210             =item chars_only($str1,$str2)
211              
212             Test char_by_char only with the characters matched by \w.
213              
214             =back
215              
216             =cut
217              
218             sub chars_only {
219 2     2 1 15 my $str1 = shift;
220 2         3 my $str2 = shift;
221 2         10 $str1 =~ s/\W//g;
222 2         9 $str2 =~ s/\W//g;
223 2         5 return char_by_char($str1,$str2);
224             }
225              
226              
227             =head1 COPYRIGHT
228              
229             This module was created by "Daniel Ruoso" . It is licensed under both
230             the GNU GPL and the Artistic License.
231              
232             =cut
233