File Coverage

blib/lib/String/Trigram.pm
Criterion Covered Total %
statement 154 202 76.2
branch 58 86 67.4
condition 14 39 35.9
subroutine 17 23 73.9
pod 12 12 100.0
total 255 362 70.4


line stmt bran cond sub pod time code
1             package String::Trigram;
2              
3 1     1   6814 use Carp;
  1         2  
  1         58  
4 1     1   728 use locale;
  1         218  
  1         5  
5              
6 1     1   28 use 5.6.0;
  1         7  
  1         40  
7 1     1   4 use strict;
  1         2  
  1         29  
8 1     1   4 use warnings;
  1         2  
  1         2500  
9              
10             require Exporter;
11              
12             our @ISA = qw(Exporter);
13             our @EXPORT_OK = ('compare');
14             our $VERSION = '0.12';
15              
16             our $DEFAULT_MIN_SIM = 0;
17             our $DEFAULT_WARP = 1.0;
18             our $DEFAULT_IGNORE_CASE = 1;
19             our $DEFAULT_KEEP_ONLY_ALNUMS = 0;
20             our $DEFAULT_DEBUG = 0;
21             our $DEFAULT_NGRAM_LEN = 3;
22             our $DEFAULT_PADDING = $DEFAULT_NGRAM_LEN - 1;
23              
24             sub new {
25 39     39 1 598 my ( $pkg, %params ) = @_;
26              
27 39         57 my $seen = {};
28              
29 39         79 _setParams( \%params );
30              
31 39         105 foreach ( keys %params ) {
32 312 50       895 croak "Unknown parameter $_!"
33             if ( $_ !~
34             /^(cmpBase|minSim|warp|ignoreCase|keepOnlyAlNums|padding|debug|ngram)$/
35             );
36             }
37              
38             # check for reasonable values
39 39 50 33     122 if ( ( !$params{cmpBase} ) or ( ref( @{ $params{cmpBase} } ne 'ARRAY' ) ) )
  39         143  
40             {
41 0         0 croak
42             "We need a base for comparison, so you should specify the parameter cmpBase as a reference to an anonymous array of strings being the base of comparison. Don't bother specifying anything else, but this we do need.\n";
43             }
44              
45 39 50 33     157 if ( ( $params{minSim} < 0 ) || ( $params{minSim} > 1.0 ) ) {
46 0         0 croak "Minimal similarity must be >= 0 and <= 1.0";
47             }
48              
49 39 50 33     155 if ( defined $params{ngram} && ( $params{ngram} <= 0 ) ) {
50 0         0 croak "n in n-gram must be > 0";
51             }
52              
53 39 50       83 if ( $params{warp} == 0 ) {
54 0         0 croak "Warp must be > 0";
55             }
56              
57 39 50 33     150 if ( ( $params{padding} < 0 ) || ( $params{padding} > $params{ngram} - 1 ) )
58             {
59 0         0 croak "Padding must be between 0 and " . ( $params{ngram} - 1 ) . ".";
60             }
61              
62 39         153 my $self = bless {
63             ngram => int( $params{ngram} ),
64             minSim => $params{minSim},
65             warp => $params{warp},
66              
67             # index for trigrams
68             trigIdx => _trigramify(
69             $params{cmpBase}, $params{ignoreCase},
70             $params{keepOnlyAlNums}, ' ' x $params{padding},
71             undef, $params{ngram},
72             $seen
73             ),
74              
75             # index of all strings fed to the object, so no string is
76             # processed twice (this might lead to wrong results)
77             seenStrings => $seen,
78              
79             ignoreCase => $params{ignoreCase},
80             keepOnlyAlNums => $params{keepOnlyAlNums},
81             padding => ' ' x $params{padding},
82             debug => $params{debug},
83              
84             }, $pkg;
85              
86 39         173 return $self;
87             }
88              
89             sub compare {
90 31     31 1 1148 my ( $s1, $s2 ) = ( shift, shift );
91              
92 31 50       64 croak "I need at least 2 strings to compare as parameters, died"
93             unless defined $s2;
94              
95 31         40 my $result = {};
96              
97 31         111 new String::Trigram( cmpBase => [$s1], @_ )
98             ->getSimilarStrings( $s2, $result );
99              
100 31 100       368 $result->{$s1} || 0;
101             }
102              
103             sub reInit {
104 3     3 1 19 my ( $self, $newCmpBase ) = @_;
105              
106 3 50 33     20 if ( ( !$newCmpBase ) or ( ref( @$newCmpBase ne 'ARRAY' ) ) ) {
107 0         0 croak
108             "We need a base for comparison, so, as a parameter to this method, we do need a reference to an anonymous array of strings being the base of comparison.\n";
109             }
110              
111 3         7 $self->{seenStrings} = {};
112              
113 3         13 $self->_setTrigIdx(
114             _trigramify(
115             $newCmpBase, $self->{ignoreCase},
116             $self->{keepOnlyAlNums}, $self->{padding},
117             undef, $self->{ngram},
118             $self->{seenStrings}
119             )
120             );
121             }
122              
123             sub extendBase {
124 1     1 1 7 my ( $self, $newStrings ) = @_;
125              
126 1 50 33     10 if ( ( !$newStrings ) or ( ref( @$newStrings ne 'ARRAY' ) ) ) {
127 0         0 croak
128             "We need to add to the base for comparison, so, as a parameter to this method, we do need a reference to an anonymous array of strings being added to the base of comparison.\n";
129             }
130              
131             $self->_setTrigIdx(
132 1         6 _trigramify(
133             $newStrings, $self->{ignoreCase},
134             $self->{keepOnlyAlNums}, $self->{padding},
135             $self->{trigIdx}, $self->{ngram},
136             $self->{seenStrings}
137             )
138             );
139             }
140              
141             sub minSim {
142 3     3 1 12 my ( $self, $newMinSim ) = @_;
143              
144 3 50 33     14 if ( ( defined $newMinSim )
      66        
145             && ( ( ( $newMinSim < 0 ) || ( $newMinSim > 1.0 ) ) ) )
146             {
147 0         0 croak "Minimal similarity must be >= 0 and <= 1.0";
148             }
149              
150 3 100       7 $self->{minSim} = $newMinSim if ($newMinSim);
151              
152 3         7 return $self->{minSim};
153             }
154              
155             sub warp {
156 0     0 1 0 my ( $self, $newWarp ) = @_;
157              
158 0 0 0     0 if ( ( defined $newWarp ) && ( $newWarp <= 0 ) ) {
159 0         0 croak "Warp must be > 0";
160             }
161              
162 0 0       0 $self->{warp} = $newWarp if ($newWarp);
163              
164 0         0 return $self->{warp};
165             }
166              
167             sub ignoreCase {
168 0     0 1 0 my ( $self, $newIgnoreCase ) = @_;
169              
170 0 0       0 $self->{ignoreCase} = $newIgnoreCase if ($newIgnoreCase);
171              
172 0         0 return $self->{ignoreCase};
173             }
174              
175             sub keepOnlyAlNums {
176 0     0 1 0 my ( $self, $newKeepOnlyAlNums ) = @_;
177              
178 0 0       0 $self->{keepOnlyAlNums} = $newKeepOnlyAlNums if ($newKeepOnlyAlNums);
179              
180 0         0 return $self->{keepOnlyAlNums};
181             }
182              
183             sub padding {
184 0     0 1 0 my ( $self, $newPadding ) = @_;
185              
186 0 0 0     0 if ( ( defined $newPadding )
      0        
187             && ( ( ( $newPadding < 0 ) || ( $newPadding > $self->{ngram} - 1 ) ) ) )
188             {
189 0         0 croak "Padding must be between 0 and " . $self->{ngram} - 1 . ".";
190             }
191              
192 0 0       0 $self->{padding} = ' ' x $newPadding if ($newPadding);
193              
194 0         0 return length $self->{padding};
195             }
196              
197             sub debug {
198 0     0 1 0 my ( $self, $newDebug ) = @_;
199              
200 0 0       0 $self->{debug} = $newDebug if ( defined($newDebug) );
201              
202 0         0 return $self->{debug};
203             }
204              
205             # Splits str into trigrams and looks up every trigram in trigIdx. If
206             # successfull, it finds a list of strings containing the trigram and
207             # increases the value of the string containing the trigram by 1 in the
208             # lexical $simInfo (ref. to hash). Uses _computeSimilarity() to compute
209             # the similarity value.
210             #
211             # Parameters
212             #
213             # result KEY = matching string, VALUE = similarity value
214             # str string to be matched
215             # data further key-val-pairs for min sim and warp
216             #
217             # Returns
218             #
219             # 0-n number of similar strings
220             # -1 no match found
221              
222             sub getSimilarStrings {
223 53     53 1 98 my $self = shift;
224 53         61 my $str = shift;
225 53         52 my $result = shift;
226 53 100       103 my %data = @_ if @_;
227              
228 53         55 my $curMinSim = $data{minSim};
229 53         54 my $curWarp = $data{warp};
230              
231 53   100     187 $curMinSim ||= $self->{minSim};
232 53   66     148 $curWarp ||= $self->{warp};
233              
234 53 50       129 croak
235             "I need a reference to a hash as second parameter for getSimilarStrings()!"
236             if ( ref($result) ne 'HASH' );
237              
238 53         53 my $trigram; # contains current trigram
239             my $matches; # is pointed to all strings containing current trigram
240 0         0 my $len; # length of the string to compare
241 0         0 my $actNum; # that's how many times current trigram is found in string
242 0         0 my $actName; # this is a string containing current trigram
243 0         0 my $actMatch; # current match
244              
245             # KEY = trigram, SUBKEY = potentially similar string, VALUE = number
246             # of times, trigrams is found in string.
247             # Here the frequency of every trigram in every string is saved. The
248             # table is filled with the existing strings containing some trigram
249             # the frequencys of the trigram in the string are noted and decreased
250             # every time, a matching trigram is found until the value is 0. If
251             # the value is 0, a match cannot be counted anymore.
252 53         69 my %trigNumBuf = ();
253              
254             # KEY = potentially similar string, VALUE = number of identical trigrams
255 53         69 my $simInfo = {};
256              
257 53 100       126 $str =~ s/\W//g if $self->{keepOnlyAlNums};
258 53 100       115 $str = lc $str if $self->{ignoreCase};
259              
260 53         83 $str = $self->{padding} . $str . $self->{padding};
261              
262             # Number of n-grams is length of string minus n + 1
263 53         64 $len = length($str) - $self->{ngram} + 1;
264              
265             # **********************************************************
266             # divide string to compare into trigrams and search trigrams
267             # **********************************************************
268              
269 53         114 for ( my $i = 0 ; $i < $len ; $i++ ) {
270 339         486 $trigram = substr( $str, $i, $self->{ngram} );
271              
272             # look for every trigram in $self->{trigIdx}
273             # contine unless found
274 339 100       820 next unless ( exists( $self->{trigIdx}->{$trigram} ) );
275              
276             # point matches to strings containing current trigram
277 180         233 $matches = $self->{trigIdx}->{$trigram};
278              
279             # check every string containing current trigram
280 180         490 while ( ( $actName, $actMatch ) = each %$matches ) {
281 211         296 $actNum = $actMatch
282             ->{trigs}; # so many times current trigram is found in string
283              
284 211 100       607 $trigNumBuf{$trigram}->{$actName} = $actNum
285             unless ( exists( $trigNumBuf{$trigram}->{$actName} ) );
286              
287             # if there are instances of current trigram left for string
288 211 100       466 if ( $trigNumBuf{$trigram}->{$actName} > 0 ) {
289              
290             # mark that we used on instance of current trigram
291 205         267 $trigNumBuf{$trigram}->{$actName}--;
292              
293             # mark that we have found one more matching trigram for $actName
294 205         268 $simInfo->{$actName}->{name}++;
295 205         793 $simInfo->{$actName}->{len} = $actMatch->{len};
296             }
297             }
298             }
299              
300 53         111 return $self->_computeSimilarity( $str, $simInfo, $result, $curMinSim,
301             $curWarp );
302             }
303              
304             # Uses getSimilarStrings() to get matching strings and filters out the
305             # best one(s).
306             #
307             # Parameters
308             #
309             # $inpStr string to be matched
310             # $outStrList list of best strings
311             #
312             # Returns
313             #
314             # similarity value of best match or -1, of no match
315              
316             sub getBestMatch {
317 8     8 1 41 my $self = shift;
318 8         12 my $inpStr = shift;
319 8         11 my $outStrList = shift;
320              
321 8 50       17 croak
322             "I need a reference to an array as second parameter for getBestMatch()!"
323             if ( ref($outStrList) ne 'ARRAY' );
324              
325 8         11 my $maxVal = -1; # similarity of maximally similar string
326 8         10 my $name = ""; # current potentially similar string
327 8         9 my $val = -1; # similarity of $name
328 8         10 my $rslt = {}; # KEY = similar name, VALUE = degree of similarity
329 8         13 my @rsltKeys = (); # contains keys(%rslt)
330              
331             # clear list ultimatively containing similar strings
332 8         11 @$outStrList = ();
333              
334             # there are no similar strings at all
335 8 100       17 if ( $self->getSimilarStrings( $inpStr, $rslt, @_ ) == 0 ) {
336 1         7 return 0;
337             }
338              
339             # there is at least one similar string
340 7         21 @rsltKeys = keys(%$rslt);
341              
342             # looking for the best matching string, make the first one the best ...
343 7         9 $name = $rsltKeys[0];
344 7         10 $maxVal = $rslt->{$name};
345              
346             # since there might be several best strings (containing same degree of similarity)
347             # we put them into an array and not into a scalar
348 7         12 push @$outStrList, $name;
349              
350             # ... and check if there are better ones
351 7         14 for ( 1 .. @rsltKeys - 1 ) {
352 4         7 $name = $rsltKeys[$_];
353 4         5 $val = $rslt->{ $rsltKeys[$_] };
354              
355 4 50       19 if ( $val == $maxVal ) {
    100          
356 0         0 push @$outStrList, $name;
357             }
358             elsif ( $val > $maxVal ) {
359              
360             # if we found a still better string, clear the array and put the better string into it
361 2         3 @$outStrList = ();
362 2         4 $maxVal = $val;
363 2         5 push @$outStrList, $name;
364             }
365             }
366              
367 7         53 return $maxVal;
368             }
369              
370             sub _setTrigIdx {
371 4     4   8 my ( $self, $newTrigIdx ) = @_;
372              
373 4         11 $self->{trigIdx} = $newTrigIdx;
374             }
375              
376             sub _getTrigIdx {
377 0     0   0 my $self = shift;
378 0         0 return $self->{trigIdx};
379             }
380              
381             # Computes similarity of potientially matching strings in hash
382             # newSimInfo. The result is saved in newResult. The computation of the
383             # similarity works like this:
384             #
385             # (a = all trigrams, d = different trigrams, e = warp)
386             #
387             # (a**e - d**e)/a**e
388             #
389             # The default for e is 1. If e is > 1.0, short strings are getting away
390             # better, if e is < 1.0 short strings are getting away worse.
391             #
392             # Parameters
393             #
394             # $newStr string to be matched
395             # $newSimInfo KEY = potentially matching string, VALUE = number of matching trigrams
396             # $newResult KEY = actually matching string, VALUE = similarity value
397             # $curMinSim current minimal similarity
398             # $curWarp current warp
399             #
400             # Returns
401             #
402             # number of matching strings
403              
404             sub _computeSimilarity {
405 53     53   58 my $self = shift;
406 53         59 my $newStr = shift;
407 53         54 my $newSimInfo = shift;
408 53         49 my $newResult = shift;
409 53         81 my $curMinSim = shift;
410 53         55 my $curWarp = shift;
411              
412             # clear hash containing the results
413 53         75 %$newResult = ();
414              
415 53         50 my $strCnt = 0; # number of similar strings (return value)
416 53         51 my $allTrigs; # number of all trigrams
417             my $sameTrigs; # number of same trigrams
418 0         0 my $actSim; # similarity (0 - 1)
419 53         56 my $len =
420             length($newStr); # length of string - $padNum for the padded blanks
421              
422             # check every potientially similar string (i.e. every string containing at least one
423             # identical trigram)
424 53         115 foreach ( keys(%$newSimInfo) ) {
425 50         65 $sameTrigs = $newSimInfo->{$_}->{name};
426              
427             # the number of n-grams in a string result from subtracting n from the length of
428             # the string and adding 1. If it is padded with blanks, there is one additional
429             # n-gram for each blank. So to compute the number of n-grams of two strings we
430             # subtract n twice, add 2 and add the number of padded blanks * 2. Since $newStr
431             # is already padded and the length noted for $_ contains already the padding we
432             # do not need to take the padding explicitly into account. Finally, to get
433             # $allTrigs (types not tokens) we need to subtract the number of matching trigrams
434             # - those occuring in both strings - once.
435 50         96 $allTrigs =
436             $len + $newSimInfo->{$_}->{len} - 2 * $self->{ngram} - $sameTrigs + 2;
437              
438 50         73 $actSim = _computeActSim( $sameTrigs, $allTrigs, $curWarp );
439              
440 50 50       99 if ( $self->{debug} ) {
441 0         0 my $tmpStr = $self->{padding} . $_ . $self->{padding};
442 0         0 print STDERR "\nCompare\n";
443 0         0 print STDERR "$newStr ->",
444             sort join( ":", $newStr =~ /(?=(...))/g ), "<-\n";
445 0         0 print STDERR "$tmpStr ->",
446             sort join( ":", $tmpStr =~ /(?=(...))/g ), "<-\n";
447 0         0 print STDERR "-" x
448             ( 22 + length($newStr) + 2 * length( $self->{padding} ) +
449             length($_) ), "\n";
450 0         0 print STDERR "N-GRAM-LEN: ", $self->{ngram}, "\n";
451 0         0 print STDERR "ALL : ", $allTrigs, "\n";
452 0         0 print STDERR "SAME: ", $sameTrigs, "\n";
453 0         0 print STDERR "DIFF: ", $allTrigs - $sameTrigs, "\n";
454 0         0 print STDERR "ACTSIM: ", $actSim, "\n";
455 0         0 print STDERR "PADDING: ", length $self->{padding}, "\n";
456 0         0 print STDERR "MINSIM: ", $self->{minSim}, "\n";
457 0         0 print STDERR "WARP: ", $curWarp, "\n\n";
458             }
459              
460             # count string as similar only if similarity exceeds minimal similarity
461 50 100       112 if ( $actSim > $curMinSim ) {
462 47         69 $newResult->{$_} = $actSim;
463 47         88 $strCnt++;
464             }
465             }
466              
467 53         275 return $strCnt;
468             }
469              
470             # compute similarity
471             sub _computeActSim {
472 50     50   51 my $sameTrigs = shift;
473 50         45 my $allTrigs = shift;
474 50         52 my $curWarp = shift;
475              
476 50         80 my $diffTrigs = -1; # number of different trigrams
477 50         45 my $actSim = -1; # similarity (0 - 1)
478              
479             # no warp here so skip the complicated stuff below
480 50 100       77 if ( $curWarp == 1 ) {
481 48         93 $actSim = $sameTrigs / $allTrigs;
482             }
483             else {
484             # we've got to take warp into account
485 2         4 $diffTrigs = $allTrigs - $sameTrigs;
486 2         30 $actSim =
487             ( ( $allTrigs**$curWarp ) - ( $diffTrigs**$curWarp ) ) /
488             ( $allTrigs**$curWarp );
489             }
490             }
491              
492             # Takes list of strings and puts them into an index of trigrams. KEY is a trigram,
493             # VALUE a list of strings containing that trigram. VALUE has two KEYS:
494             #
495             # trigs: count of trigram occurring in string
496             # len: length of string (if $keepAlNums, after applying s/\W//g)
497             #
498             # Parameters
499             #
500             # $list list of strings being the base of comparison
501             # $ignoreCase ignore case if 1
502             # $keepAlNums s/\W//g if 1
503             # $pad contains 0 - 2 blanks for padding
504             #
505             # Returns
506             #
507             # trigram index
508              
509             sub _trigramify {
510 43     43   52 my $list = shift;
511 43         48 my $ignoreCase = shift;
512 43         46 my $keepAlNums = shift;
513 43         45 my $pad = shift;
514 43         46 my $trigs = shift;
515 43         47 my $ngram = shift;
516 43         46 my $seen = shift;
517 43         41 my $tmpStr;
518             my $len;
519              
520 43         68 foreach (@$list) {
521 58         68 $tmpStr = $_;
522              
523 58 100       105 $tmpStr =~ s/\W//g if $keepAlNums;
524 58 100       152 $tmpStr = lc $tmpStr if $ignoreCase;
525              
526 58 100       109 next if exists $seen->{$tmpStr};
527 56         101 $seen->{$tmpStr} = 1;
528              
529 56         88 $tmpStr = $pad . $tmpStr . $pad;
530              
531 56         59 $len = length($tmpStr);
532              
533 56         140 for ( my $i = 0 ; $i < ( length($tmpStr) - $ngram + 1 ) ; $i++ ) {
534 350         936 $trigs->{ substr( $tmpStr, $i, $ngram ) }->{$_}->{trigs}++;
535 350         1036 $trigs->{ substr( $tmpStr, $i, $ngram ) }->{$_}->{len} = $len;
536             }
537             }
538              
539 43         318 return $trigs;
540             }
541              
542             sub _setParams {
543 39     39   43 my $params = shift;
544              
545             # set defaults, if not specified otherwise
546 39 100       105 $params->{ngram} = $DEFAULT_NGRAM_LEN unless exists $params->{ngram};
547 39 100       97 $params->{minSim} = $DEFAULT_MIN_SIM unless exists $params->{minSim};
548 39 100       95 $params->{warp} = $DEFAULT_WARP unless exists $params->{warp};
549 39 100       101 $params->{ignoreCase} = $DEFAULT_IGNORE_CASE
550             unless exists $params->{ignoreCase};
551 39 100       87 $params->{keepOnlyAlNums} = $DEFAULT_KEEP_ONLY_ALNUMS
552             unless exists $params->{keepOnlyAlNums};
553 39 100       103 $params->{padding} = $params->{ngram} - 1 unless exists $params->{padding};
554 39 100       121 $params->{debug} = $DEFAULT_DEBUG unless exists $params->{debug};
555             }
556              
557             1;
558             __END__