line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Fuzzy::PP; |
2
|
2
|
|
|
2
|
|
43063
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
141
|
|
3
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
52
|
|
4
|
2
|
|
|
2
|
|
2029
|
use utf8; |
|
2
|
|
|
|
|
27
|
|
|
2
|
|
|
|
|
12
|
|
5
|
|
|
|
|
|
|
require Exporter; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
8
|
|
|
|
|
|
|
our @EXPORT = qw/distance_edits/; |
9
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Get away with some XS for speed if available... |
12
|
|
|
|
|
|
|
local $@; |
13
|
|
|
|
|
|
|
eval { require List::Util; }; |
14
|
|
|
|
|
|
|
unless ($@) { |
15
|
|
|
|
|
|
|
*min = \&List::Util::min; |
16
|
|
|
|
|
|
|
} |
17
|
|
|
|
|
|
|
else { |
18
|
|
|
|
|
|
|
*min = \&_min; |
19
|
|
|
|
|
|
|
} |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub new { |
22
|
0
|
|
|
0
|
1
|
|
my ($class,$source,%args) = @_; |
23
|
|
|
|
|
|
|
|
24
|
0
|
0
|
|
|
|
|
my $self = { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25
|
|
|
|
|
|
|
source => $source, |
26
|
|
|
|
|
|
|
_last_distance => undef, |
27
|
|
|
|
|
|
|
_length_rejections => 0, |
28
|
|
|
|
|
|
|
_ualphabet_rejections => 0, |
29
|
|
|
|
|
|
|
_no_alphabet => 0, |
30
|
|
|
|
|
|
|
length => length($source), |
31
|
|
|
|
|
|
|
no_exact => defined($args{'no_exact'}) ? delete($args{'no_exact'}) : 0, |
32
|
|
|
|
|
|
|
trans => defined($args{'trans'}) ? delete($args{'trans'}) : 0, |
33
|
|
|
|
|
|
|
max_distance => defined($args{'max'}) ? delete($args{'max'}) :-1, |
34
|
|
|
|
|
|
|
}; |
35
|
|
|
|
|
|
|
|
36
|
0
|
|
|
|
|
|
bless( $self, $class ); |
37
|
|
|
|
|
|
|
|
38
|
0
|
|
|
|
|
|
return $self; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub _no_alphabet { |
42
|
0
|
|
|
0
|
|
|
my ($self,$onoff) = @_; |
43
|
0
|
0
|
0
|
|
|
|
$self->{_no_alphabet} = $onoff if ($onoff == 0 || $onoff == 1); |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
sub get_trans { |
47
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
48
|
0
|
|
|
|
|
|
return $self->{trans}; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub ualphabet_rejections { |
52
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
53
|
0
|
|
|
|
|
|
return $self->{_ualphabet_rejections}; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
sub length_rejections { |
57
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
58
|
0
|
|
|
|
|
|
return $self->{_length_rejections}; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub unicode_length { |
62
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
63
|
0
|
|
|
|
|
|
return length $self->{source}; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub last_distance { |
67
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
68
|
0
|
|
|
|
|
|
return $self->{_last_distance}; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub set_max_distance { |
72
|
0
|
|
|
0
|
1
|
|
my ($self,$max) = @_; |
73
|
|
|
|
|
|
|
# set_max_distance() with no args = no max |
74
|
0
|
0
|
|
|
|
|
$max = -1 if (!defined $max); |
75
|
0
|
0
|
|
|
|
|
$self->{max_distance} = $max if ($max >= -1); |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub get_max_distance { |
79
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
80
|
0
|
0
|
|
|
|
|
return ($self->{max_distance} == -1)?undef:$self->{max_distance}; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub transpositions_ok { |
84
|
0
|
|
|
0
|
1
|
|
my ($self,$onoff) = @_; |
85
|
0
|
0
|
0
|
|
|
|
$self->{trans} = $onoff if ($onoff == 0 || $onoff == 1); |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub no_exact { |
89
|
0
|
|
|
0
|
0
|
|
my ($self,$onoff) = @_; |
90
|
0
|
0
|
0
|
|
|
|
$self->{no_exact} = $onoff if ($onoff == 0 || $onoff == 1); |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub distance { |
94
|
0
|
|
|
0
|
1
|
|
my ($self,$target,$max) = @_; |
95
|
|
|
|
|
|
|
|
96
|
0
|
0
|
|
|
|
|
if($self->{source} eq $target) { |
97
|
0
|
0
|
|
|
|
|
return $self->{no_exact}?undef:0; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# $max overrides our objects max_distance |
101
|
|
|
|
|
|
|
# allows nearest() to change he max_distance dynamically for speed |
102
|
0
|
0
|
|
|
|
|
$max = defined($max)?$max:$self->{max_distance}; |
103
|
|
|
|
|
|
|
|
104
|
0
|
|
|
|
|
|
my $target_length = length($target); |
105
|
|
|
|
|
|
|
|
106
|
0
|
0
|
0
|
|
|
|
return ($self->{length}?$self->{length}:$target_length) |
|
|
0
|
|
|
|
|
|
107
|
|
|
|
|
|
|
if(!$target_length || !$self->{length}); |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# pass the string lengths to keep from calling length() again later |
110
|
0
|
0
|
|
|
|
|
if( $self->{trans} ) { |
111
|
0
|
|
|
|
|
|
my $score = _damerau($self->{source},$self->{length},$target,$target_length,$max); |
112
|
0
|
0
|
|
|
|
|
return ($score > 0)?$score:undef; |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
else { |
115
|
0
|
|
|
|
|
|
my $score = _levenshtein($self->{source},$self->{length},$target,$target_length,$max); |
116
|
0
|
0
|
|
|
|
|
return ($score > 0)?$score:undef; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub nearest { |
121
|
0
|
|
|
0
|
1
|
|
my ($self,$words) = @_; |
122
|
|
|
|
|
|
|
|
123
|
0
|
0
|
|
|
|
|
if ( ref $words eq ref [] ) { |
124
|
0
|
|
|
|
|
|
my $max = $self->{max_distance}; |
125
|
0
|
|
|
|
|
|
my $best_index = undef; |
126
|
|
|
|
|
|
|
|
127
|
0
|
|
|
|
|
|
for ( 0 .. $#{ $words } ) { |
|
0
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# compatability |
129
|
0
|
0
|
0
|
|
|
|
if( $max != -1 && abs($self->{length} - length($words->[$_])) > $max ) { |
130
|
0
|
|
|
|
|
|
$self->{_length_rejections}++; |
131
|
0
|
|
|
|
|
|
next; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# compatability |
135
|
0
|
0
|
0
|
|
|
|
if ( $max != -1 && _alphabet_difference($self->{source},$words->[$_]) > $max) { |
136
|
0
|
|
|
|
|
|
$self->{_ualphabet_rejections}++; |
137
|
0
|
|
|
|
|
|
next; |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
my $d = $self->distance($words->[$_], $max); |
142
|
|
|
|
|
|
|
|
143
|
0
|
0
|
0
|
|
|
|
if( !defined($d) ) { |
|
|
0
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# no_exact => 1 match or $d > $max |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
elsif( $max == -1 || $d < $max ) { |
147
|
|
|
|
|
|
|
# better match found |
148
|
0
|
|
|
|
|
|
$self->{_last_distance} = $max = $d; |
149
|
0
|
|
|
|
|
|
$best_index = $_; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
0
|
|
|
|
|
|
return $best_index; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
1; |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
sub _levenshtein { |
160
|
0
|
|
|
0
|
|
|
my ($source,$source_length,$target,$target_length,$max_distance) = @_; |
161
|
|
|
|
|
|
|
|
162
|
0
|
|
|
|
|
|
my @scores;; |
163
|
0
|
|
|
|
|
|
my ($i,$j,$large_value); |
164
|
|
|
|
|
|
|
|
165
|
0
|
0
|
|
|
|
|
if ($max_distance >= 0) { |
166
|
0
|
|
|
|
|
|
$large_value = $max_distance + 1; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
else { |
169
|
0
|
0
|
|
|
|
|
if ($target_length > $source_length) { |
170
|
0
|
|
|
|
|
|
$large_value = $target_length; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
else { |
173
|
0
|
|
|
|
|
|
$large_value = $source_length; |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
0
|
|
|
|
|
|
for ($j = 0; $j <= $target_length; $j++) { |
178
|
0
|
|
|
|
|
|
$scores[0][$j] = $j; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
0
|
|
|
|
|
|
for ($i = 1; $i <= $source_length; $i++) { |
182
|
0
|
|
|
|
|
|
my ($col_min,$next,$prev); |
183
|
0
|
|
|
|
|
|
my $c1 = substr($source,$i-1,1); |
184
|
0
|
|
|
|
|
|
my $min_j = 1; |
185
|
0
|
|
|
|
|
|
my $max_j = $target_length; |
186
|
|
|
|
|
|
|
|
187
|
0
|
0
|
|
|
|
|
if ($max_distance >= 0) { |
188
|
0
|
0
|
|
|
|
|
if ($i > $max_distance) { |
189
|
0
|
|
|
|
|
|
$min_j = $i - $max_distance; |
190
|
|
|
|
|
|
|
} |
191
|
0
|
0
|
|
|
|
|
if ($target_length > $max_distance + $i) { |
192
|
0
|
|
|
|
|
|
$max_j = $max_distance + $i; |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
0
|
|
|
|
|
|
$col_min = $large_value; |
197
|
0
|
|
|
|
|
|
$next = $i % 2; |
198
|
|
|
|
|
|
|
|
199
|
0
|
0
|
|
|
|
|
if ($next == 1) { |
200
|
0
|
|
|
|
|
|
$prev = 0; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
else { |
203
|
0
|
|
|
|
|
|
$prev = 1; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
|
$scores[$next][0] = $i; |
207
|
|
|
|
|
|
|
|
208
|
0
|
|
|
|
|
|
for ($j = 1; $j <= $target_length; $j++) { |
209
|
0
|
0
|
0
|
|
|
|
if ($j < $min_j || $j > $max_j) { |
210
|
0
|
|
|
|
|
|
$scores[$next][$j] = $large_value; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
else { |
213
|
0
|
|
|
|
|
|
my $c2 = substr($target,$j-1,1); |
214
|
|
|
|
|
|
|
|
215
|
0
|
0
|
|
|
|
|
if ($c1 eq $c2) { |
216
|
0
|
|
|
|
|
|
$scores[$next][$j] = $scores[$prev][$j-1]; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
else { |
219
|
0
|
|
|
|
|
|
my $delete = $scores[$prev][$j] + 1;#[% delete_cost %]; |
220
|
0
|
|
|
|
|
|
my $insert = $scores[$next][$j-1] + 1;#[% insert_cost %]; |
221
|
0
|
|
|
|
|
|
my $substitute = $scores[$prev][$j-1] + 1;#[% substitute_cost %]; |
222
|
0
|
|
|
|
|
|
my $minimum = $delete; |
223
|
|
|
|
|
|
|
|
224
|
0
|
0
|
|
|
|
|
if ($insert < $minimum) { |
225
|
0
|
|
|
|
|
|
$minimum = $insert; |
226
|
|
|
|
|
|
|
} |
227
|
0
|
0
|
|
|
|
|
if ($substitute < $minimum) { |
228
|
0
|
|
|
|
|
|
$minimum = $substitute; |
229
|
|
|
|
|
|
|
} |
230
|
0
|
|
|
|
|
|
$scores[$next][$j] = $minimum; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
0
|
0
|
|
|
|
|
if ($scores[$next][$j] < $col_min) { |
235
|
0
|
|
|
|
|
|
$col_min = $scores[$next][$j]; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
0
|
0
|
|
|
|
|
if ($max_distance >= 0) { |
240
|
0
|
0
|
|
|
|
|
if ($col_min > $max_distance) { |
241
|
0
|
|
|
|
|
|
return -1; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
0
|
|
|
|
|
|
return $scores[$source_length % 2][$target_length]; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub _damerau { |
250
|
0
|
|
|
0
|
|
|
my ($source,$source_length,$target,$target_length,$max_distance) = @_; |
251
|
|
|
|
|
|
|
|
252
|
0
|
|
|
|
|
|
my $lengths_max = $source_length + $target_length; |
253
|
0
|
|
|
|
|
|
my ($swap_count,$swap_score,$target_char_count); |
254
|
0
|
|
|
|
|
|
my $dictionary_count = {}; #create dictionary to keep character count |
255
|
0
|
|
|
|
|
|
my @scores; |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# init values outside of work loops |
258
|
0
|
|
|
|
|
|
$scores[0][0] = $scores[1][0] = $scores[0][1] = $lengths_max; |
259
|
0
|
|
|
|
|
|
$scores[1][1] = 0; |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Work Loops |
262
|
0
|
|
|
|
|
|
foreach my $source_index ( 1 .. $source_length ) { |
263
|
0
|
|
|
|
|
|
$swap_count = 0; |
264
|
0
|
|
|
|
|
|
$dictionary_count->{ substr( $source, $source_index - 1, 1 ) } = 0; |
265
|
0
|
|
|
|
|
|
$scores[ $source_index + 1 ][1] = $source_index; |
266
|
0
|
|
|
|
|
|
$scores[ $source_index + 1 ][0] = $lengths_max; |
267
|
|
|
|
|
|
|
|
268
|
0
|
|
|
|
|
|
foreach my $target_index ( 1 .. $target_length ) { |
269
|
0
|
0
|
|
|
|
|
if ( $source_index == 1 ) { |
270
|
0
|
|
|
|
|
|
$dictionary_count->{ substr( $target, $target_index - 1, 1 ) } = 0; |
271
|
0
|
|
|
|
|
|
$scores[1][ $target_index + 1 ] = $target_index; |
272
|
0
|
|
|
|
|
|
$scores[0][ $target_index + 1 ] = $lengths_max; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
$target_char_count = |
276
|
0
|
|
|
|
|
|
$dictionary_count->{ substr( $target, $target_index - 1, 1 ) }; |
277
|
0
|
|
|
|
|
|
$swap_score = $scores[$target_char_count][$swap_count] + |
278
|
|
|
|
|
|
|
( $source_index - $target_char_count - 1 ) + 1 + |
279
|
|
|
|
|
|
|
( $target_index - $swap_count - 1 ); |
280
|
|
|
|
|
|
|
|
281
|
0
|
0
|
|
|
|
|
if ( |
282
|
|
|
|
|
|
|
substr( $source, $source_index - 1, 1 ) ne |
283
|
|
|
|
|
|
|
substr( $target, $target_index - 1, 1 ) ) |
284
|
|
|
|
|
|
|
{ |
285
|
0
|
|
|
|
|
|
$scores[ $source_index + 1 ][ $target_index + 1 ] = min( |
286
|
|
|
|
|
|
|
$scores[$source_index][$target_index]+1, |
287
|
|
|
|
|
|
|
$scores[ $source_index + 1 ][$target_index]+1, |
288
|
|
|
|
|
|
|
$scores[$source_index][ $target_index + 1 ]+1, |
289
|
|
|
|
|
|
|
$swap_score |
290
|
|
|
|
|
|
|
); |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
else { |
293
|
0
|
|
|
|
|
|
$swap_count = $target_index; |
294
|
|
|
|
|
|
|
|
295
|
0
|
|
|
|
|
|
$scores[ $source_index + 1 ][ $target_index + 1 ] = min( |
296
|
|
|
|
|
|
|
$scores[$source_index][$target_index], $swap_score |
297
|
|
|
|
|
|
|
); |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# This is where the $max_distance check goes ideally, but it doesn't pass tests |
302
|
|
|
|
|
|
|
#if ( $max_distance != -1 && $max_distance < $scores[ $source_index + 1 ][ $target_length + 1 ] ) |
303
|
|
|
|
|
|
|
#{ |
304
|
|
|
|
|
|
|
# return -1; |
305
|
|
|
|
|
|
|
#} |
306
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
|
$dictionary_count->{ substr( $source, $source_index - 1, 1 ) } = |
308
|
|
|
|
|
|
|
$source_index; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
0
|
0
|
0
|
|
|
|
return -1 if ($max_distance != -1 && $scores[ $source_length + 1 ][ $target_length + 1 ] > $max_distance); |
312
|
0
|
|
|
|
|
|
return $scores[ $source_length + 1 ][ $target_length + 1 ]; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
# this function is very unoptimized |
316
|
|
|
|
|
|
|
sub _alphabet_difference { |
317
|
0
|
|
|
0
|
|
|
my $source = shift; |
318
|
0
|
|
|
|
|
|
my $target = shift; |
319
|
0
|
|
|
|
|
|
my %dict; |
320
|
0
|
|
|
|
|
|
my $missing = 0; |
321
|
|
|
|
|
|
|
|
322
|
0
|
|
|
|
|
|
for (0 .. length($source)) { |
323
|
0
|
|
|
|
|
|
my $char = substr($source,$_,1); |
324
|
0
|
0
|
0
|
|
|
|
$missing++ if(!exists $dict{$char} && $target !~ $char); |
325
|
0
|
|
|
|
|
|
$dict{$char} = 1; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
|
328
|
0
|
|
|
|
|
|
return $missing; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
sub _min { |
332
|
0
|
|
|
0
|
|
|
my $min = shift; |
333
|
0
|
0
|
|
|
|
|
return $min if not @_; |
334
|
|
|
|
|
|
|
|
335
|
0
|
|
|
|
|
|
my $next = shift; |
336
|
0
|
0
|
|
|
|
|
unshift @_, $min < $next ? $min : $next; |
337
|
0
|
|
|
|
|
|
goto &_min; |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
__END__ |