File Coverage

blib/lib/String/KeyboardDistance.pm
Criterion Covered Total %
statement 107 107 100.0
branch 64 72 88.8
condition 8 10 80.0
subroutine 13 13 100.0
pod 11 11 100.0
total 203 213 95.3


line stmt bran cond sub pod time code
1             package String::KeyboardDistance;
2              
3             require 5.005_62;
4 1     1   1174 use strict;
  1         1  
  1         32  
5 1     1   5 use warnings;
  1         4  
  1         3713  
6              
7             require Exporter;
8             our @ISA = qw(Exporter);
9              
10             =head1 NAME
11              
12             String::KeyboardDistance - String Comparison Algorithm
13              
14             =head1 SYNOPSIS
15              
16             use String::KeyboardDistance qw(:all);
17             my $s1 = 'Apple';
18             my $s2 = 'Wople';
19              
20             # compute a match probability
21             my $pr = qwerty_keyboard_distance_match('Apple','Wople');
22              
23             # find the keyboard distance between two strings
24             my $dst = qwerty_keyboard_distance('IBM','HAL');
25              
26             # find the keyboard distance between two characters
27             $dst = qwerty_char_distance('a','v');
28              
29             print "maximum distance: $qwerty_max_distance\n";
30              
31             =head1 DESCRIPTION
32              
33             This module implmements a version of keyboard distance for fuzzy
34             string matching. Keyboard distance is a measure of the physical
35             distance between two keys on a keyboard. For example, 'g' has a
36             distance of 1 from the keys 'r', 't', 'y', 'f', 'h', 'v', 'b',
37             and 'n'. Immediate diagonals (like ''r, 'y', 'v', and 'n') are
38             considered to have a distance of 1 instead of 1.414 to help to
39             prevent horizontal/vertical bias.
40              
41             A match probability between two strings is computed from the total
42             distances between corresponding characters divided by the length of the
43             longer string multiplied by the maximum distance between the two furthest
44             keys on the keyboard.
45              
46             The functions in this module use a simple grid of keys. For the qwerty
47             mapping, the grid is similar to the following:
48              
49             | 0 1 2 3 4 5 6 7 8 9 10 11 12 13
50             --+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
51             0 | ` | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 | - | = | |
52             1 | | q | w | e | r | t | y | u | i | o | p | [ | ] | \ |
53             2 | | a | s | d | f | g | h | j | k | l | ; | ' | | |
54             3 | | z | x | c | v | b | n | m | , | . | / | | | |
55             --+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
56              
57             The grids for both qwerty and dvorak are based on PC style keyboards.
58             Shifted characters have the same coordinates (a and A, 6 and ^).
59              
60             =head2 EXPORT
61              
62             This module exports no symbols by default. The following functions
63             are available for export through EXPORT_OK:
64              
65             build_qwerty_map
66             max_qwerty_distance
67             qwerty_char_distance
68             qwerty_keyboard_distance
69             qwerty_keyboard_distance_match
70              
71             build_dvorak_map
72             max_dvorak_distance
73             dvorak_char_distance
74             dvorak_keyboard_distance
75             dvorak_keyboard_distance_match
76              
77             grid_distance
78              
79             The following varialbes are availalbe for export through EXPORT_OK:
80              
81             @qwerty_grid
82             $qwerty_map
83             $qwerty_max_distance
84              
85             @dvorak_grid
86             $dvorak_map
87             $dvorak_max_distance
88              
89             Additionaly, this module supports the following export tags:
90              
91             :Functions - import all functions
92             :Variables - import all variables
93             :all - import both functions and variables
94              
95             =cut
96              
97             our @EXP_SUBS = qw(
98             build_qwerty_map
99             max_qwerty_distance
100             qwerty_char_distance
101             qwerty_keyboard_distance
102             qwerty_keyboard_distance_match
103              
104             build_dvorak_map
105             max_dvorak_distance
106             dvorak_char_distance
107             dvorak_keyboard_distance
108             dvorak_keyboard_distance_match
109              
110             grid_distance
111             );
112              
113             our @EXP_VARS = qw(
114             @qwerty_grid
115             $qwerty_map
116             $qwerty_max_distance
117              
118             @dvorak_grid
119             $dvorak_map
120             $dvorak_max_distance
121             );
122              
123             our %EXPORT_TAGS = (
124             'all' => [ @EXP_SUBS, @EXP_VARS ],
125             'Functions' => \@EXP_SUBS,
126             'Variables' => \@EXP_VARS,
127             );
128              
129             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
130             our @EXPORT = qw();
131             our $VERSION = '1.00';
132              
133             our @qwerty_grid = (
134             [[ split(//,q{`1234567890-= }) ],[ split(//,q{~!@#$%^&*()_+ })]],
135             [[ split(//,q{ qwertyuiop[]\\})],[ split(//,q( QWERTYUIOP{}|))]],
136             [[ split(//,q{ asdfghjkl;' }) ],[ split(//,q{ ASDFGHJKL:" })]],
137             [[ split(//,q{ zxcvbnm,./ }) ],[ split(//,q{ ZXCVBNM<>? })]],
138             );
139             our $qwerty_map = build_qwerty_map();
140             our $qwerty_max_distance = max_qwerty_distance();
141              
142             our @dvorak_grid = (
143             [[ split(//,q{`1234567890[] }) ],[ split(//,q"~!@#$%^&*(){} ")]],
144             [[ split(//,q{ ',.pyfgcrl/=\\})],[ split(//,q{ "<>PYFGCRL?+|})]],
145             [[ split(//,q{ aoeuidhtns- }) ],[ split(//,q{ AOEUIDHTNS_ })]],
146             [[ split(//,q{ ;qjkxbmwvz }) ],[ split(//,q{ :QJKXBMWVZ })]],
147             );
148             our $dvorak_map = build_dvorak_map();
149             our $dvorak_max_distance = max_dvorak_distance();
150              
151              
152             =head1 API REFERENCE
153              
154              
155             =head2 build_qwerty_map
156              
157             param: array reference to receive the map [optional]
158             return: the map (array reference)
159              
160             This function builds a map of each character to its corresponding
161             location on the keyboard. The location is derived by looking at the
162             keyboard as a simple grid in which the location of keys on the keyboard
163             are considereed to be the same as their shifted values. The following
164             keys are considered to have the same location:
165              
166             1 and !
167             r and r
168             / and ?
169              
170             The map is an array, where the index is the value returned by chr() for
171             the character at that location. The value is an array ref describing
172             the location of the character on the keyboard. The first element is the
173             y position, the second is the x, and the third represents the shift value
174             (0 for non-shifted, 1 for shifted). All non-keyable characters, including
175             tabs and spaces, will have undef values in the map, meaning they have no
176             point on the grid. These non-key characterss should be considered to be
177             the maximum distance away from any other character except themselves.
178              
179             The map is constructed at run-time from the @qwerty_grid package global,
180             and cached in the $qwerty_map package global.
181              
182             =cut
183              
184             sub build_qwerty_map
185             {
186 1   50 1 1 18 my $map = shift||[];
187 1         3 my($i,$j);
188 1         5 for($i = 0; $i < @qwerty_grid; ++$i) {
189 4         6 for($j = 0; $j < @{$qwerty_grid[$i][0]}; ++$j) {
  58         133  
190 54 50       102 next unless defined $qwerty_grid[$i][0][$j];
191 54 100       114 next if ' ' eq $qwerty_grid[$i][0][$j];
192             # print "building map0: $i,$j: ",$qwerty_grid[$i][0][$j],':',
193             # ord($qwerty_grid[$i][0][$j]),"\n";
194 47         127 $map->[ord $qwerty_grid[$i][0][$j]] = [$i,$j,0];
195             }
196 4         7 for($j = 0; $j < @{$qwerty_grid[$i][1]}; ++$j) {
  60         137  
197 56 50       106 next unless defined $qwerty_grid[$i][1][$j];
198 56 100       118 next if ' ' eq $qwerty_grid[$i][1][$j];
199             #print "building map1: $i,$j: ",$qwerty_grid[$i][1][$j],':',
200             # ord($qwerty_grid[$i][1][$j]),"\n";
201 47         135 $map->[ord $qwerty_grid[$i][1][$j]] = [$i,$j,1];
202             }
203             }
204 1         3 return $map;
205             }
206              
207             =head2 build_dvorak_map
208              
209             param: array reference to receive the map [optional]
210             return: the map (array reference)
211              
212             This function is identical to build_qwerty_map, with the exception
213             that it builds a map for dvorak keyboards, and the map is constructed
214             from the @dvorak_grid package global, and cached in the $dvorak_map
215             package global.
216              
217             =cut
218              
219             sub build_dvorak_map
220             {
221 1   50 1 1 7 my $map = shift||[];
222 1         2 my($i,$j);
223 1         3 for($i = 0; $i < @dvorak_grid; ++$i) {
224 4         5 for($j = 0; $j < @{$dvorak_grid[$i][0]}; ++$j) {
  60         149  
225 56 50       101 next unless defined $dvorak_grid[$i][0][$j];
226 56 100       104 next if ' ' eq $dvorak_grid[$i][0][$j];
227             # print "building map0: $i,$j: ",$dvorak_grid[$i][0][$j],':',
228             # ord($dvorak_grid[$i][0][$j]),"\n";
229 47         107 $map->[ord $dvorak_grid[$i][0][$j]] = [$i,$j,0];
230             }
231 4         7 for($j = 0; $j < @{$dvorak_grid[$i][1]}; ++$j) {
  60         100  
232 56 50       87 next unless defined $dvorak_grid[$i][1][$j];
233 56 100       92 next if ' ' eq $dvorak_grid[$i][1][$j];
234             #print "building map1: $i,$j: ",$dvorak_grid[$i][1][$j],':',
235             # ord($dvorak_grid[$i][1][$j]),"\n";
236 47         101 $map->[ord $dvorak_grid[$i][1][$j]] = [$i,$j,1];
237             }
238             }
239 1         3 return $map;
240             }
241              
242             =head2 max_qwerty_distance
243              
244             return: maximum distance
245              
246             This function dynamicly computes the maximum distance between
247             keys in the qwerty map. The maximum key distance is stored in
248             the $qwerty_max_distance package global.
249              
250             =cut
251              
252             sub max_qwerty_distance
253             {
254 1     1 1 2 my $max = 0;
255 1         5 for(my $i = 0; $i < @$qwerty_map; ++$i) {
256 127 100       263 next unless $qwerty_map->[$i];
257 94         208 for(my $j = 0; $j < @$qwerty_map; ++$j) {
258 11938 100       28376 next unless $qwerty_map->[$j];
259 8836 100       15487 next if $i == $j;
260 8742         19487 my $dst = qwerty_char_distance(chr($i),chr($j));
261 8742 100       36715 $max = $dst if $dst > $max;
262             }
263             }
264 1         4 return $max;
265             }
266              
267             =head2 max_dvorak_distance
268              
269             return: maximum distance
270              
271             This function dynamicly computes the maximum distance between
272             keys in the dvorak map. The maximum key distance is stored in
273             the $dvorak_max_distance package global.
274              
275             =cut
276              
277             sub max_dvorak_distance
278             {
279 1     1 1 2 my $max = 0;
280 1         5 for(my $i = 0; $i < @$dvorak_map; ++$i) {
281 127 100       240 next unless $dvorak_map->[$i];
282 94         176 for(my $j = 0; $j < @$dvorak_map; ++$j) {
283 11938 100       22795 next unless $dvorak_map->[$j];
284 8836 100       13632 next if $i == $j;
285 8742         16372 my $dst = dvorak_char_distance(chr($i),chr($j));
286 8742 100       27221 $max = $dst if $dst > $max;
287             }
288             }
289 1         7 return $max;
290             }
291              
292             =head2 qwerty_char_distance
293              
294             param: char 1
295             param: char 2
296             return: distance
297              
298             This function computes the distance between the two characters passed
299             on a qwerty keyboard.
300              
301             =cut
302              
303             sub qwerty_char_distance
304             {
305 8926 100   8926 1 18158 return 0 if $_[0] eq $_[1]; # return 0 if same, regardless of map
306             # if either of the chars are not in the map, return the max distance
307 8789 50       16127 return $qwerty_max_distance unless $qwerty_map->[ord $_[0]];
308 8789 50       15827 return $qwerty_max_distance unless $qwerty_map->[ord $_[1]];
309 8789         13668 return grid_distance(
310 8789         17877 @{$qwerty_map->[ord $_[0]]}[0,1],
311 8789         9352 @{$qwerty_map->[ord $_[1]]}[0,1],
312             );
313             }
314              
315             =head2 dvorak_char_distance
316              
317             param: char 1
318             param: char 2
319             return: distance
320              
321             This function computes the distance between the two characters passed
322             on a dvorak keyboard.
323              
324             =cut
325              
326             sub dvorak_char_distance
327             {
328 8926 100   8926 1 15625 return 0 if $_[0] eq $_[1]; # return 0 if same, regardless of map
329             # if either of the chars are not in the map, return the max distance
330 8789 50       14668 return $dvorak_max_distance unless $dvorak_map->[ord $_[0]];
331 8789 50       13633 return $dvorak_max_distance unless $dvorak_map->[ord $_[1]];
332 8789         11952 return grid_distance(
333 8789         15861 @{$dvorak_map->[ord $_[0]]}[0,1],
334 8789         7889 @{$dvorak_map->[ord $_[1]]}[0,1],
335             );
336             }
337              
338             =head2 grid_distance
339              
340             param: x1 - point 1's x coordinate
341             param: y1 - point 1's y coordinate
342             param: x2 - point 2's x coordinate
343             param: y2 - point 2's y coordinate
344             return: distance between points
345              
346             This function returns the distance between two points. If the two
347             points have an x distance of 1, and a y distance of 1, then they
348             are considered to be a distance of 1 apart. This is meant to help
349             prevent horizontal/vertical bias in the distancing function. Otherwise
350             we use the following formula:
351              
352             sqrt( (x1 - x2)**2 + (y1 - y2)**2 );
353              
354             =cut
355              
356             sub grid_distance
357             {
358 17578 100 100 17578 1 44300 return 0 if($_[0] == $_[2] && $_[1] == $_[3]); # points are same
359 17390 100 100     53776 return 1 if(abs($_[0] - $_[2]) == 1 && abs($_[1] - $_[3]) == 1); # same as 1
360 16358         37192 sqrt(($_[0] - $_[2])**2 + ($_[1] - $_[3])**2);
361             }
362              
363             =head2 qwerty_keyboard_distance
364              
365             param: string1
366             param: string2
367             return: distance
368              
369             Returns the sum of the distances between corresponding characters
370             in the two strings. If one string is longer than the other the
371             remaining characters are counted as having the same value as the
372             maximum distance.
373              
374             =cut
375              
376             sub qwerty_keyboard_distance
377             {
378 29     29 1 35 my($s1,$s2) = @_;
379 29         38 my($l1,$l2) = (length($s1),length($s2));
380 29 100       40 my $short = $l1 < $l2 ? $s1 : $s2;
381 29 100       46 my $long = $l1 < $l2 ? $s2 : $s1;
382 29 100       38 my $ls = $l1 < $l2 ? $l1 : $l2;
383 29 100       40 my $ll = $l1 < $l2 ? $l2 : $l1;
384              
385 29         29 my $tot = 0;
386 29         24 my $i;
387 29         62 for($i = 0; $i < $ls; ++$i) {
388             #print "calling distance(",substr($short,$i,1),',',$long,$i,1,")\n";
389 184         367 $tot += abs(qwerty_char_distance(substr($short,$i,1),substr($long,$i,1)));
390             }
391              
392 29         59 while($i < $ll) {
393 12         12 $tot += $qwerty_max_distance;
394 12         19 ++$i;
395             }
396              
397 29         60 return $tot;
398             }
399              
400             =head2 qwerty_keyboard_distance_match
401              
402             param: string1
403             param: string2
404             return: probability of match
405              
406             The probability of a match is:
407              
408             Pr = 1 - ( D / (L * M) )
409              
410             Where D is the distance between the two strings, L is the length of
411             the longer string, and M is the maximum character distance.
412              
413             =cut
414              
415             sub qwerty_keyboard_distance_match
416             {
417 29     29 1 1114 my($s1,$s2) = @_;
418 29         64 my($l1,$l2) = (length($s1),length($s2));
419 29 100       58 my $ls = $l1 < $l2 ? $l1 : $l2;
420 29 100       37 my $ll = $l1 < $l2 ? $l2 : $l1;
421 29         47 my $dst = qwerty_keyboard_distance($s1,$s2);
422 29         90 return (1 - ($dst/($ll*$qwerty_max_distance)));
423             }
424              
425             =head2 dvorak_keyboard_distance
426              
427             param: string1
428             param: string2
429             return: distance
430              
431             Returns the sum of the distances between corresponding characters
432             in the two strings. If one string is longer than the other the
433             remaining characters are counted as having the same value as the
434             maximum distance.
435              
436             =cut
437              
438             sub dvorak_keyboard_distance
439             {
440 29     29 1 32 my($s1,$s2) = @_;
441 29         34 my($l1,$l2) = (length($s1),length($s2));
442 29 100       43 my $short = $l1 < $l2 ? $s1 : $s2;
443 29 100       40 my $long = $l1 < $l2 ? $s2 : $s1;
444 29 100       49 my $ls = $l1 < $l2 ? $l1 : $l2;
445 29 100       36 my $ll = $l1 < $l2 ? $l2 : $l1;
446              
447 29         27 my $tot = 0;
448 29         28 my $i = 0;
449 29         55 for($i = 0; $i < $ls; ++$i) {
450             #print "calling distance(",substr($short,$i,1),',',$long,$i,1,")\n";
451 184         376 $tot += abs(dvorak_char_distance(substr($short,$i,1),substr($long,$i,1)));
452             }
453              
454 29         54 while($i < $ll) {
455 12         14 $tot += $dvorak_max_distance;
456 12         22 ++$i;
457             }
458 29         56 return $tot;
459             }
460              
461             =head2 dvorak_keyboard_distance_match
462              
463             param: string1
464             param: string2
465             return: probability of match
466              
467             The probability of a match is computed in the same way as
468             for qwerty_keyboard_distance_match().
469              
470             =cut
471              
472             sub dvorak_keyboard_distance_match
473             {
474 29     29 1 82 my($s1,$s2) = @_;
475 29         37 my($l1,$l2) = (length($s1),length($s2));
476 29 100       51 my $ls = $l1 < $l2 ? $l1 : $l2;
477 29 100       43 my $ll = $l1 < $l2 ? $l2 : $l1;
478 29         40 my $dst = dvorak_keyboard_distance($s1,$s2);
479 29         74 return 1 - ($dst/($ll*$dvorak_max_distance));
480             }
481              
482             1;
483             __END__