| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Math::Vector::Similarity; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Cosine similarity, euclidean distance and other vector comparison functions |
|
3
|
|
|
|
|
|
|
our $VERSION = '0.001'; |
|
4
|
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
525794
|
use strict; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
66
|
|
|
6
|
2
|
|
|
2
|
|
9
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
89
|
|
|
7
|
2
|
|
|
2
|
|
10
|
use Carp qw( croak ); |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
94
|
|
|
8
|
2
|
|
|
2
|
|
8
|
use Exporter 'import'; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
1006
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
|
11
|
|
|
|
|
|
|
cosine_similarity |
|
12
|
|
|
|
|
|
|
cosine_distance |
|
13
|
|
|
|
|
|
|
euclidean_distance |
|
14
|
|
|
|
|
|
|
dot_product |
|
15
|
|
|
|
|
|
|
normalize |
|
16
|
|
|
|
|
|
|
); |
|
17
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( all => \@EXPORT_OK ); |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub dot_product { |
|
20
|
5
|
|
|
5
|
1
|
295663
|
my ( $a, $b ) = @_; |
|
21
|
5
|
100
|
|
|
|
169
|
croak "vectors must have same dimensions" |
|
22
|
|
|
|
|
|
|
unless @$a == @$b; |
|
23
|
4
|
|
|
|
|
4
|
my $sum = 0; |
|
24
|
4
|
|
|
|
|
18
|
$sum += $a->[$_] * $b->[$_] for 0..$#$a; |
|
25
|
4
|
|
|
|
|
17
|
return $sum; |
|
26
|
|
|
|
|
|
|
} |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub normalize { |
|
30
|
3
|
|
|
3
|
1
|
464
|
my ( $vec ) = @_; |
|
31
|
3
|
|
|
|
|
3
|
my $norm = 0; |
|
32
|
3
|
|
|
|
|
9
|
$norm += $_ * $_ for @$vec; |
|
33
|
3
|
|
|
|
|
4
|
$norm = sqrt($norm); |
|
34
|
3
|
100
|
|
|
|
9
|
return $vec if $norm == 0; |
|
35
|
2
|
|
|
|
|
4
|
return [ map { $_ / $norm } @$vec ]; |
|
|
5
|
|
|
|
|
10
|
|
|
36
|
|
|
|
|
|
|
} |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub cosine_similarity { |
|
40
|
11
|
|
|
11
|
1
|
3754
|
my ( $a, $b ) = @_; |
|
41
|
11
|
100
|
|
|
|
99
|
croak "vectors must have same dimensions" |
|
42
|
|
|
|
|
|
|
unless @$a == @$b; |
|
43
|
10
|
|
|
|
|
18
|
my ( $dot, $norm_a, $norm_b ) = ( 0, 0, 0 ); |
|
44
|
10
|
|
|
|
|
23
|
for my $i (0..$#$a) { |
|
45
|
793
|
|
|
|
|
792
|
$dot += $a->[$i] * $b->[$i]; |
|
46
|
793
|
|
|
|
|
772
|
$norm_a += $a->[$i] * $a->[$i]; |
|
47
|
793
|
|
|
|
|
914
|
$norm_b += $b->[$i] * $b->[$i]; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
10
|
|
|
|
|
20
|
my $denom = sqrt($norm_a) * sqrt($norm_b); |
|
50
|
10
|
100
|
|
|
|
21
|
return 0 if $denom == 0; |
|
51
|
9
|
|
|
|
|
26
|
return $dot / $denom; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub cosine_distance { |
|
56
|
3
|
|
|
3
|
1
|
355
|
my ( $a, $b ) = @_; |
|
57
|
3
|
|
|
|
|
5
|
return 1 - cosine_similarity($a, $b); |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub euclidean_distance { |
|
62
|
4
|
|
|
4
|
1
|
880
|
my ( $a, $b ) = @_; |
|
63
|
4
|
100
|
|
|
|
136
|
croak "vectors must have same dimensions" |
|
64
|
|
|
|
|
|
|
unless @$a == @$b; |
|
65
|
3
|
|
|
|
|
5
|
my $sum = 0; |
|
66
|
3
|
|
|
|
|
7
|
for my $i (0..$#$a) { |
|
67
|
6
|
|
|
|
|
7
|
my $d = $a->[$i] - $b->[$i]; |
|
68
|
6
|
|
|
|
|
9
|
$sum += $d * $d; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
3
|
|
|
|
|
5
|
return sqrt($sum); |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
1; |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
__END__ |