File Coverage

blib/lib/Text/SimSearch.pm

Criterion	Covered	Total	%
statement	18	111	16.2
branch	0	10	0.0
condition	0	2	0.0
subroutine	6	13	46.1
pod	0	5	0.0
total	24	141	17.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::SimSearch;
2	1			1		5	use strict;
	1					1
	1					33
3	1			1		4	use warnings;
	1					1
	1					29
4	1			1		4	use strict;
	1					2
	1					26
5	1			1		4	use warnings;
	1					1
	1					32
6	1			1		757	use Storable qw( nstore retrieve);
	1					3148
	1					85
7	1			1		679	use Time::HiRes qw(gettimeofday tv_interval);
	1					1328
	1					5
8
9							our $VERSION = '0.01_002';
10
11							sub new {
12	0			0	0		my $class = shift;
13	0						my $self = bless {@_}, $class;
14	0						return $self;
15							}
16
17							sub add_item_from_file {
18	0			0	0		my $self = shift;
19	0						my $file = shift;
20
21	0						my $tmp_data;
22							my $labels;
23	0	0					open my $fh, "<", $file
24							or die("can not open $file");
25	0						my $i = 0;
26	0						while (<$fh>) {
27	0						chomp $_;
28	0						my @f = split "\t", $_;
29	0						my $label = shift @f;
30	0						$labels->[$i] = $label;
31
32	0						my %vec = @f;
33	0						my $vec = $self->_unit_length( \%vec );
34	0						while ( my ( $key, $val ) = each %$vec ) {
35	0	0					next unless $val > 0;
36	0						$tmp_data->{$key}->{$i} = $val;
37							}
38
39	0						$i++;
40							}
41	0						close($fh);
42
43							# make "Posting-Lists" from $tmp_data created above.
44							# Note: concatenate the label-ID and the weight as string,
45							# and then convert it to interger value.
46
47	0						my $posting_lists;
48
49	0						my $key_scale = int( log( int @$labels ) / log(10) ) + 1;
50	0						my $val_scale = 6;
51
52	0						$i = 0;
53	0						while ( my ( $key, $ref ) = each %$tmp_data ) {
54
55	0						my @array;
56	0						for ( keys %$ref ) {
57
58	0						my $label_id = $_;
59	0						my $weight = $ref->{$label_id};
60
61							# convert the weight to integer value.
62	0						my $condition = '%.' . $val_scale . 'f';
63	0						my $integer = sprintf( $condition, $weight ) * ( 10**$val_scale );
64
65							# convert the label_id to integer value, and connect to $interger.
66	0						$condition = '%0' . $key_scale . 'd';
67	0						$integer .= sprintf( $condition, $label_id );
68	0						push @array, $integer;
69							}
70
71							# cut down posting-list to suitable size and compress it.
72	0						my @tmp;
73	0						my $n = 0;
74							LABEL:
75	0						for ( sort { $b <=> $a } @array ) {
	0
76	0						my $p = pack( "w*", $_ );
77	0						push @tmp, $p;
78	0	0					last LABEL if ++$n == 1000;
79							}
80
81	0						$posting_lists->{$key} = \@tmp;
82
83	0						$i++;
84							}
85
86							$self->{index_data} = {
87	0						posting_lists => $posting_lists,
88							labels => $labels,
89							key_scale => $key_scale,
90							val_scale => $val_scale
91							};
92							}
93
94							sub search {
95	0			0	0		my $self = shift;
96	0						my $query_vector = shift;
97	0		0				my $number = shift \|\| 10;
98
99	0						my $t0 = [gettimeofday];
100
101	0						my $vec = $self->_unit_length($query_vector);
102	0						my $key_scale = $self->{index_data}->{key_scale};
103	0						my $val_scale = $self->{index_data}->{val_scale};
104
105	0						my $dot_product;
106	0						while ( my ( $q_key, $q_val ) = each %$vec ) {
107	0						my $compressed_array = $self->{index_data}->{posting_lists}->{$q_key};
108	0	0					next if !$compressed_array;
109	0						LABEL:
110							my $max;
111
112	0						for (@$compressed_array) {
113
114							# decompress and decode
115	0						my $string = unpack( "w*", $_ );
116	0						my $count = length($string) - $key_scale;
117	0						my $val = substr( $string, 0, $count );
118	0						my $label_id = int substr( $string, $count, $key_scale );
119	0						$val = $val / ( 10**$val_scale );
120
121							# calculate similarities
122	0						$dot_product->{$label_id} += $q_val * $val;
123							}
124							}
125
126	0						my @list;
127	0						for (
128	0						sort { $dot_product->{$b} <=> $dot_product->{$a} }
129							keys %$dot_product
130							)
131							{
132	0						my $similarity = sprintf( "%8.6f", $dot_product->{$_} );
133	0						my $label = $self->{index_data}->{labels}->[$_];
134	0						push @list, { label => $label, similarity => $similarity };
135	0	0					last if int @list == $number;
136							}
137	0						my $elapsed = tv_interval($t0);
138
139							return {
140	0						elapsed => $elapsed,
141							retrieved_list => \@list,
142							return_num => int @list,
143							};
144							}
145
146							sub save {
147	0			0	0		my $self = shift;
148	0						my $save_file = shift;
149	0						my $index = $self->{index_data};
150	0						nstore( $index, $save_file );
151							}
152
153							sub load {
154	0			0	0		my $self = shift;
155	0						my $save_file = shift;
156	0						my $index = retrieve($save_file);
157	0						$self->{index_data} = $index;
158							}
159
160							sub _unit_length {
161	0			0			my $self = shift;
162	0						my $vec = shift;
163	0						my $ret;
164	0						my $norm = $self->_calc_norm($vec);
165	0						while ( my ( $key, $value ) = each %$vec ) {
166	0						$ret->{$key} = $value / $norm;
167							}
168	0						return $ret;
169							}
170
171							sub _calc_norm {
172	0			0			my $self = shift;
173	0						my $vec = shift;
174
175	0						my $norm;
176	0						for ( values %$vec ) {
177	0						$norm += $_**2;
178							}
179	0						sqrt($norm);
180							}
181
182							1;
183							__END__