| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Search::ContextGraph; |
|
2
|
|
|
|
|
|
|
|
|
3
|
13
|
|
|
13
|
|
325052
|
use strict; |
|
|
13
|
|
|
|
|
32
|
|
|
|
13
|
|
|
|
|
445
|
|
|
4
|
13
|
|
|
13
|
|
71
|
use warnings; |
|
|
13
|
|
|
|
|
40
|
|
|
|
13
|
|
|
|
|
383
|
|
|
5
|
13
|
|
|
13
|
|
69
|
use Carp; |
|
|
13
|
|
|
|
|
27
|
|
|
|
13
|
|
|
|
|
1223
|
|
|
6
|
13
|
|
|
13
|
|
94
|
use base "Storable"; |
|
|
13
|
|
|
|
|
22
|
|
|
|
13
|
|
|
|
|
16539
|
|
|
7
|
13
|
|
|
13
|
|
62118
|
use File::Find; |
|
|
13
|
|
|
|
|
29
|
|
|
|
13
|
|
|
|
|
1383
|
|
|
8
|
13
|
|
|
13
|
|
27029
|
use IO::Socket; |
|
|
13
|
|
|
|
|
523789
|
|
|
|
13
|
|
|
|
|
73
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.15'; |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my $count = 0; |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Search::ContextGraph - spreading activation search engine |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Search::ContextGraph; |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $cg = Search::ContextGraph->new(); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# first you add some documents, perhaps all at once... |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my %docs = ( |
|
29
|
|
|
|
|
|
|
'first' => [ 'elephant', 'snake' ], |
|
30
|
|
|
|
|
|
|
'second' => [ 'camel', 'pony' ], |
|
31
|
|
|
|
|
|
|
'third' => { 'snake' => 2, 'constrictor' => 1 }, |
|
32
|
|
|
|
|
|
|
); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
$cg->bulk_add( %docs ); |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# or in a loop... |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
foreach my $title ( keys %docs ) { |
|
39
|
|
|
|
|
|
|
$cg->add( $title, $docs{$title} ); |
|
40
|
|
|
|
|
|
|
} |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# or from a file... |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $cg = Search::ContextGraph->load_from_dir( "./myfiles" ); |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# you can store a graph object for later use |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
$cg->store( "stored.cng" ); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# and retrieve it later... |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my $cg = ContextGraph->retrieve( "stored.cng" ); |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# SEARCHING |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# the easiest way |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
my @ranked_docs = $cg->simple_search( 'peanuts' ); |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# get back both related terms and docs for more power |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
my ( $docs, $words ) = $cg->search('snake'); |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# you can use a document as your query |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my ( $docs, $words ) = $cg->find_similar('First Document'); |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Or you can query on a combination of things |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my ( $docs, $words ) = |
|
75
|
|
|
|
|
|
|
$cg->mixed_search( { docs => [ 'First Document' ], |
|
76
|
|
|
|
|
|
|
terms => [ 'snake', 'pony' ] |
|
77
|
|
|
|
|
|
|
); |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Print out result set of returned documents |
|
81
|
|
|
|
|
|
|
foreach my $k ( sort { $docs->{$b} <=> $docs->{$a} } |
|
82
|
|
|
|
|
|
|
keys %{ $docs } ) { |
|
83
|
|
|
|
|
|
|
print "Document $k had relevance ", $docs->{$k}, "\n"; |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Reload it |
|
89
|
|
|
|
|
|
|
my $new = Search::ContextGraph->retrieve( "filename" ); |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Spreading activation is a neat technique for building search engines that |
|
96
|
|
|
|
|
|
|
return accurate results for a query even when there is no exact keyword match. |
|
97
|
|
|
|
|
|
|
The engine works by building a data structure called a B, which |
|
98
|
|
|
|
|
|
|
is a giant network of document and term nodes. All document nodes are connected |
|
99
|
|
|
|
|
|
|
to the terms that occur in that document; similarly, every term node is connected |
|
100
|
|
|
|
|
|
|
to all of the document nodes that term occurs in. We search the graph by |
|
101
|
|
|
|
|
|
|
starting at a query node and distributing a set amount of energy to its neighbor |
|
102
|
|
|
|
|
|
|
nodes. Then we recurse, diminishing the energy at each stage, until this |
|
103
|
|
|
|
|
|
|
spreading energy falls below a given threshold. Each node keeps track of |
|
104
|
|
|
|
|
|
|
accumulated energy, and this serves as our measure of relevance. |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
This means that documents that have many words in common will appear similar to the |
|
107
|
|
|
|
|
|
|
search engine. Likewise, words that occur together in many documents will be |
|
108
|
|
|
|
|
|
|
perceived as semantically related. Especially with larger, coherent document |
|
109
|
|
|
|
|
|
|
collections, the search engine can be quite effective at recognizing synonyms |
|
110
|
|
|
|
|
|
|
and finding useful relationships between documents. You can read a full |
|
111
|
|
|
|
|
|
|
description of the algorithm at L. |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
The search engine gives expanded recall (relevant results even when there is no |
|
114
|
|
|
|
|
|
|
keyword match) without incurring the kind of computational and patent issues |
|
115
|
|
|
|
|
|
|
posed by latent semantic indexing (LSI). The technique used here was originally |
|
116
|
|
|
|
|
|
|
described in a 1981 dissertation by Scott Preece. |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=over |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item new %PARAMS |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Object constructor. Possible parameters: |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=over |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=item auto_reweight |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Rebalance the graph every time a change occurs. Default is true. |
|
131
|
|
|
|
|
|
|
Disable and do by hand using L for better performance in |
|
132
|
|
|
|
|
|
|
graphs with frequent updates/additions/deletions. |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item debug LEVEL |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Set this to 1 or 2 to turn on verbose debugging output |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=item max_depth |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Set the maximum distance to spread energy out from the start |
|
142
|
|
|
|
|
|
|
node. Default is effectively unlimited. You can tweak it using L. |
|
143
|
|
|
|
|
|
|
Comes in handy if you find searches are too slow. |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item xs |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
When true, tells the module to use compiled C internals. This reduces |
|
148
|
|
|
|
|
|
|
memory requirements by about 60%, but actually runs a little slower than the |
|
149
|
|
|
|
|
|
|
pure Perl version. Don't bother to turn it on unless you have a huge graph. |
|
150
|
|
|
|
|
|
|
Default is pure Perl. |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=over |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item * using the compiled version makes it impossible to store the graph to disk. |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item * xs is B in version 0.09. But it will return in triumph! |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=back |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item START_ENERGY |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Initial energy to assign to a query node. Default is 100. |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item ACTIVATE_THRESHOLD |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Minimal energy needed to propagate search along the graph. Default is 1. |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=item COLLECT_THRESHOLD |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Minimal energy needed for a node to enter the result set. Default is 1. |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=back |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=cut |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub new { |
|
178
|
15
|
|
|
15
|
1
|
212
|
my ( $class, %params) = @_; |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# backwards compatible... |
|
181
|
15
|
|
|
|
|
75
|
*add_document = \&add; |
|
182
|
15
|
|
|
|
|
44
|
*add_documents = \&bulk_add; |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# plucene friendly |
|
185
|
15
|
|
|
|
|
42
|
*optimize = \&reweight_graph; |
|
186
|
15
|
|
|
|
|
66
|
*is_indexed = \&has_doc; |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# fail on all unknown paramters (helps fight typos) |
|
189
|
15
|
|
|
|
|
82
|
my @allowed = qw/debug auto_reweight use_global_weights max_depth START_ENERGY ACTIVATE_THRESHOLD COLLECT_THRESHOLD use_file xs/; |
|
190
|
15
|
|
|
|
|
28
|
my %check; |
|
191
|
15
|
|
|
|
|
161
|
$check{$_}++ foreach @allowed; |
|
192
|
|
|
|
|
|
|
|
|
193
|
15
|
|
|
|
|
34
|
my @forbidden; |
|
194
|
15
|
|
|
|
|
55
|
foreach my $k ( keys %params ) { |
|
195
|
12
|
50
|
|
|
|
52
|
push @forbidden, $k unless exists $check{$k}; |
|
196
|
|
|
|
|
|
|
} |
|
197
|
15
|
50
|
|
|
|
57
|
if ( @forbidden ) { |
|
198
|
0
|
|
|
|
|
0
|
croak "The following unrecognized parameters were detected: ", |
|
199
|
|
|
|
|
|
|
join ", ", @forbidden; |
|
200
|
|
|
|
|
|
|
} |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
|
|
203
|
15
|
|
|
|
|
167
|
my $obj = bless |
|
204
|
|
|
|
|
|
|
{ debug => 0, |
|
205
|
|
|
|
|
|
|
auto_reweight => 1, |
|
206
|
|
|
|
|
|
|
use_global_weights => 1, |
|
207
|
|
|
|
|
|
|
max_depth => 100000000, |
|
208
|
|
|
|
|
|
|
START_ENERGY => 100, |
|
209
|
|
|
|
|
|
|
ACTIVATE_THRESHOLD => 1, |
|
210
|
|
|
|
|
|
|
COLLECT_THRESHOLD => .2, |
|
211
|
|
|
|
|
|
|
%params, |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
depth => 0, |
|
214
|
|
|
|
|
|
|
neighbors => {}, |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
}, |
|
217
|
|
|
|
|
|
|
$class; |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
|
220
|
15
|
100
|
|
|
|
180
|
if ( $obj->{use_file} ) { |
|
221
|
1
|
|
|
|
|
3
|
my %neighbors; |
|
222
|
13
|
|
|
13
|
|
24279
|
use MLDBM qw/DB_File Storable/; |
|
|
13
|
|
|
|
|
51331
|
|
|
|
13
|
|
|
|
|
100
|
|
|
223
|
13
|
|
|
13
|
|
518
|
use Fcntl; |
|
|
13
|
|
|
|
|
31
|
|
|
|
13
|
|
|
|
|
138808
|
|
|
224
|
1
|
|
|
|
|
165
|
warn "Using MLDBM: $obj->{use_file}"; |
|
225
|
1
|
50
|
|
|
|
12
|
$obj->{neighbors} = tie %neighbors, 'MLDBM', $obj->{use_file} or die $!; |
|
226
|
|
|
|
|
|
|
#$obj->{neighbors} = \%neighbors; |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
14
|
|
|
|
|
79
|
return $obj; |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
} |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=item load_from_dir DIR [, \&PARSE ] |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Load documents from a directory. Takes two arguments, a directory path |
|
239
|
|
|
|
|
|
|
and an optional parsing subroutine. If the parsing subroutine is passed |
|
240
|
|
|
|
|
|
|
an argument, it will use it to extract term tokens from the file. |
|
241
|
|
|
|
|
|
|
By default, the file is split on whitespace and stripped of numbers and |
|
242
|
|
|
|
|
|
|
punctuation. |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=cut |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
{ |
|
247
|
|
|
|
|
|
|
my $parse_sub; |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub load_from_dir { |
|
250
|
0
|
|
|
0
|
1
|
0
|
my ( $class, $dir, $code ) = @_; |
|
251
|
|
|
|
|
|
|
|
|
252
|
0
|
0
|
|
|
|
0
|
croak "$dir is not a directory" unless -d $dir; |
|
253
|
|
|
|
|
|
|
|
|
254
|
0
|
|
|
|
|
0
|
require File::Find; |
|
255
|
0
|
0
|
0
|
|
|
0
|
unless ( defined $code |
|
|
|
|
0
|
|
|
|
|
|
256
|
|
|
|
|
|
|
and ref $code |
|
257
|
|
|
|
|
|
|
and ref $code eq 'CODE' ) { |
|
258
|
|
|
|
|
|
|
$code = sub { |
|
259
|
0
|
|
|
0
|
|
0
|
my $text = shift; |
|
260
|
0
|
|
|
|
|
0
|
$text =~ s/[^\w]/ /gs; |
|
261
|
0
|
|
|
|
|
0
|
my @toks = split /\s+/m, $text; |
|
262
|
0
|
|
|
|
|
0
|
return grep { length($_) > 1 } @toks; |
|
|
0
|
|
|
|
|
0
|
|
|
263
|
0
|
|
|
|
|
0
|
}; |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
|
|
266
|
0
|
|
|
|
|
0
|
$parse_sub = $code; |
|
267
|
0
|
|
|
|
|
0
|
my %docs; |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Recursively open every file and provide the contents |
|
270
|
|
|
|
|
|
|
# to whatever parsing subroutine we're using |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
my $reader = |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
sub { |
|
275
|
0
|
|
|
0
|
|
0
|
my ( $parse ) = @_; |
|
276
|
0
|
0
|
|
|
|
0
|
return if /^\./; |
|
277
|
0
|
0
|
|
|
|
0
|
return unless -f $_; |
|
278
|
0
|
0
|
|
|
|
0
|
open my $fh, $_ or |
|
279
|
|
|
|
|
|
|
croak "Could not open file $File::Find::name: $!"; |
|
280
|
0
|
|
|
|
|
0
|
local $/; |
|
281
|
0
|
|
|
|
|
0
|
my $contents = <$fh>; |
|
282
|
0
|
0
|
|
|
|
0
|
close $fh or croak "failed to close filehandle"; |
|
283
|
0
|
|
|
|
|
0
|
my @words = $parse_sub->($contents); |
|
284
|
0
|
|
|
|
|
0
|
$docs{ $File::Find::name } = \@words; |
|
285
|
0
|
|
|
|
|
0
|
}; |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
0
|
find( $reader , $dir ); |
|
289
|
0
|
|
|
|
|
0
|
my $self = __PACKAGE__->new(); |
|
290
|
0
|
|
|
|
|
0
|
$self->bulk_add( %docs ); |
|
291
|
0
|
|
|
|
|
0
|
return $self; |
|
292
|
|
|
|
|
|
|
} |
|
293
|
|
|
|
|
|
|
} |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=item load_from_tdm FILENAME |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
Opens and loads a term-document matrix (TDM) file to initialize the graph. |
|
300
|
|
|
|
|
|
|
The TDM encodes information about term-to-document links. |
|
301
|
|
|
|
|
|
|
This is a legacy method mainly for the convenience of the module author. |
|
302
|
|
|
|
|
|
|
For notes on the proper file format, see the README file. |
|
303
|
|
|
|
|
|
|
=cut |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub load_from_tdm { |
|
306
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $file ) = @_; |
|
307
|
0
|
0
|
|
|
|
0
|
croak "TDM file $file does not exist" unless -f $file; |
|
308
|
0
|
0
|
|
|
|
0
|
return if $self->{'loaded'}; |
|
309
|
0
|
|
|
|
|
0
|
$self->_read_tdm( $file ); |
|
310
|
0
|
|
|
|
|
0
|
$self->{'loaded'} = 1; |
|
311
|
0
|
|
|
|
|
0
|
$self->reweight_graph(); |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=item rename OLD, NEW |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Renames a document. Will return undef if the new name is already in use. |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=cut |
|
320
|
|
|
|
|
|
|
sub rename { |
|
321
|
|
|
|
|
|
|
|
|
322
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $old, $new ) = @_; |
|
323
|
0
|
0
|
0
|
|
|
0
|
croak "rename method needs two arguments" unless |
|
324
|
|
|
|
|
|
|
defined $old and defined $new; |
|
325
|
0
|
0
|
|
|
|
0
|
croak "document $old not found" unless |
|
326
|
|
|
|
|
|
|
exists $self->{neighbors}{ _nodeify('D', $old ) }; |
|
327
|
|
|
|
|
|
|
|
|
328
|
0
|
|
|
|
|
0
|
my $bad = _nodeify( 'D', $old ); |
|
329
|
0
|
|
|
|
|
0
|
my $good = _nodeify( 'D', $new ); |
|
330
|
|
|
|
|
|
|
|
|
331
|
0
|
0
|
|
|
|
0
|
return if exists $self->{neighbors}{$good}; |
|
332
|
|
|
|
|
|
|
|
|
333
|
0
|
|
|
|
|
0
|
my $s = $self->{neighbors}; |
|
334
|
0
|
|
|
|
|
0
|
foreach my $n ( keys %{ $s->{$bad} } ) { |
|
|
0
|
|
|
|
|
0
|
|
|
335
|
0
|
|
|
|
|
0
|
$s->{$good}{$n} = |
|
336
|
|
|
|
|
|
|
$s->{$n}{$good} = |
|
337
|
|
|
|
|
|
|
$s->{$bad}{$n}; |
|
338
|
0
|
|
|
|
|
0
|
delete $s->{$bad}{$n}; |
|
339
|
0
|
|
|
|
|
0
|
delete $s->{$n}{$bad}; |
|
340
|
|
|
|
|
|
|
} |
|
341
|
0
|
|
|
|
|
0
|
delete $s->{$bad}; |
|
342
|
0
|
|
|
|
|
0
|
return 1; |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
} |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item retrieve FILENAME |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Loads a previously stored graph from disk, using Storable. |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=cut |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
sub retrieve { |
|
355
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $file ) = @_; |
|
356
|
0
|
0
|
|
|
|
0
|
croak "Must provide a filename to retrieve graph" |
|
357
|
|
|
|
|
|
|
unless $file; |
|
358
|
0
|
0
|
|
|
|
0
|
croak "'$file' is not a file" unless |
|
359
|
|
|
|
|
|
|
-f $file; |
|
360
|
|
|
|
|
|
|
|
|
361
|
0
|
|
|
|
|
0
|
Storable::retrieve( $file ); |
|
362
|
|
|
|
|
|
|
} |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=back |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=head1 ACCESSORS |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=over |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=item [get|set]_activate_threshold |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Accessor for node activation threshold value. This value determines how far |
|
374
|
|
|
|
|
|
|
energy can spread in the graph. Lower it to increase the number of results. |
|
375
|
|
|
|
|
|
|
Default is 1. |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
|
378
|
|
|
|
|
|
|
|
|
379
|
2
|
|
|
2
|
0
|
1147
|
sub get_activate_threshold { $_[0]->{'ACTIVATE_THRESHOLD'} } |
|
380
|
|
|
|
|
|
|
sub set_activate_threshold { |
|
381
|
3
|
|
|
3
|
0
|
818
|
my ( $self, $threshold ) = @_; |
|
382
|
3
|
100
|
|
|
|
123
|
croak "Can't set activate threshold to zero" |
|
383
|
|
|
|
|
|
|
unless $threshold; |
|
384
|
2
|
100
|
|
|
|
110
|
croak "Can't set activate threshold to negative value" |
|
385
|
|
|
|
|
|
|
unless $threshold > 0; |
|
386
|
1
|
|
|
|
|
6
|
$self->{'ACTIVATE_THRESHOLD'} = $_[1]; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=item [get|set]_auto_reweight |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
Accessor for auto reweight flag. If true, edge weights will be recalculated |
|
393
|
|
|
|
|
|
|
every time a document is added, updated or removed. This can significantly slow |
|
394
|
|
|
|
|
|
|
down large graphs. On by default. |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=cut |
|
397
|
|
|
|
|
|
|
|
|
398
|
0
|
|
|
0
|
0
|
0
|
sub get_auto_reweight{ $_[0]->{auto_reweight} } |
|
399
|
0
|
|
|
0
|
0
|
0
|
sub set_auto_reweight{ $_[0]->{auto_reweight} = $_[0]->[1]; } |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item [get|set]_collect_threshold |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Accessor for collection threshold value. This determines how much energy a |
|
405
|
|
|
|
|
|
|
node must have to make it into the result set. Lower it to increase the |
|
406
|
|
|
|
|
|
|
number of results. Default is 1. |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=cut |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
sub get_collect_threshold { |
|
411
|
2
|
50
|
|
2
|
0
|
21
|
return ( $_[0]->{'xs'} ? |
|
412
|
|
|
|
|
|
|
$_[0]->{Graph}->collectionThreshold : |
|
413
|
|
|
|
|
|
|
$_[0]->{'COLLECT_THRESHOLD'}) |
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub set_collect_threshold { |
|
417
|
2
|
|
|
2
|
0
|
5
|
my ( $self, $newval ) = @_; |
|
418
|
|
|
|
|
|
|
|
|
419
|
2
|
|
100
|
|
|
10
|
$newval ||=0; |
|
420
|
|
|
|
|
|
|
|
|
421
|
2
|
50
|
|
|
|
7
|
$self->{Graph}->collectionThreshold( $newval ) |
|
422
|
|
|
|
|
|
|
if $self->{'xs'}; |
|
423
|
|
|
|
|
|
|
|
|
424
|
2
|
|
100
|
|
|
10
|
$self->{'COLLECT_THRESHOLD'} = $newval || 0; |
|
425
|
2
|
|
|
|
|
6
|
return 1; |
|
426
|
|
|
|
|
|
|
} |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item [get|set]_debug_mode LEVEL |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
Turns debugging on or off. 1 is verbose, 2 is very verbose, 0 is off. |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=cut |
|
433
|
|
|
|
|
|
|
|
|
434
|
0
|
|
|
0
|
0
|
0
|
sub get_debug_mode { $_[0]->{debug} } |
|
435
|
|
|
|
|
|
|
sub set_debug_mode { |
|
436
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $mode ) = @_; |
|
437
|
0
|
|
|
|
|
0
|
$self->{'debug'} = $mode; |
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=item [get|set]_initial_energy |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Accessor for initial energy value at the query node. This controls how |
|
445
|
|
|
|
|
|
|
much energy gets poured into the graph at the start of the search. |
|
446
|
|
|
|
|
|
|
Increase this value to get more results from your queries. |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=cut |
|
449
|
|
|
|
|
|
|
|
|
450
|
2
|
|
|
2
|
0
|
15
|
sub get_initial_energy { $_[0]->{'START_ENERGY'} } |
|
451
|
|
|
|
|
|
|
sub set_initial_energy { |
|
452
|
2
|
|
|
2
|
0
|
6
|
my ( $self, $start_energy ) = @_; |
|
453
|
2
|
50
|
|
|
|
9
|
croak "Can't set initial energy to zero" |
|
454
|
|
|
|
|
|
|
unless $start_energy; |
|
455
|
2
|
100
|
|
|
|
115
|
croak "Can't set initial energy to negative value" |
|
456
|
|
|
|
|
|
|
unless $start_energy > 0; |
|
457
|
1
|
|
|
|
|
30
|
$self->{'START_ENERGY'} = $start_energy ; |
|
458
|
|
|
|
|
|
|
} |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=item [get|set]_max_depth LEVEL |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
You can tell the graph to cut off searches after a certain distance from |
|
463
|
|
|
|
|
|
|
the query node. This can speed up searches on very large graphs, and has |
|
464
|
|
|
|
|
|
|
little adverse effect, especially if you are interested in just the first |
|
465
|
|
|
|
|
|
|
few search results. Set this value to undef to restore the default (10^8). |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=cut |
|
468
|
|
|
|
|
|
|
|
|
469
|
3
|
|
|
3
|
0
|
450
|
sub get_max_depth { $_[0]->{max_depth} } |
|
470
|
3
|
100
|
|
3
|
0
|
215
|
sub set_max_depth { croak "Tried to set maximum depth to an undefined value" |
|
471
|
|
|
|
|
|
|
unless defined $_[1]; |
|
472
|
2
|
|
100
|
|
|
19
|
$_[0]->{max_depth} = $_[1] || 100000000 |
|
473
|
|
|
|
|
|
|
} |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=back |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=head1 METHODS |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=over |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=item add DOC, WORDS |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
Add a document to the search engine. Takes as arguments a unique doc |
|
487
|
|
|
|
|
|
|
identifier and a reference to an array or hash of words in the |
|
488
|
|
|
|
|
|
|
document. |
|
489
|
|
|
|
|
|
|
For example: |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
TITLE => { WORD1 => COUNT1, WORD2 => COUNT2 ... } |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
or |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
TITLE => [ WORD1, WORD2, WORD3 ] |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
Use L if you want to pass in a bunch of docs all at once. |
|
498
|
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=cut |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub add { |
|
503
|
|
|
|
|
|
|
|
|
504
|
420
|
|
|
420
|
1
|
28619
|
my ( $self, $title, $words ) = @_; |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
|
|
507
|
420
|
50
|
|
|
|
1228
|
croak "Please provide a word list" unless defined $words; |
|
508
|
420
|
50
|
66
|
|
|
3263
|
croak "Word list is not a reference to an array or hash" |
|
|
|
|
66
|
|
|
|
|
|
509
|
|
|
|
|
|
|
unless ref $words and ref $words eq "HASH" or ref $words eq "ARRAY"; |
|
510
|
|
|
|
|
|
|
|
|
511
|
420
|
50
|
|
|
|
934
|
croak "Please provide a document identifier" unless defined $title; |
|
512
|
|
|
|
|
|
|
|
|
513
|
420
|
|
|
|
|
853
|
my $dnode = _nodeify( 'D', $title ); |
|
514
|
420
|
50
|
|
|
|
1461
|
croak "Tried to add document with duplicate identifier: '$title'\n" |
|
515
|
|
|
|
|
|
|
if exists $self->{neighbors}{$dnode}; |
|
516
|
|
|
|
|
|
|
|
|
517
|
420
|
|
|
|
|
668
|
my @list; |
|
518
|
420
|
100
|
|
|
|
1104
|
if ( ref $words eq 'ARRAY' ) { |
|
519
|
419
|
|
|
|
|
489
|
@list = @{$words}; |
|
|
419
|
|
|
|
|
3733
|
|
|
520
|
|
|
|
|
|
|
} else { |
|
521
|
1
|
|
|
|
|
3
|
@list = keys %{$words}; |
|
|
1
|
|
|
|
|
46
|
|
|
522
|
|
|
|
|
|
|
} |
|
523
|
|
|
|
|
|
|
|
|
524
|
420
|
50
|
|
|
|
1048
|
croak "Tried to add a document with no content" unless scalar @list; |
|
525
|
|
|
|
|
|
|
|
|
526
|
420
|
|
|
|
|
493
|
my @edges; |
|
527
|
420
|
|
|
|
|
905
|
foreach my $term ( @list ) { |
|
528
|
13489
|
|
|
|
|
27818
|
my $tnode = _nodeify( 'T', lc( $term ) ); |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# Local weight for the document |
|
531
|
13489
|
100
|
|
|
|
27162
|
my $lcount = ( ref $words eq 'HASH' ? $words->{$term} : 1 ); |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# Update number of docs this word occurs in |
|
534
|
13489
|
|
|
|
|
35123
|
my $gcount = ++$self->{term_count}{lc( $term )}; |
|
535
|
|
|
|
|
|
|
|
|
536
|
13489
|
|
|
|
|
13877
|
my $final_weight = 1; |
|
537
|
13489
|
|
|
|
|
33581
|
push @edges, [ $dnode, $tnode, $final_weight, $lcount ]; |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
} |
|
540
|
420
|
|
|
|
|
787
|
$self->{reweight_flag} = 1; |
|
541
|
420
|
|
|
|
|
908
|
__normalize( \@edges ); |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=cut |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
DEVELOPMENT |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
if ( $self->{supersize} ) { |
|
549
|
|
|
|
|
|
|
my $n = $self->{neighbors}; |
|
550
|
|
|
|
|
|
|
foreach my $e ( @edges ) { |
|
551
|
|
|
|
|
|
|
#warn "adding edge $e->[0], $e->[1]\n"; |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
$n->{$e->[0]} = {} unless exists $n->{$e->[0]}; |
|
554
|
|
|
|
|
|
|
$n->{$e->[1]} = {} unless exists $n->{$e->[1]}; |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
my $tmp = $n->{$e->[0]}; |
|
557
|
|
|
|
|
|
|
$tmp->{$e->[1]} = join ',', $e->[2], $e->[3]; |
|
558
|
|
|
|
|
|
|
$tmp = $n->{$e->[1]}; |
|
559
|
|
|
|
|
|
|
$tmp->{$e->[0]} = join ',', $e->[2], $e->[3]; |
|
560
|
|
|
|
|
|
|
} |
|
561
|
|
|
|
|
|
|
=cut |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# PURE PERL VERSION |
|
565
|
|
|
|
|
|
|
#} else { |
|
566
|
420
|
|
|
|
|
698
|
foreach my $e ( @edges ) { |
|
567
|
13489
|
|
|
|
|
72348
|
$self->{neighbors}{$e->[0]}{$e->[1]} = join ',', $e->[2], $e->[3]; |
|
568
|
13489
|
|
|
|
|
75829
|
$self->{neighbors}{$e->[1]}{$e->[0]} = join ',', $e->[2], $e->[3]; |
|
569
|
|
|
|
|
|
|
} |
|
570
|
|
|
|
|
|
|
#} |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
#print "Reweighting graph\n"; |
|
574
|
420
|
100
|
|
|
|
2151
|
$self->reweight_graph() if $self->{auto_reweight}; |
|
575
|
420
|
|
|
|
|
15729
|
return 1; |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
} |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=item add_file PATH [, name => NAME, parse => CODE] |
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
Adds a document from a file. By default, uses the PATH provided as the document |
|
583
|
|
|
|
|
|
|
identifier, and parses the file by splitting on whitespace. If a fancier title, |
|
584
|
|
|
|
|
|
|
or more elegant parsing behavior is desired, pass in named arguments as indicated. |
|
585
|
|
|
|
|
|
|
NAME can be any string, CODE should be a reference to a subroutine that takes one |
|
586
|
|
|
|
|
|
|
argument (the contents of the file) and returns an array of tokens, or a hash in the |
|
587
|
|
|
|
|
|
|
form TOKEN => COUNT, or a reference to the same. |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=cut |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
sub add_file { |
|
592
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $path, %params ) = @_; |
|
593
|
|
|
|
|
|
|
|
|
594
|
0
|
0
|
0
|
|
|
0
|
croak "Invalid file '$path' provided to add_file method." |
|
595
|
|
|
|
|
|
|
unless defined $path and -f $path; |
|
596
|
|
|
|
|
|
|
|
|
597
|
0
|
0
|
|
|
|
0
|
my $title = ( exists $params{name} ? $params{name} : $path ); |
|
598
|
|
|
|
|
|
|
|
|
599
|
0
|
|
|
|
|
0
|
local $/; |
|
600
|
0
|
0
|
|
|
|
0
|
open my $fh, $path or croak "Unable to open $path: $!"; |
|
601
|
0
|
|
|
|
|
0
|
my $content = <$fh>; |
|
602
|
|
|
|
|
|
|
|
|
603
|
0
|
|
|
|
|
0
|
my $ref; |
|
604
|
|
|
|
|
|
|
|
|
605
|
0
|
0
|
|
|
|
0
|
if ( exists $params{parse} ) { |
|
606
|
0
|
0
|
|
|
|
0
|
croak "code provided is not a reference" unless |
|
607
|
|
|
|
|
|
|
ref $params{parse}; |
|
608
|
0
|
0
|
|
|
|
0
|
croak "code provided is not a subroutine" unless |
|
609
|
|
|
|
|
|
|
ref $params{parse} eq 'CODE'; |
|
610
|
|
|
|
|
|
|
|
|
611
|
0
|
|
|
|
|
0
|
$ref = $params{parse}->( $content ); |
|
612
|
0
|
0
|
0
|
|
|
0
|
croak "did not get an appropriate reference back after parsing" |
|
613
|
|
|
|
|
|
|
unless ref $ref and ref $ref =~ /(HASH|ARRAY)/; |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
} else { |
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
my $code = sub { |
|
619
|
0
|
|
|
0
|
|
0
|
my $txt = shift; |
|
620
|
0
|
|
|
|
|
0
|
$txt =~ s/\W/ /g; |
|
621
|
0
|
|
|
|
|
0
|
my @toks = split m/\s+/, $txt; |
|
622
|
0
|
|
|
|
|
0
|
\@toks; |
|
623
|
0
|
|
|
|
|
0
|
}; |
|
624
|
0
|
|
|
|
|
0
|
$ref = $code->($content); |
|
625
|
|
|
|
|
|
|
} |
|
626
|
|
|
|
|
|
|
|
|
627
|
0
|
0
|
|
|
|
0
|
return unless $ref; |
|
628
|
0
|
|
|
|
|
0
|
$self->add( $title, $ref ); |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
} |
|
631
|
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=item bulk_add DOCS |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
Add documents to the graph in bulk. Takes as an argument a hash |
|
635
|
|
|
|
|
|
|
whose keys are document identifiers, and values are references |
|
636
|
|
|
|
|
|
|
to hashes in the form { WORD1 => COUNT, WORD2 => COUNT...} |
|
637
|
|
|
|
|
|
|
This method is faster than adding in documents one by one if |
|
638
|
|
|
|
|
|
|
you have auto_rebalance turned on. |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=cut |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub bulk_add { |
|
643
|
|
|
|
|
|
|
|
|
644
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %incoming_docs ) = @_; |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
# Disable graph rebalancing until we've added everything |
|
647
|
|
|
|
|
|
|
{ |
|
648
|
0
|
|
|
|
|
0
|
local $self->{auto_reweight} = 0; |
|
|
0
|
|
|
|
|
0
|
|
|
649
|
|
|
|
|
|
|
|
|
650
|
0
|
|
|
|
|
0
|
foreach my $doc ( keys %incoming_docs ) { |
|
651
|
0
|
|
|
|
|
0
|
$self->add( $doc, $incoming_docs{$doc}); |
|
652
|
|
|
|
|
|
|
} |
|
653
|
|
|
|
|
|
|
} |
|
654
|
0
|
0
|
|
|
|
0
|
$self->reweight_graph() if $self->{auto_reweight}; |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=item degree NODE |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
Given a raw node, returns the degree (raw node means the node must |
|
661
|
|
|
|
|
|
|
be prefixed with 'D:' or 'T:' depending on type ) |
|
662
|
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=cut |
|
664
|
|
|
|
|
|
|
|
|
665
|
5
|
|
|
5
|
1
|
3434
|
sub degree { scalar keys %{$_[0]->{neighbors}{$_[1]}} } |
|
|
5
|
|
|
|
|
46
|
|
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=item delete DOC |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
Remove a document from the graph. Takes a document identifier |
|
671
|
|
|
|
|
|
|
as an argument. Returns 1 if successful, undef otherwise. |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
=cut |
|
674
|
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
sub delete { |
|
676
|
|
|
|
|
|
|
|
|
677
|
37
|
|
|
37
|
1
|
282
|
my ( $self, $type, $name ) = @_; |
|
678
|
|
|
|
|
|
|
|
|
679
|
37
|
50
|
|
|
|
153
|
croak "Must provide a node type to delete() method" unless defined $type; |
|
680
|
37
|
50
|
|
|
|
215
|
croak "Invalid type $type passed to delete method. Must be one of [TD]" |
|
681
|
|
|
|
|
|
|
unless $type =~ /^[TD]$/io; |
|
682
|
37
|
50
|
|
|
|
84
|
croak "Please provide a node name" unless defined $name; |
|
683
|
|
|
|
|
|
|
|
|
684
|
37
|
50
|
|
|
|
97
|
return unless defined $name; |
|
685
|
37
|
|
|
|
|
87
|
my $node = _nodeify( $type, $name); |
|
686
|
|
|
|
|
|
|
|
|
687
|
37
|
|
|
|
|
87
|
my $n = $self->{neighbors}; |
|
688
|
37
|
50
|
|
|
|
148
|
croak "Found a neighborless node $node" |
|
689
|
|
|
|
|
|
|
unless exists $n->{$node}; |
|
690
|
|
|
|
|
|
|
|
|
691
|
37
|
|
|
|
|
60
|
my @terms = keys %{ $n->{$node} }; |
|
|
37
|
|
|
|
|
462
|
|
|
692
|
|
|
|
|
|
|
|
|
693
|
37
|
50
|
|
|
|
179
|
warn "found ", scalar @terms, " neighbors attached to $node\n" |
|
694
|
|
|
|
|
|
|
if $self->{debug}; |
|
695
|
|
|
|
|
|
|
# Check to see if we have orphaned any terms |
|
696
|
37
|
|
|
|
|
61
|
foreach my $t ( @terms ) { |
|
697
|
|
|
|
|
|
|
|
|
698
|
1205
|
|
|
|
|
2144
|
delete $n->{$node}{$t}; |
|
699
|
1205
|
|
|
|
|
2124
|
delete $n->{$t}{$node}; |
|
700
|
|
|
|
|
|
|
|
|
701
|
1205
|
100
|
|
|
|
1260
|
if ( scalar keys %{ $n->{$t} } == 0 ) { |
|
|
1205
|
|
|
|
|
2994
|
|
|
702
|
976
|
50
|
|
|
|
1922
|
warn "\tdeleting orphaned node $t" if $self->{debug}; |
|
703
|
976
|
|
|
|
|
3124
|
my ( $subtype, $name ) = $t =~ /^(.):(.*)$/; |
|
704
|
|
|
|
|
|
|
#$self->delete( $subtype, $name ); |
|
705
|
976
|
|
|
|
|
2199
|
delete $n->{$t}; |
|
706
|
|
|
|
|
|
|
} |
|
707
|
|
|
|
|
|
|
} |
|
708
|
|
|
|
|
|
|
|
|
709
|
37
|
|
|
|
|
113
|
delete $n->{$node}; |
|
710
|
37
|
|
|
|
|
156
|
$self->check_consistency(); |
|
711
|
37
|
|
|
|
|
921
|
$self->{reweight_flag} = 1; |
|
712
|
37
|
50
|
|
|
|
308
|
$self->reweight_graph if $self->{auto_reweight}; |
|
713
|
37
|
|
|
|
|
570
|
1; |
|
714
|
|
|
|
|
|
|
} |
|
715
|
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
=item has_doc DOC |
|
719
|
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
Returns true if the document with identifier DOC is in the collection |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
=cut |
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
sub has_doc { |
|
725
|
4
|
|
|
4
|
1
|
15
|
my ( $self, $doc ) = @_; |
|
726
|
4
|
50
|
|
|
|
10
|
carp "Received undefined value for has_doc" unless defined $doc; |
|
727
|
4
|
|
|
|
|
9
|
my $node = _nodeify( 'D', $doc ); |
|
728
|
4
|
|
100
|
|
|
34
|
return exists $self->{neighbors}{$node} || undef; |
|
729
|
|
|
|
|
|
|
} |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item has_term TERM |
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
Returns true if the term TERM is in the collection |
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=cut |
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
sub has_term { |
|
738
|
4
|
|
|
4
|
1
|
6
|
my ( $self, $term ) = @_; |
|
739
|
4
|
50
|
|
|
|
11
|
carp "Received undefined value for has_term" unless defined $term; |
|
740
|
4
|
|
|
|
|
8
|
my $node = _nodeify( 'T', $term ); |
|
741
|
4
|
|
100
|
|
|
31
|
return exists $self->{neighbors}{$node} || undef; |
|
742
|
|
|
|
|
|
|
} |
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
=item distance NODE1, NODE2, TYPE |
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
Calculates the distance between two nodes of the same type (D or T) |
|
749
|
|
|
|
|
|
|
using the formula: |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
distance = ... |
|
752
|
|
|
|
|
|
|
=cut |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
sub distance { |
|
755
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $n1, $n2, $type ) = @_; |
|
756
|
0
|
0
|
|
|
|
0
|
croak unless $type; |
|
757
|
0
|
|
|
|
|
0
|
$type = lc( $type ); |
|
758
|
0
|
0
|
|
|
|
0
|
croak unless $type =~ /^[dt]$/; |
|
759
|
0
|
0
|
|
|
|
0
|
my $key = ( $type eq 't' ? 'terms' : 'documents' ); |
|
760
|
0
|
|
|
|
|
0
|
my @shared = $self->intersection( $key => [ $n1, $n2 ] ); |
|
761
|
0
|
0
|
|
|
|
0
|
return 0 unless @shared; |
|
762
|
|
|
|
|
|
|
#warn "Found ", scalar @shared, " nodes shared between $n1 and $n2\n"; |
|
763
|
|
|
|
|
|
|
|
|
764
|
0
|
|
|
|
|
0
|
my $node1 = _nodeify( $type, $n1 ); |
|
765
|
0
|
|
|
|
|
0
|
my $node2 = _nodeify( $type, $n2 ); |
|
766
|
|
|
|
|
|
|
# formula is w(t1,d1)/deg(d1) + w(t1,d2)/deg(d2) ... ) /deg( t1 ) |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
#warn "Calculating distance\n"; |
|
769
|
0
|
|
|
|
|
0
|
my $sum1 = 0; |
|
770
|
0
|
|
|
|
|
0
|
my $sum2 = 0; |
|
771
|
0
|
|
|
|
|
0
|
foreach my $next ( @shared ) { |
|
772
|
0
|
|
|
|
|
0
|
my ( undef, $lcount1) = split m/,/, $self->{neighbors}{$node1}{$next}; |
|
773
|
0
|
|
|
|
|
0
|
my ( undef, $lcount2) = split m/,/, $self->{neighbors}{$node2}{$next}; |
|
774
|
|
|
|
|
|
|
|
|
775
|
0
|
|
|
|
|
0
|
my $degree = $self->degree( $next ); |
|
776
|
|
|
|
|
|
|
#warn "\t degree of $next is $degree\n"; |
|
777
|
0
|
|
|
|
|
0
|
my $elem1 = $lcount1 / $degree; |
|
778
|
0
|
|
|
|
|
0
|
$sum1 += $elem1; |
|
779
|
0
|
|
|
|
|
0
|
my $elem2 = $lcount2 / $degree; |
|
780
|
0
|
|
|
|
|
0
|
$sum2 += $elem2; |
|
781
|
|
|
|
|
|
|
} |
|
782
|
|
|
|
|
|
|
#warn "sum is $sum1, $sum2\n"; |
|
783
|
0
|
|
|
|
|
0
|
my $final = ($sum1 / $self->degree( $node1 )) + ( $sum2 / $self->degree( $node2 )); |
|
784
|
|
|
|
|
|
|
#warn "final is $final\n"; |
|
785
|
0
|
|
|
|
|
0
|
return $final; |
|
786
|
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=item distance_matrix TYPE LIMIT |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Used for clustering using linear local embedding. Produces a similarity matrix |
|
793
|
|
|
|
|
|
|
in a format I'm too tired to document right now. LIMIT is the maximum number |
|
794
|
|
|
|
|
|
|
of neighbors to keep for each node. |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
=cut |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
sub distance_matrix { |
|
799
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $type, $limit ) = @_; |
|
800
|
0
|
0
|
|
|
|
0
|
croak "Must provide type argument to distance_matrix()" |
|
801
|
|
|
|
|
|
|
unless defined $type; |
|
802
|
0
|
0
|
|
|
|
0
|
croak "must provide limit" unless $limit; |
|
803
|
0
|
|
|
|
|
0
|
my @nodes; |
|
804
|
0
|
0
|
|
|
|
0
|
if ( lc( $type ) eq 'd' ) { |
|
|
|
0
|
|
|
|
|
|
|
805
|
0
|
|
|
|
|
0
|
@nodes = $self->doc_list(); |
|
806
|
|
|
|
|
|
|
} elsif ( lc( $type ) eq 't' ) { |
|
807
|
0
|
|
|
|
|
0
|
@nodes = $self->term_list(); |
|
808
|
|
|
|
|
|
|
} else { |
|
809
|
0
|
|
|
|
|
0
|
croak "Unsupported type $type"; |
|
810
|
|
|
|
|
|
|
} |
|
811
|
|
|
|
|
|
|
|
|
812
|
0
|
|
|
|
|
0
|
my @ret; |
|
813
|
0
|
|
|
|
|
0
|
my $count = 0; |
|
814
|
0
|
|
|
|
|
0
|
foreach my $from ( @nodes ) { |
|
815
|
0
|
|
|
|
|
0
|
warn $from, " - $count\n"; |
|
816
|
0
|
|
|
|
|
0
|
$count++; |
|
817
|
0
|
|
|
|
|
0
|
my $index = -1; |
|
818
|
0
|
|
|
|
|
0
|
my @found; |
|
819
|
0
|
|
|
|
|
0
|
foreach my $to ( @nodes ) { |
|
820
|
0
|
|
|
|
|
0
|
$index++; |
|
821
|
0
|
0
|
|
|
|
0
|
next if $from eq $to; |
|
822
|
0
|
|
|
|
|
0
|
my $dist = $self->distance( $from, $to, $type ); |
|
823
|
0
|
0
|
|
|
|
0
|
push @found, [ $index, $dist ] if $dist; |
|
824
|
|
|
|
|
|
|
#print( $index++, ' ', $dist, " " ) if $dist; |
|
825
|
|
|
|
|
|
|
} |
|
826
|
0
|
|
|
|
|
0
|
my @sorted = sort { $b->[1] <=> $a->[1] } @found; |
|
|
0
|
|
|
|
|
0
|
|
|
827
|
0
|
|
|
|
|
0
|
my @final = splice ( @sorted, 0, $limit ); |
|
828
|
0
|
|
|
|
|
0
|
push @ret, join " ", ( map { join ' ', $_->[0], substr($_->[1], 0, 7) } |
|
|
0
|
|
|
|
|
0
|
|
|
829
|
0
|
|
|
|
|
0
|
sort { $a->[0] <=> $b->[0] } |
|
830
|
|
|
|
|
|
|
@final), "\n"; |
|
831
|
|
|
|
|
|
|
#print "\n"; |
|
832
|
|
|
|
|
|
|
} |
|
833
|
0
|
|
|
|
|
0
|
return join "\n", @ret; |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
} |
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=item intersection @NODES |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
Returns a list of neighbor nodes that all the given nodes share in common |
|
840
|
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=cut |
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
sub intersection { |
|
844
|
2
|
|
|
2
|
1
|
1324
|
my ( $self, %nodes ) = @_; |
|
845
|
2
|
|
|
|
|
3
|
my @nodes; |
|
846
|
2
|
100
|
|
|
|
17
|
if ( exists $nodes{documents} ) { |
|
847
|
1
|
|
|
|
|
3
|
push @nodes, map { _nodeify( 'D', $_ ) } @{ $nodes{documents}}; |
|
|
2
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
4
|
|
|
848
|
|
|
|
|
|
|
} |
|
849
|
2
|
100
|
|
|
|
33
|
if ( exists $nodes{terms} ) { |
|
850
|
1
|
|
|
|
|
1
|
push @nodes, map { _nodeify( 'T', $_ ) } @{ $nodes{terms}}; |
|
|
2
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
3
|
|
|
851
|
|
|
|
|
|
|
} |
|
852
|
|
|
|
|
|
|
|
|
853
|
2
|
|
|
|
|
4
|
my %seen; |
|
854
|
2
|
|
|
|
|
4
|
foreach my $n ( @nodes ) { |
|
855
|
4
|
|
|
|
|
9
|
my @neighbors = $self->_neighbors( $n ); |
|
856
|
4
|
|
|
|
|
101
|
$seen{ $_ }++ foreach @neighbors; |
|
857
|
|
|
|
|
|
|
} |
|
858
|
4
|
|
|
|
|
12
|
return map { s/^[DT]://; $_ } |
|
|
4
|
|
|
|
|
23
|
|
|
|
131
|
|
|
|
|
169
|
|
|
859
|
2
|
|
|
|
|
14
|
grep { $seen{$_} == scalar @nodes } |
|
860
|
|
|
|
|
|
|
keys %seen; |
|
861
|
|
|
|
|
|
|
} |
|
862
|
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
=item raw_search @NODES |
|
864
|
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
Given a list of nodes, returns a hash of nearest nodes with relevance values, |
|
866
|
|
|
|
|
|
|
in the format NODE => RELEVANCE, for all nodes above the threshold value. |
|
867
|
|
|
|
|
|
|
(You probably want one of L, L, or L instead). |
|
868
|
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
=cut |
|
870
|
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
sub raw_search { |
|
872
|
12
|
|
|
12
|
1
|
27
|
my ( $self, @query ) = @_; |
|
873
|
|
|
|
|
|
|
|
|
874
|
12
|
|
|
|
|
46
|
$self->_clear(); |
|
875
|
12
|
|
|
|
|
37
|
foreach ( @query ) { |
|
876
|
12
|
|
|
|
|
51
|
$self->_energize( $_, $self->{'START_ENERGY'}); |
|
877
|
|
|
|
|
|
|
} |
|
878
|
12
|
|
|
|
|
36
|
my $results_ref = $self->_collect(); |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
|
|
881
|
12
|
|
|
|
|
30
|
return $results_ref; |
|
882
|
|
|
|
|
|
|
} |
|
883
|
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
=item reweight_graph |
|
888
|
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
Iterates through the graph, calculating edge weights and normalizing |
|
890
|
|
|
|
|
|
|
around nodes. This method is automatically called every time a |
|
891
|
|
|
|
|
|
|
document is added, removed, or updated, unless you turn the option |
|
892
|
|
|
|
|
|
|
off with auto_reweight(0). When adding a lot of docs, this can be |
|
893
|
|
|
|
|
|
|
time consuming, so either set auto_reweight to off or use the |
|
894
|
|
|
|
|
|
|
L method to add lots of docs at once |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=cut |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
sub reweight_graph { |
|
899
|
294
|
|
|
294
|
1
|
1615
|
my ( $self ) = @_; |
|
900
|
|
|
|
|
|
|
|
|
901
|
294
|
|
|
|
|
543
|
my $n = $self->{neighbors}; #shortcut |
|
902
|
294
|
|
|
|
|
766
|
my $doc_count = $self->doc_count(); |
|
903
|
|
|
|
|
|
|
#print "Renormalizing for doc count $doc_count\n" if $self->{debug}; |
|
904
|
294
|
|
|
|
|
7166
|
foreach my $node ( keys %{$n} ) { |
|
|
294
|
|
|
|
|
16117
|
|
|
905
|
|
|
|
|
|
|
|
|
906
|
150286
|
100
|
|
|
|
325623
|
next unless $node =~ /^D:/o; |
|
907
|
5210
|
50
|
|
|
|
24072
|
warn "reweighting at node $node\n" if $self->{debug} > 1; |
|
908
|
5210
|
|
|
|
|
6133
|
my @terms = keys %{ $n->{$node} }; |
|
|
5210
|
|
|
|
|
73395
|
|
|
909
|
5210
|
|
|
|
|
19343
|
my @edges; |
|
910
|
5210
|
|
|
|
|
7402
|
foreach my $t ( @terms ) { |
|
911
|
|
|
|
|
|
|
|
|
912
|
173163
|
|
|
|
|
357962
|
my $pair = $n->{$node}{$t}; |
|
913
|
173163
|
|
|
|
|
411221
|
my ( undef, $lcount ) = split /,/, $pair; |
|
914
|
173163
|
|
|
|
|
639016
|
( my $term = $t ) =~ s/^T://; |
|
915
|
173163
|
50
|
|
|
|
375967
|
croak "did not receive a local count" unless $lcount; |
|
916
|
173163
|
|
|
|
|
174757
|
my $weight; |
|
917
|
173163
|
50
|
|
|
|
349496
|
if ( $self->{use_global_weights} ) { |
|
918
|
|
|
|
|
|
|
|
|
919
|
173163
|
|
|
|
|
349136
|
my $gweight = log( $doc_count / $self->doc_count( $term ) ) + 1; |
|
920
|
173163
|
|
|
|
|
384076
|
my $lweight = log( $lcount ) + 1; |
|
921
|
173163
|
|
|
|
|
265720
|
$weight = ( $gweight * $lweight ); |
|
922
|
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
} else { |
|
924
|
|
|
|
|
|
|
|
|
925
|
0
|
|
|
|
|
0
|
$weight = log( $lcount ) + 1; |
|
926
|
|
|
|
|
|
|
} |
|
927
|
173163
|
|
|
|
|
574606
|
push @edges, [ $node, $t, $weight, $lcount ]; |
|
928
|
|
|
|
|
|
|
} |
|
929
|
|
|
|
|
|
|
|
|
930
|
5210
|
|
|
|
|
13666
|
__normalize( \@edges ); |
|
931
|
|
|
|
|
|
|
|
|
932
|
5210
|
|
|
|
|
9523
|
foreach my $e ( @edges ) { |
|
933
|
173163
|
|
|
|
|
571156
|
my $pair = join ',', $e->[2], $e->[3]; |
|
934
|
173163
|
|
|
|
|
566737
|
$n->{$node}{$e->[1]} = $n->{$e->[1]}{$node} = $pair; |
|
935
|
|
|
|
|
|
|
} |
|
936
|
|
|
|
|
|
|
} |
|
937
|
294
|
|
|
|
|
17793
|
$self->{reweight_flag} = 0; |
|
938
|
294
|
|
|
|
|
648
|
return 1; |
|
939
|
|
|
|
|
|
|
} |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=item update ID, WORDS |
|
945
|
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
Given a document identifier and a word list, updates the information for |
|
947
|
|
|
|
|
|
|
that document in the graph. Returns the number of changes made |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
=cut |
|
950
|
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
sub update { |
|
952
|
|
|
|
|
|
|
|
|
953
|
2
|
|
|
2
|
1
|
68
|
my ( $self, $id, $words ) = @_; |
|
954
|
|
|
|
|
|
|
|
|
955
|
2
|
50
|
|
|
|
10
|
croak "update not implemented in XS" if $self->{xs}; |
|
956
|
2
|
50
|
|
|
|
7
|
croak "Must provide a document identifier to update_document" unless defined $id; |
|
957
|
2
|
|
|
|
|
8
|
my $dnode = _nodeify( 'D', $id ); |
|
958
|
|
|
|
|
|
|
|
|
959
|
2
|
100
|
|
|
|
13
|
return unless exists $self->{neighbors}{$dnode}; |
|
960
|
1
|
50
|
33
|
|
|
19
|
croak "must provide a word list " |
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
961
|
|
|
|
|
|
|
unless defined $words and |
|
962
|
|
|
|
|
|
|
ref $words and |
|
963
|
|
|
|
|
|
|
( ref $words eq 'HASH' or |
|
964
|
|
|
|
|
|
|
ref $words eq 'ARRAY' ); |
|
965
|
|
|
|
|
|
|
|
|
966
|
1
|
|
|
|
|
32
|
my $n = $self->{neighbors}{$dnode}; |
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
# Get the current word list |
|
969
|
1
|
|
|
|
|
2
|
my @terms = keys %{ $n }; |
|
|
1
|
|
|
|
|
6
|
|
|
970
|
|
|
|
|
|
|
|
|
971
|
1
|
50
|
|
|
|
6
|
if ( ref $words eq 'ARRAY' ) { |
|
972
|
1
|
|
|
|
|
2
|
my %words; |
|
973
|
1
|
|
|
|
|
8
|
$words{$_}++ foreach @$words; |
|
974
|
1
|
|
|
|
|
4
|
$words = \%words; |
|
975
|
|
|
|
|
|
|
} |
|
976
|
|
|
|
|
|
|
|
|
977
|
1
|
|
|
|
|
4
|
local $self->{auto_reweight} = 0; |
|
978
|
|
|
|
|
|
|
|
|
979
|
1
|
|
|
|
|
2
|
my $must_reweight = 0; |
|
980
|
1
|
|
|
|
|
2
|
my %seen; |
|
981
|
|
|
|
|
|
|
|
|
982
|
1
|
|
|
|
|
2
|
foreach my $term ( keys %{$words} ) { |
|
|
1
|
|
|
|
|
4
|
|
|
983
|
|
|
|
|
|
|
|
|
984
|
4
|
|
|
|
|
10
|
my $t = _nodeify( 'T', $term ); |
|
985
|
|
|
|
|
|
|
|
|
986
|
4
|
100
|
|
|
|
11
|
if ( exists $n->{$t} ){ |
|
987
|
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
# Update the local count, if necessary |
|
989
|
3
|
|
|
|
|
7
|
my $curr_val = $n->{$t}; |
|
990
|
3
|
|
|
|
|
9
|
my ( undef, $loc ) = split m/,/, $curr_val; |
|
991
|
|
|
|
|
|
|
|
|
992
|
3
|
50
|
|
|
|
12
|
unless ( $loc == $words->{$term} ) { |
|
993
|
0
|
|
|
|
|
0
|
$n->{$t} = join ',', 1, $words->{$term}; |
|
994
|
0
|
|
|
|
|
0
|
$must_reweight++; |
|
995
|
|
|
|
|
|
|
} |
|
996
|
|
|
|
|
|
|
} |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
else { |
|
999
|
|
|
|
|
|
|
|
|
1000
|
1
|
|
|
|
|
8
|
$n->{$t} = |
|
1001
|
|
|
|
|
|
|
$self->{neighbors}{$t}{$dnode} = |
|
1002
|
|
|
|
|
|
|
join ',', 1, $words->{$term}; |
|
1003
|
1
|
|
|
|
|
2
|
$must_reweight++; |
|
1004
|
|
|
|
|
|
|
} |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
4
|
|
|
|
|
12
|
$seen{$t}++; |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
# Check for deleted words |
|
1010
|
1
|
|
|
|
|
5
|
foreach my $t ( @terms ) { |
|
1011
|
3
|
50
|
|
|
|
9
|
$must_reweight++ |
|
1012
|
|
|
|
|
|
|
unless exists $seen{$t}; |
|
1013
|
|
|
|
|
|
|
} |
|
1014
|
|
|
|
|
|
|
|
|
1015
|
1
|
50
|
|
|
|
16
|
$self->reweight_graph() if |
|
1016
|
|
|
|
|
|
|
$must_reweight; |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
1
|
|
|
|
|
17
|
return $must_reweight; |
|
1019
|
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
} |
|
1021
|
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
=item doc_count [TERM] |
|
1024
|
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
Returns a count of all documents that TERM occurs in. |
|
1026
|
|
|
|
|
|
|
If no argument is provided, returns a document count |
|
1027
|
|
|
|
|
|
|
for the entire collection. |
|
1028
|
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
=cut |
|
1030
|
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
sub doc_count { |
|
1032
|
173490
|
|
|
173490
|
1
|
254700
|
my ( $self, $term ) = @_; |
|
1033
|
173490
|
100
|
|
|
|
301573
|
if ( defined $term ) { |
|
1034
|
173186
|
50
|
|
|
|
498440
|
$term = _nodeify( 'T', $term ) unless $term =~ /^T:/; |
|
1035
|
173186
|
|
|
|
|
395011
|
my $node = $self->{neighbors}{$term}; |
|
1036
|
173186
|
100
|
|
|
|
326908
|
return 0 unless defined $node; |
|
1037
|
173183
|
|
|
|
|
199690
|
return scalar keys %{$node}; |
|
|
173183
|
|
|
|
|
568622
|
|
|
1038
|
|
|
|
|
|
|
} else { |
|
1039
|
304
|
|
|
|
|
61002
|
return scalar grep /D:/, |
|
1040
|
304
|
|
|
|
|
414
|
keys %{ $self->{'neighbors'} }; |
|
1041
|
|
|
|
|
|
|
} |
|
1042
|
|
|
|
|
|
|
} |
|
1043
|
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
=item doc_list [TERM] |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
Returns a sorted list of document identifiers that contain |
|
1048
|
|
|
|
|
|
|
TERM, in ASCII-betical order. If no argument is given, |
|
1049
|
|
|
|
|
|
|
returns a sorted document list for the whole collection. |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
=cut |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
sub doc_list { |
|
1054
|
4
|
|
|
4
|
1
|
74
|
my ( $self, $term ) = @_; |
|
1055
|
4
|
|
|
|
|
5
|
my $t; |
|
1056
|
4
|
50
|
33
|
|
|
16
|
if ( defined $term and $term !~ /T:/) { |
|
1057
|
0
|
|
|
|
|
0
|
$t = _nodeify( 'T', $term ); |
|
1058
|
|
|
|
|
|
|
} |
|
1059
|
4
|
50
|
|
|
|
14
|
my $hash = ( defined $term ? |
|
1060
|
|
|
|
|
|
|
$self->{neighbors}{$t} : |
|
1061
|
|
|
|
|
|
|
$self->{neighbors} ); |
|
1062
|
|
|
|
|
|
|
|
|
1063
|
136
|
|
|
|
|
216
|
sort map { s/^D://o; $_ } |
|
|
136
|
|
|
|
|
235
|
|
|
|
4
|
|
|
|
|
862
|
|
|
1064
|
4
|
|
|
|
|
6
|
grep /^D:/, keys %{ $hash }; |
|
1065
|
|
|
|
|
|
|
} |
|
1066
|
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
sub dump { |
|
1069
|
0
|
|
|
0
|
0
|
0
|
my ( $self ) = @_; |
|
1070
|
0
|
|
|
|
|
0
|
my @docs = $self->doc_list(); |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
0
|
|
|
|
|
0
|
foreach my $d ( @docs ) { |
|
1073
|
0
|
|
|
|
|
0
|
print $self->dump_node( $d ); |
|
1074
|
|
|
|
|
|
|
} |
|
1075
|
|
|
|
|
|
|
} |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
=item dump_node NODE |
|
1078
|
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
Lists all of the neighbors of a node, together with edge |
|
1080
|
|
|
|
|
|
|
weights connecting to them |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
=cut |
|
1083
|
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
sub dump_node { |
|
1085
|
2
|
|
|
2
|
1
|
10
|
my ( $self, $node ) = @_; |
|
1086
|
|
|
|
|
|
|
|
|
1087
|
2
|
|
|
|
|
4
|
my @lines; |
|
1088
|
2
|
|
|
|
|
8
|
push @lines, join "\t", "COUNT", "WEIGHT", "NEIGHBOR"; |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
2
|
|
|
|
|
5
|
foreach my $n ( keys %{ $self->{neighbors}{$node} } ) { |
|
|
2
|
|
|
|
|
33
|
|
|
1091
|
116
|
|
|
|
|
180
|
my $v = $self->{neighbors}{$node}{$n}; |
|
1092
|
116
|
|
|
|
|
188
|
my ( $weight, $count ) = split /,/, $v; |
|
1093
|
116
|
|
|
|
|
237
|
push @lines, join "\t", $count, substr( $weight, 0, 8 ), $n; |
|
1094
|
|
|
|
|
|
|
} |
|
1095
|
2
|
|
|
|
|
31
|
return @lines; |
|
1096
|
|
|
|
|
|
|
} |
|
1097
|
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=item dump_tdm [FILE] |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
Dumps internal state in term-document matrix (TDM) format, which looks |
|
1103
|
|
|
|
|
|
|
like this: |
|
1104
|
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
A B C B C B C |
|
1106
|
|
|
|
|
|
|
A B C B C B C |
|
1107
|
|
|
|
|
|
|
A B C B C B C |
|
1108
|
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
Where each row represents a document, A is the number of terms in the |
|
1110
|
|
|
|
|
|
|
document, B is the term node and C is the edge weight between the doc |
|
1111
|
|
|
|
|
|
|
node and B. Mostly used as a legacy format by the module author. |
|
1112
|
|
|
|
|
|
|
Doc and term nodes are printed in ASCII-betical sorted order, zero-based |
|
1113
|
|
|
|
|
|
|
indexing. Up to you to keep track of the ID => title mappings, neener-neener! |
|
1114
|
|
|
|
|
|
|
Use doc_list and term_list to get an equivalently sorted list |
|
1115
|
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
=cut |
|
1117
|
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
sub dump_tdm { |
|
1119
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $file ) = @_; |
|
1120
|
|
|
|
|
|
|
|
|
1121
|
0
|
|
|
|
|
0
|
my $counter = 0; |
|
1122
|
0
|
|
|
|
|
0
|
my %lookup; |
|
1123
|
0
|
|
|
|
|
0
|
$lookup{$_} = $counter++ foreach $self->term_list; |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
0
|
|
|
|
|
0
|
my @docs = $self->doc_list; |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
0
|
|
|
|
|
0
|
my $fh; |
|
1128
|
0
|
0
|
|
|
|
0
|
if ( defined $file ) { |
|
1129
|
0
|
0
|
|
|
|
0
|
open $fh, "> $file" or croak |
|
1130
|
|
|
|
|
|
|
"Could not open TDM output file: $!"; |
|
1131
|
|
|
|
|
|
|
} else { |
|
1132
|
0
|
|
|
|
|
0
|
*fh = *STDOUT; |
|
1133
|
|
|
|
|
|
|
} |
|
1134
|
0
|
|
|
|
|
0
|
foreach my $doc ( @docs ) { |
|
1135
|
0
|
|
|
|
|
0
|
my $n = $self->{neighbors}{$doc}; |
|
1136
|
|
|
|
|
|
|
|
|
1137
|
0
|
|
|
|
|
0
|
my $row_count = scalar keys %{$n}; |
|
|
0
|
|
|
|
|
0
|
|
|
1138
|
0
|
|
|
|
|
0
|
print $fh $row_count; |
|
1139
|
|
|
|
|
|
|
|
|
1140
|
0
|
|
|
|
|
0
|
foreach my $t ( sort keys %{$doc} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
1141
|
0
|
|
|
|
|
0
|
my $index = $lookup{$t}; |
|
1142
|
0
|
|
|
|
|
0
|
my ( $weight, undef ) = split m/,/, $n->{$t}; |
|
1143
|
0
|
|
|
|
|
0
|
print $fh ' ', $index, ' ', $weight; |
|
1144
|
|
|
|
|
|
|
} |
|
1145
|
0
|
|
|
|
|
0
|
print $fh "\n"; |
|
1146
|
|
|
|
|
|
|
} |
|
1147
|
|
|
|
|
|
|
} |
|
1148
|
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
=item near_neighbors [NODE] |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
Returns a list of neighbor nodes of the same type (doc/doc, or term/term) two |
|
1154
|
|
|
|
|
|
|
hops away. |
|
1155
|
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=cut |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
sub near_neighbors { |
|
1159
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $name, $type ) = @_; |
|
1160
|
|
|
|
|
|
|
|
|
1161
|
0
|
|
|
|
|
0
|
my $node = _nodeify( $type, $name ); |
|
1162
|
|
|
|
|
|
|
|
|
1163
|
0
|
|
|
|
|
0
|
my $n = $self->{neighbors}{$node}; |
|
1164
|
|
|
|
|
|
|
|
|
1165
|
0
|
|
|
|
|
0
|
my %found; |
|
1166
|
0
|
|
|
|
|
0
|
foreach my $next ( keys %{$n} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
1167
|
0
|
|
|
|
|
0
|
foreach my $mynext ( keys %{ $self->{neighbors}{$next} }){ |
|
|
0
|
|
|
|
|
0
|
|
|
1168
|
0
|
|
|
|
|
0
|
$found{$mynext}++; |
|
1169
|
|
|
|
|
|
|
} |
|
1170
|
|
|
|
|
|
|
} |
|
1171
|
0
|
|
|
|
|
0
|
delete $found{$node}; |
|
1172
|
0
|
|
|
|
|
0
|
return keys %found; |
|
1173
|
|
|
|
|
|
|
} |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
=item term_count [DOC] |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
Returns the number of unique terms in a document or, |
|
1179
|
|
|
|
|
|
|
if no document is specified, in the entire collection. |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
=cut |
|
1182
|
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
sub term_count { |
|
1184
|
9
|
|
|
9
|
1
|
2187
|
my ( $self, $doc ) = @_; |
|
1185
|
9
|
50
|
|
|
|
34
|
if ( defined $doc ) { |
|
1186
|
0
|
|
|
|
|
0
|
my $node = $self->{neighbors}{ _nodeify( 'D', $doc) }; |
|
1187
|
0
|
0
|
|
|
|
0
|
return 0 unless defined $node; |
|
1188
|
0
|
|
|
|
|
0
|
return scalar keys %{$node}; |
|
|
0
|
|
|
|
|
0
|
|
|
1189
|
|
|
|
|
|
|
} else { |
|
1190
|
9
|
|
|
|
|
8036
|
return scalar grep /T:/, |
|
1191
|
9
|
|
|
|
|
17
|
keys %{ $self->{neighbors} }; |
|
1192
|
|
|
|
|
|
|
} |
|
1193
|
|
|
|
|
|
|
} |
|
1194
|
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
=item term_list [DOC] |
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
Returns a sorted list of unique terms appearing in the document |
|
1199
|
|
|
|
|
|
|
with identifier DOC, in ASCII-betical order. If no argument is |
|
1200
|
|
|
|
|
|
|
given, returns a sorted term list for the whole collection. |
|
1201
|
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
=cut |
|
1203
|
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
sub term_list { |
|
1205
|
3
|
|
|
3
|
1
|
1345
|
my ( $self, $doc ) = @_; |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
3
|
100
|
|
|
|
16
|
my $node = ( defined $doc ? |
|
1208
|
|
|
|
|
|
|
$self->{neighbors}{ _nodeify( 'D', $doc) } : |
|
1209
|
|
|
|
|
|
|
$self->{neighbors} |
|
1210
|
|
|
|
|
|
|
); |
|
1211
|
|
|
|
|
|
|
|
|
1212
|
1782
|
|
|
|
|
3740
|
sort map { s/^T://o; $_ } |
|
|
1782
|
|
|
|
|
4800
|
|
|
|
3
|
|
|
|
|
1120
|
|
|
1213
|
3
|
|
|
|
|
218
|
grep /^T:/, keys %{ $node }; |
|
1214
|
|
|
|
|
|
|
} |
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
=item word_count [TERM] |
|
1219
|
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
Returns the total occurence count for a term, or if no argument is given, |
|
1221
|
|
|
|
|
|
|
a word count for the entire collection. The word count is always greater than |
|
1222
|
|
|
|
|
|
|
or equal to the term count. |
|
1223
|
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=cut |
|
1225
|
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
sub word_count { |
|
1227
|
|
|
|
|
|
|
|
|
1228
|
4
|
|
|
4
|
1
|
13
|
my ( $self, $term ) = @_; |
|
1229
|
|
|
|
|
|
|
|
|
1230
|
4
|
|
|
|
|
12
|
my $n = $self->{neighbors}; # shortcut |
|
1231
|
|
|
|
|
|
|
|
|
1232
|
4
|
|
|
|
|
9
|
my $count = 0; |
|
1233
|
4
|
|
|
|
|
9
|
my @terms; |
|
1234
|
4
|
100
|
|
|
|
17
|
if ( defined $term ) { |
|
1235
|
3
|
|
|
|
|
8
|
push @terms, $term; |
|
1236
|
|
|
|
|
|
|
} else { |
|
1237
|
1
|
|
|
|
|
3
|
@terms = $self->term_list(); |
|
1238
|
|
|
|
|
|
|
} |
|
1239
|
|
|
|
|
|
|
|
|
1240
|
4
|
|
|
|
|
138
|
foreach my $term (@terms ) { |
|
1241
|
879
|
50
|
|
|
|
2252
|
$term = _nodeify( 'T', $term) unless $term =~/^T:/o; |
|
1242
|
879
|
|
|
|
|
930
|
foreach my $doc ( keys %{ $n->{$term} } ) { |
|
|
879
|
|
|
|
|
2124
|
|
|
1243
|
1092
|
|
|
|
|
2615
|
( undef, my $lcount ) = split /,/, $n->{$term}{$doc}; |
|
1244
|
1092
|
|
|
|
|
2603
|
$count += $lcount; |
|
1245
|
|
|
|
|
|
|
} |
|
1246
|
|
|
|
|
|
|
} |
|
1247
|
|
|
|
|
|
|
|
|
1248
|
4
|
|
|
|
|
94
|
return $count; |
|
1249
|
|
|
|
|
|
|
} |
|
1250
|
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
=item search @QUERY |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
Searches the graph for all of the words in @QUERY. Use find_similar if you |
|
1258
|
|
|
|
|
|
|
want to do a document similarity instead, or mixed_search if you want |
|
1259
|
|
|
|
|
|
|
to search on any combination of words and documents. Returns a pair of hashrefs: |
|
1260
|
|
|
|
|
|
|
the first a reference to a hash of docs and relevance values, the second to |
|
1261
|
|
|
|
|
|
|
a hash of words and relevance values. |
|
1262
|
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
=cut |
|
1264
|
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
sub search { |
|
1266
|
9
|
|
|
9
|
1
|
2906
|
my ( $self, @query ) = @_; |
|
1267
|
9
|
|
|
|
|
27
|
my @nodes = _nodeify( 'T', @query ); |
|
1268
|
9
|
|
|
|
|
40
|
my $results = $self->raw_search( @nodes ); |
|
1269
|
9
|
|
|
|
|
28
|
my ($docs, $words) = _partition( $results ); |
|
1270
|
9
|
|
|
|
|
50
|
return ( $docs, $words); |
|
1271
|
|
|
|
|
|
|
} |
|
1272
|
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
=item simple_search QUERY |
|
1276
|
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
This is the DWIM method - takes a query string as its argument, and returns an array |
|
1278
|
|
|
|
|
|
|
of documents, sorted by relevance. |
|
1279
|
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=cut |
|
1281
|
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
sub simple_search { |
|
1283
|
1
|
|
|
1
|
1
|
12
|
my ( $self, $query ) = @_; |
|
1284
|
1
|
|
|
|
|
3
|
my @words = map { s/\W+//g; lc($_) } |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
5
|
|
|
1285
|
|
|
|
|
|
|
split m/\s+/, $query; |
|
1286
|
1
|
|
|
|
|
4
|
my @nodes = _nodeify( 'T', @words ); |
|
1287
|
1
|
|
|
|
|
4
|
my $results = $self->raw_search( @nodes ); |
|
1288
|
1
|
|
|
|
|
3
|
my ($docs, $words) = _partition( $results ); |
|
1289
|
1
|
|
|
|
|
3
|
my @sorted_docs = sort { $docs->{$b} <=> $docs->{$a} } keys %{$docs}; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
4
|
|
|
1290
|
1
|
|
|
|
|
7
|
return @sorted_docs; |
|
1291
|
|
|
|
|
|
|
} |
|
1292
|
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
=item find_by_title @TITLES |
|
1294
|
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
Given a list of patterns, searches for documents with matching titles |
|
1296
|
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
=cut |
|
1298
|
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
sub find_by_title { |
|
1300
|
3
|
|
|
3
|
1
|
1035
|
my ( $self, @titles ) = @_; |
|
1301
|
3
|
|
|
|
|
4
|
my @found; |
|
1302
|
3
|
|
|
|
|
9
|
my @docs = $self->doc_list(); |
|
1303
|
3
|
|
|
|
|
109
|
my $pattern = join '|', @titles; |
|
1304
|
3
|
|
|
|
|
66
|
my $match_me = qr/$pattern/i; |
|
1305
|
|
|
|
|
|
|
#warn $match_me, "\n"; |
|
1306
|
3
|
|
|
|
|
6
|
foreach my $d ( @docs ) { |
|
1307
|
|
|
|
|
|
|
# warn $d, "\n"; |
|
1308
|
102
|
100
|
|
|
|
318
|
push @found, $d if $d =~ $match_me; |
|
1309
|
|
|
|
|
|
|
} |
|
1310
|
3
|
|
|
|
|
27
|
return @found; |
|
1311
|
|
|
|
|
|
|
} |
|
1312
|
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
=item find_similar @DOCS |
|
1315
|
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
Given an array of document identifiers, performs a similarity search |
|
1317
|
|
|
|
|
|
|
and returns a pair of hashrefs. First hashref is to a hash of docs and relevance |
|
1318
|
|
|
|
|
|
|
values, second is to a hash of words and relevance values. |
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
=cut |
|
1321
|
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
sub find_similar { |
|
1323
|
2
|
|
|
2
|
1
|
2082
|
my ( $self, @docs ) = @_; |
|
1324
|
2
|
|
|
|
|
6
|
my @nodes = _nodeify( 'D', @docs ); |
|
1325
|
2
|
|
|
|
|
6
|
my $results = $self->raw_search( @nodes ); |
|
1326
|
2
|
|
|
|
|
5
|
my ($docs, $words) = _partition( $results ); |
|
1327
|
2
|
|
|
|
|
12
|
return ( $docs, $words); |
|
1328
|
|
|
|
|
|
|
} |
|
1329
|
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
=item merge TYPE, GOOD, @BAD |
|
1332
|
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
Combine all the nodes in @BAD into the node with identifier GOOD. |
|
1334
|
|
|
|
|
|
|
First argument must be one of 'T' or 'D' to indicate term or |
|
1335
|
|
|
|
|
|
|
document nodes. Used to combine synonyms in the graph. |
|
1336
|
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
=cut |
|
1338
|
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
sub merge { |
|
1340
|
2
|
|
|
2
|
1
|
1349
|
my ( $self, $type, $good, @bad ) = @_; |
|
1341
|
2
|
50
|
|
|
|
14
|
croak "must provide a type argument to merge" |
|
1342
|
|
|
|
|
|
|
unless defined $type; |
|
1343
|
2
|
50
|
|
|
|
11
|
croak "Invalid type argument $type to merge [must be one of (D,T)]" |
|
1344
|
|
|
|
|
|
|
unless $type =~ /^[DT]/io; |
|
1345
|
|
|
|
|
|
|
|
|
1346
|
2
|
|
|
|
|
6
|
my $target = _nodeify( $type, $good ); |
|
1347
|
2
|
|
|
|
|
5
|
my @sources = _nodeify( $type, @bad ); |
|
1348
|
|
|
|
|
|
|
|
|
1349
|
2
|
|
|
|
|
7
|
my $tnode = $self->{neighbors}{$target}; |
|
1350
|
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
|
|
1352
|
2
|
|
|
|
|
3
|
foreach my $bad_node ( @sources ) { |
|
1353
|
|
|
|
|
|
|
#print "Examining $bad_node\n"; |
|
1354
|
2
|
50
|
|
|
|
6
|
next if $bad_node eq $target; |
|
1355
|
2
|
|
|
|
|
4
|
my %neighbors = %{$self->{neighbors}{$bad_node}}; |
|
|
2
|
|
|
|
|
20
|
|
|
1356
|
|
|
|
|
|
|
|
|
1357
|
2
|
|
|
|
|
9
|
foreach my $n ( keys %neighbors ) { |
|
1358
|
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
#print "\t $target ($bad_node) neighbor $n\n"; |
|
1360
|
14
|
100
|
|
|
|
35
|
if ( exists $self->{neighbors}{$target}{$n} ) { |
|
1361
|
|
|
|
|
|
|
#print "\t\t$n has link to $bad_node\n"; |
|
1362
|
|
|
|
|
|
|
# combine the local counts for the term members of the edge |
|
1363
|
5
|
|
|
|
|
9
|
my $curr_val = $tnode->{$n}; |
|
1364
|
5
|
|
|
|
|
11
|
my $aug_val = $self->{neighbors}{$bad_node}{$n}; |
|
1365
|
5
|
|
|
|
|
14
|
my ($w1, $c1) = split m/,/, $curr_val; |
|
1366
|
5
|
|
|
|
|
11
|
my ($w2, $c2) = split m/,/, $aug_val; |
|
1367
|
5
|
|
|
|
|
8
|
my $new_count = $c1 + $c2; |
|
1368
|
5
|
|
|
|
|
25
|
$curr_val =~ s/,\d+$/,$new_count/; |
|
1369
|
5
|
|
|
|
|
14
|
$tnode->{$n} = $curr_val; |
|
1370
|
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
} else { |
|
1373
|
|
|
|
|
|
|
|
|
1374
|
9
|
50
|
|
|
|
21
|
die "sanity check failed for existence test" |
|
1375
|
|
|
|
|
|
|
if exists $self->{neighbors}{$target}{$n}; |
|
1376
|
|
|
|
|
|
|
|
|
1377
|
9
|
|
|
|
|
15
|
my $val = $self->{neighbors}{$bad_node}{$n}; |
|
1378
|
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
#print "\tno existing link -- reassigning $target -- $n\n"; |
|
1380
|
|
|
|
|
|
|
# reassign the current value of this edge |
|
1381
|
|
|
|
|
|
|
|
|
1382
|
9
|
|
|
|
|
22
|
$self->{neighbors}{$n}{$target} = $val; |
|
1383
|
9
|
|
|
|
|
20
|
$self->{neighbors}{$target}{$n} = $val; |
|
1384
|
|
|
|
|
|
|
} |
|
1385
|
|
|
|
|
|
|
|
|
1386
|
14
|
|
|
|
|
23
|
delete $self->{neighbors}{$bad_node}{$n}; |
|
1387
|
14
|
|
|
|
|
35
|
delete $self->{neighbors}{$n}{$bad_node}; |
|
1388
|
|
|
|
|
|
|
} |
|
1389
|
2
|
|
|
|
|
14
|
delete $self->{neighbors}{$bad_node}; |
|
1390
|
|
|
|
|
|
|
} |
|
1391
|
|
|
|
|
|
|
} |
|
1392
|
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
=item mixed_search @DOCS |
|
1394
|
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
Given a hashref in the form: |
|
1396
|
|
|
|
|
|
|
{ docs => [ 'Title 1', 'Title 2' ], |
|
1397
|
|
|
|
|
|
|
terms => ['buffalo', 'fox' ], } |
|
1398
|
|
|
|
|
|
|
} |
|
1399
|
|
|
|
|
|
|
Runs a combined search on the terms and documents provided, and |
|
1400
|
|
|
|
|
|
|
returns a pair of hashrefs. The first hashref is to a hash of docs |
|
1401
|
|
|
|
|
|
|
and relevance values, second is to a hash of words and relevance values. |
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
=cut |
|
1404
|
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
sub mixed_search { |
|
1406
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $incoming ) = @_; |
|
1407
|
|
|
|
|
|
|
|
|
1408
|
0
|
0
|
0
|
|
|
0
|
croak "must provide hash ref to mixed_search method" |
|
|
|
|
0
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
unless defined $incoming && |
|
1410
|
|
|
|
|
|
|
ref( $incoming ) && |
|
1411
|
|
|
|
|
|
|
ref( $incoming ) eq 'HASH'; |
|
1412
|
|
|
|
|
|
|
|
|
1413
|
0
|
|
0
|
|
|
0
|
my $tref = $incoming->{'terms'} || []; |
|
1414
|
0
|
|
0
|
|
|
0
|
my $dref = $incoming->{'docs'} || []; |
|
1415
|
|
|
|
|
|
|
|
|
1416
|
0
|
|
|
|
|
0
|
my @dnodes = _nodeify( 'D', @{$dref} ); |
|
|
0
|
|
|
|
|
0
|
|
|
1417
|
0
|
|
|
|
|
0
|
my @tnodes = _nodeify( 'T', @{$tref} ); |
|
|
0
|
|
|
|
|
0
|
|
|
1418
|
|
|
|
|
|
|
|
|
1419
|
0
|
|
|
|
|
0
|
my $results = $self->raw_search( @dnodes, @tnodes ); |
|
1420
|
0
|
|
|
|
|
0
|
my ($docs, $words) = _partition( $results ); |
|
1421
|
0
|
|
|
|
|
0
|
return ( $docs, $words); |
|
1422
|
|
|
|
|
|
|
} |
|
1423
|
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
|
|
1425
|
|
|
|
|
|
|
=item store FILENAME |
|
1426
|
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
Stores the object to a file for later use. Not compatible (yet) |
|
1428
|
|
|
|
|
|
|
with compiled XS version, which will give a fatal error. |
|
1429
|
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
=cut |
|
1431
|
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
sub store { |
|
1433
|
0
|
|
|
0
|
1
|
0
|
my ( $self, @args ) = @_; |
|
1434
|
0
|
0
|
|
|
|
0
|
if ( $self->{'xs'} ) { |
|
1435
|
0
|
|
|
|
|
0
|
croak "Cannot store object when running in XS mode."; |
|
1436
|
|
|
|
|
|
|
} else { |
|
1437
|
0
|
|
|
|
|
0
|
$self->SUPER::nstore(@args); |
|
1438
|
|
|
|
|
|
|
} |
|
1439
|
|
|
|
|
|
|
} |
|
1440
|
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
# Partition - internal method. |
|
1443
|
|
|
|
|
|
|
# Takes a result set and splits it into two hashrefs - one for |
|
1444
|
|
|
|
|
|
|
# words and one for documents |
|
1445
|
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
sub _partition { |
|
1447
|
12
|
|
|
12
|
|
19
|
my ( $e ) = @_; |
|
1448
|
12
|
|
|
|
|
17
|
my ( $docs, $words ); |
|
1449
|
12
|
|
|
|
|
16
|
foreach my $k ( sort { $e->{$b} <=> $e->{$a} } |
|
|
209
|
|
|
|
|
277
|
|
|
|
12
|
|
|
|
|
83
|
|
|
1450
|
|
|
|
|
|
|
keys %{ $e } ) { |
|
1451
|
|
|
|
|
|
|
|
|
1452
|
81
|
|
|
|
|
219
|
(my $name = $k ) =~ s/^[DT]://o; |
|
1453
|
81
|
100
|
|
|
|
285
|
$k =~ /^D:/ ? |
|
1454
|
|
|
|
|
|
|
$docs->{$name} = $e->{$k} : |
|
1455
|
|
|
|
|
|
|
$words->{$name} = $e->{$k} ; |
|
1456
|
|
|
|
|
|
|
} |
|
1457
|
12
|
|
|
|
|
37
|
return ( $docs, $words ); |
|
1458
|
|
|
|
|
|
|
} |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
# return a list of all neighbor nodes |
|
1461
|
|
|
|
|
|
|
sub _neighbors { |
|
1462
|
4
|
|
|
4
|
|
8
|
my ( $self, $node ) = @_; |
|
1463
|
4
|
50
|
|
|
|
13
|
return unless exists $self->{neighbors}{$node}; |
|
1464
|
4
|
|
|
|
|
5
|
return keys %{ $self->{neighbors}{$node} }; |
|
|
4
|
|
|
|
|
44
|
|
|
1465
|
|
|
|
|
|
|
} |
|
1466
|
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
sub _nodeify { |
|
1469
|
188046
|
|
|
188046
|
|
383716
|
my ( $prefix, @list ) = @_; |
|
1470
|
188046
|
|
|
|
|
206997
|
my @nodes; |
|
1471
|
188046
|
|
|
|
|
275645
|
foreach my $item ( @list ) { |
|
1472
|
188046
|
|
|
|
|
531772
|
push @nodes, uc($prefix).':'.$item; |
|
1473
|
|
|
|
|
|
|
} |
|
1474
|
188046
|
100
|
|
|
|
578801
|
( wantarray ? @nodes : $nodes[0] ); |
|
1475
|
|
|
|
|
|
|
} |
|
1476
|
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
sub _read_tdm { |
|
1480
|
0
|
|
|
0
|
|
0
|
my ( $self, $file ) = @_; |
|
1481
|
0
|
0
|
|
|
|
0
|
print "Loading TDM...\n" if $self->{'debug'} > 1; |
|
1482
|
|
|
|
|
|
|
|
|
1483
|
0
|
0
|
|
|
|
0
|
croak "File does not exist" unless -f $file; |
|
1484
|
0
|
0
|
|
|
|
0
|
open my $fh, $file or croak "Could not open $file: $!"; |
|
1485
|
0
|
|
|
|
|
0
|
for ( 1..4 ){ |
|
1486
|
0
|
|
|
|
|
0
|
my $skip = <$fh>; |
|
1487
|
|
|
|
|
|
|
} |
|
1488
|
0
|
|
|
|
|
0
|
my %neighbors; |
|
1489
|
0
|
|
|
|
|
0
|
my $doc = 0; |
|
1490
|
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
######### XS VERSION ############## |
|
1493
|
0
|
0
|
|
|
|
0
|
if ( $self->{'xs'} ) { |
|
1494
|
|
|
|
|
|
|
|
|
1495
|
0
|
|
|
|
|
0
|
my $map = $self->{'node_map'}; # shortcut alias |
|
1496
|
0
|
|
|
|
|
0
|
while (<$fh>) { |
|
1497
|
0
|
|
|
|
|
0
|
chomp; |
|
1498
|
0
|
|
|
|
|
0
|
my $dindex = $self->_add_node( "D:$doc", 2 ); |
|
1499
|
|
|
|
|
|
|
#warn "Added node $doc\n"; |
|
1500
|
0
|
|
|
|
|
0
|
my ( $count, %vals ) = split; |
|
1501
|
0
|
|
|
|
|
0
|
while ( my ( $term, $edge ) = each %vals ) { |
|
1502
|
0
|
|
|
|
|
0
|
$self->{'term_count'}{$term}++; |
|
1503
|
0
|
|
|
|
|
0
|
my $tnode = "T:$term"; |
|
1504
|
|
|
|
|
|
|
|
|
1505
|
0
|
0
|
|
|
|
0
|
my $tindex = ( defined $map->{$tnode} ? |
|
1506
|
|
|
|
|
|
|
$map->{$tnode} : |
|
1507
|
|
|
|
|
|
|
$self->_add_node( $tnode, 1 ) |
|
1508
|
|
|
|
|
|
|
); |
|
1509
|
0
|
|
|
|
|
0
|
$self->{Graph}->set_edge( $dindex, $tindex, $edge ); |
|
1510
|
|
|
|
|
|
|
} |
|
1511
|
0
|
|
|
|
|
0
|
$doc++; |
|
1512
|
|
|
|
|
|
|
} |
|
1513
|
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
####### PURE PERL VERSION ########## |
|
1515
|
|
|
|
|
|
|
} else { |
|
1516
|
0
|
|
|
|
|
0
|
while (<$fh>) { |
|
1517
|
0
|
|
|
|
|
0
|
chomp; |
|
1518
|
0
|
|
|
|
|
0
|
my $dnode = "D:$doc"; |
|
1519
|
0
|
|
|
|
|
0
|
my ( $count, %vals ) = split; |
|
1520
|
0
|
|
|
|
|
0
|
while ( my ( $term, $edge ) = each %vals ) { |
|
1521
|
0
|
|
|
|
|
0
|
$self->{'term_count'}{$term}++; |
|
1522
|
0
|
|
|
|
|
0
|
my $tnode = "T:$term"; |
|
1523
|
|
|
|
|
|
|
|
|
1524
|
0
|
|
|
|
|
0
|
$neighbors{$dnode}{$tnode} = $edge.',1'; |
|
1525
|
0
|
|
|
|
|
0
|
$neighbors{$tnode}{$dnode} = $edge.',1'; |
|
1526
|
|
|
|
|
|
|
} |
|
1527
|
0
|
|
|
|
|
0
|
$doc++; |
|
1528
|
|
|
|
|
|
|
} |
|
1529
|
0
|
|
|
|
|
0
|
$self->{'neighbors'} = \%neighbors; |
|
1530
|
|
|
|
|
|
|
} |
|
1531
|
|
|
|
|
|
|
|
|
1532
|
0
|
0
|
|
|
|
0
|
print "Loaded.\n" if $self->{'debug'} > 1; |
|
1533
|
0
|
|
|
|
|
0
|
$self->{'from_TDM'} = 1; |
|
1534
|
0
|
|
|
|
|
0
|
$self->{'doc_count'} = $doc; |
|
1535
|
|
|
|
|
|
|
} |
|
1536
|
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
|
|
1539
|
|
|
|
|
|
|
# XS version only |
|
1540
|
|
|
|
|
|
|
# |
|
1541
|
|
|
|
|
|
|
# This sub maintains a mapping between node names and integer index |
|
1542
|
|
|
|
|
|
|
# values. |
|
1543
|
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
sub _add_node { |
|
1545
|
0
|
|
|
0
|
|
0
|
my ( $self, $node_name, $type ) = @_; |
|
1546
|
0
|
0
|
|
|
|
0
|
croak "Must provide a type" unless $type; |
|
1547
|
0
|
0
|
|
|
|
0
|
croak "Must provide a node name" unless $node_name; |
|
1548
|
0
|
0
|
|
|
|
0
|
croak "This node already exists" if |
|
1549
|
|
|
|
|
|
|
$self->{'node_map'}{$node_name}; |
|
1550
|
|
|
|
|
|
|
|
|
1551
|
0
|
|
|
|
|
0
|
my $new_id = $self->{'next_free_id'}++; |
|
1552
|
0
|
|
|
|
|
0
|
$self->{'node_map'}{$node_name} = $new_id; |
|
1553
|
0
|
|
|
|
|
0
|
$self->{'id_map'}[$new_id] = $node_name; |
|
1554
|
0
|
|
|
|
|
0
|
$self->{'Graph'}->add_node( $new_id, $type ); |
|
1555
|
|
|
|
|
|
|
|
|
1556
|
0
|
|
|
|
|
0
|
return $new_id; |
|
1557
|
|
|
|
|
|
|
} |
|
1558
|
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
# |
|
1562
|
|
|
|
|
|
|
# INTERNAL METHODS |
|
1563
|
|
|
|
|
|
|
# |
|
1564
|
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
# each node should have the same number of inbound |
|
1566
|
|
|
|
|
|
|
# and outbound links |
|
1567
|
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
sub check_consistency { |
|
1569
|
|
|
|
|
|
|
|
|
1570
|
37
|
|
|
37
|
0
|
60
|
my ( $self ) = @_; |
|
1571
|
37
|
|
|
|
|
57
|
my %inbound; |
|
1572
|
|
|
|
|
|
|
my %outbound; |
|
1573
|
|
|
|
|
|
|
|
|
1574
|
|
|
|
|
|
|
|
|
1575
|
37
|
|
|
|
|
62
|
foreach my $node ( keys %{$self->{neighbors}} ) { |
|
|
37
|
|
|
|
|
3748
|
|
|
1576
|
17194
|
50
|
|
|
|
48162
|
next unless $node =~ /^[DT]:/; # for MLDBM compatibility |
|
1577
|
17194
|
|
|
|
|
18401
|
$outbound{$node} = scalar keys %{$self->{neighbors}{$node}}; |
|
|
17194
|
|
|
|
|
40965
|
|
|
1578
|
17194
|
|
|
|
|
19908
|
foreach my $neighbor ( keys %{ $self->{neighbors}{$node} } ) { |
|
|
17194
|
|
|
|
|
43491
|
|
|
1579
|
39868
|
|
|
|
|
74583
|
$inbound{$neighbor}++; |
|
1580
|
|
|
|
|
|
|
} |
|
1581
|
|
|
|
|
|
|
} |
|
1582
|
|
|
|
|
|
|
|
|
1583
|
37
|
|
|
|
|
1361
|
my $in = scalar keys %inbound; |
|
1584
|
37
|
|
|
|
|
76
|
my $out = scalar keys %outbound; |
|
1585
|
37
|
50
|
|
|
|
128
|
carp "number of nodes with inbound links ($in) does not match number of nodes with outbound links ( $out )" |
|
1586
|
|
|
|
|
|
|
unless scalar keys %inbound == scalar keys %outbound; |
|
1587
|
|
|
|
|
|
|
|
|
1588
|
37
|
|
|
|
|
2195
|
foreach my $node ( keys %inbound ) { |
|
1589
|
17194
|
|
50
|
|
|
28993
|
$outbound{$node} ||= 0; |
|
1590
|
17194
|
50
|
|
|
|
37030
|
carp "$node has $inbound{$node} inbound links, $outbound{$node} outbound links\n" |
|
1591
|
|
|
|
|
|
|
unless $inbound{$node} == $outbound{$node}; |
|
1592
|
|
|
|
|
|
|
} |
|
1593
|
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
} |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
=item have_edge RAWNODE1, RAWNODE2 |
|
1598
|
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
Returns true if the nodes share an edge. Node names must be prefixed with 'D' or 'T' |
|
1600
|
|
|
|
|
|
|
as appropriate. |
|
1601
|
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
=cut |
|
1603
|
|
|
|
|
|
|
|
|
1604
|
|
|
|
|
|
|
sub have_edge { |
|
1605
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $node1, $node2 ) = @_; |
|
1606
|
0
|
|
|
|
|
0
|
return exists $self->{neighbors}{$node1}{$node2}; |
|
1607
|
|
|
|
|
|
|
} |
|
1608
|
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
{ |
|
1611
|
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
my %visited; |
|
1613
|
|
|
|
|
|
|
my %component; |
|
1614
|
|
|
|
|
|
|
my $depth; |
|
1615
|
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
=item connected_components |
|
1617
|
|
|
|
|
|
|
|
|
1618
|
|
|
|
|
|
|
Returns an array of connected components in the graph. Each component is a list |
|
1619
|
|
|
|
|
|
|
of nodes that are mutually accessible by traveling along edges. |
|
1620
|
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
=cut |
|
1622
|
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
sub connected_components { |
|
1624
|
0
|
|
|
0
|
1
|
0
|
my ( $self ) = @_; |
|
1625
|
|
|
|
|
|
|
|
|
1626
|
0
|
|
|
|
|
0
|
%visited = (); # clear any old info |
|
1627
|
0
|
|
|
|
|
0
|
%component = (); |
|
1628
|
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
|
|
1630
|
0
|
|
|
|
|
0
|
my $n = $self->{neighbors}; |
|
1631
|
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
|
|
1633
|
0
|
|
|
|
|
0
|
my @node_list = keys %{$n}; |
|
|
0
|
|
|
|
|
0
|
|
|
1634
|
0
|
|
|
|
|
0
|
my @components; |
|
1635
|
|
|
|
|
|
|
|
|
1636
|
0
|
|
|
|
|
0
|
while ( @node_list ) { |
|
1637
|
0
|
|
|
|
|
0
|
my $start = shift @node_list; |
|
1638
|
0
|
0
|
|
|
|
0
|
next if exists $visited{$start}; |
|
1639
|
|
|
|
|
|
|
|
|
1640
|
0
|
0
|
|
|
|
0
|
last unless $start; |
|
1641
|
0
|
|
|
|
|
0
|
warn "Visiting neighbors for $start\n"; |
|
1642
|
0
|
|
|
|
|
0
|
visit_neighbors( $n, $start ); |
|
1643
|
0
|
|
|
|
|
0
|
push @components, [ keys %component ]; |
|
1644
|
0
|
|
|
|
|
0
|
%component = (); |
|
1645
|
|
|
|
|
|
|
} |
|
1646
|
|
|
|
|
|
|
|
|
1647
|
0
|
|
|
|
|
0
|
warn "Found ", scalar @components, " connected components\n"; |
|
1648
|
0
|
|
|
|
|
0
|
return @components; |
|
1649
|
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
} |
|
1652
|
|
|
|
|
|
|
|
|
1653
|
|
|
|
|
|
|
sub visit_neighbors { |
|
1654
|
0
|
|
|
0
|
0
|
0
|
my ( $g, $l ) = @_; |
|
1655
|
0
|
0
|
|
|
|
0
|
return if $visited{$l}; |
|
1656
|
0
|
|
|
|
|
0
|
$depth++; |
|
1657
|
0
|
|
|
|
|
0
|
$visited{$l}++; $component{$l}++; |
|
|
0
|
|
|
|
|
0
|
|
|
1658
|
0
|
|
|
|
|
0
|
warn $depth, " $l\n"; |
|
1659
|
0
|
|
|
|
|
0
|
my @neigh = keys %{ $g->{$l} }; |
|
|
0
|
|
|
|
|
0
|
|
|
1660
|
0
|
|
|
|
|
0
|
foreach my $n ( @neigh ) { |
|
1661
|
0
|
|
|
|
|
0
|
visit_neighbors( $g, $n ); |
|
1662
|
|
|
|
|
|
|
} |
|
1663
|
0
|
|
|
|
|
0
|
$depth--; |
|
1664
|
|
|
|
|
|
|
} |
|
1665
|
|
|
|
|
|
|
} |
|
1666
|
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
|
|
1668
|
|
|
|
|
|
|
# Wipe the graph free of stored energies |
|
1669
|
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
sub _clear { |
|
1671
|
12
|
|
|
12
|
|
18
|
my ( $self ) = @_; |
|
1672
|
12
|
|
|
|
|
31
|
$self->{'energy'} = undef; |
|
1673
|
|
|
|
|
|
|
} |
|
1674
|
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
|
|
1676
|
|
|
|
|
|
|
# Gather the stored energy values from the graph |
|
1677
|
|
|
|
|
|
|
|
|
1678
|
|
|
|
|
|
|
sub _collect { |
|
1679
|
12
|
|
|
12
|
|
15
|
my ( $self ) = @_; |
|
1680
|
12
|
|
|
|
|
17
|
my $e = $self->{'energy'}; |
|
1681
|
12
|
|
|
|
|
20
|
my $result = {}; |
|
1682
|
12
|
|
|
|
|
18
|
foreach my $k ( keys %{$self->{'energy'}} ) { |
|
|
12
|
|
|
|
|
40
|
|
|
1683
|
81
|
50
|
|
|
|
159
|
next unless $e->{$k} > $self->{'COLLECT_THRESHOLD'}; |
|
1684
|
81
|
|
|
|
|
130
|
$result->{$k} = $e->{$k}; |
|
1685
|
|
|
|
|
|
|
} |
|
1686
|
12
|
|
|
|
|
28
|
return $result; |
|
1687
|
|
|
|
|
|
|
} |
|
1688
|
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
|
|
1692
|
|
|
|
|
|
|
# Assign a starting energy ENERGY to NODE, and recursively distribute the |
|
1693
|
|
|
|
|
|
|
# energy to neighbor nodes. Singleton nodes get special treatment |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
sub _energize { |
|
1696
|
|
|
|
|
|
|
|
|
1697
|
81
|
|
|
81
|
|
113
|
my ( $self, $node, $energy ) = @_; |
|
1698
|
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
|
|
1700
|
81
|
50
|
|
|
|
206
|
return unless defined $self->{neighbors}{$node}; |
|
1701
|
81
|
|
50
|
|
|
340
|
my $orig = $self->{energy}{$node} || 0; |
|
1702
|
81
|
|
|
|
|
154
|
$self->{energy}->{$node} += $energy; |
|
1703
|
81
|
50
|
|
|
|
180
|
return if ( $self->{depth} == $self->{max_depth} ); |
|
1704
|
81
|
|
|
|
|
83
|
$self->{depth}++; |
|
1705
|
|
|
|
|
|
|
|
|
1706
|
81
|
50
|
|
|
|
189
|
if ( $self->{'debug'} > 1 ) { |
|
1707
|
0
|
|
|
|
|
0
|
print ' ' x $self->{'depth'}; |
|
1708
|
0
|
|
|
|
|
0
|
print "$node: energizing $orig + $energy\n"; |
|
1709
|
|
|
|
|
|
|
} |
|
1710
|
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
|
|
1712
|
81
|
|
|
|
|
98
|
my $n = $self->{neighbors}; |
|
1713
|
|
|
|
|
|
|
|
|
1714
|
|
|
|
|
|
|
#sleep 1; |
|
1715
|
81
|
|
|
|
|
78
|
my $degree = scalar keys %{ $n->{$node} }; |
|
|
81
|
|
|
|
|
138
|
|
|
1716
|
|
|
|
|
|
|
|
|
1717
|
|
|
|
|
|
|
|
|
1718
|
81
|
50
|
|
|
|
147
|
if ( $degree == 0 ) { |
|
1719
|
|
|
|
|
|
|
|
|
1720
|
0
|
|
|
|
|
0
|
carp "WARNING: reached a node without neighbors: $node at search depth $self->{depth}\n"; |
|
1721
|
0
|
|
|
|
|
0
|
$self->{depth}--; |
|
1722
|
0
|
|
|
|
|
0
|
return; |
|
1723
|
|
|
|
|
|
|
} |
|
1724
|
|
|
|
|
|
|
|
|
1725
|
|
|
|
|
|
|
|
|
1726
|
81
|
|
|
|
|
152
|
my $subenergy = $energy / (log($degree)+1); |
|
1727
|
|
|
|
|
|
|
|
|
1728
|
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
# At singleton nodes (words that appear in only one document, for example) |
|
1730
|
|
|
|
|
|
|
# Don't spread energy any further. This avoids a "reflection" back and |
|
1731
|
|
|
|
|
|
|
# forth from singleton nodes to their neighbors. |
|
1732
|
|
|
|
|
|
|
|
|
1733
|
81
|
100
|
100
|
|
|
298
|
if ( $degree == 1 and $energy < $self->{'START_ENERGY'} ) { |
|
|
|
100
|
|
|
|
|
|
|
1734
|
|
|
|
|
|
|
|
|
1735
|
|
|
|
|
|
|
#do nothing |
|
1736
|
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
} elsif ( $subenergy > $self->{ACTIVATE_THRESHOLD} ) { |
|
1738
|
12
|
50
|
|
|
|
33
|
print ' ' x $self->{'depth'}, |
|
1739
|
|
|
|
|
|
|
"$node: propagating subenergy $subenergy to $degree neighbors\n" |
|
1740
|
|
|
|
|
|
|
if $self->{'debug'} > 1; |
|
1741
|
12
|
|
|
|
|
15
|
foreach my $neighbor ( keys %{ $n->{$node} } ) { |
|
|
12
|
|
|
|
|
44
|
|
|
1742
|
69
|
|
|
|
|
148
|
my $pair = $n->{$node}{$neighbor}; |
|
1743
|
69
|
|
|
|
|
166
|
my ( $edge, undef ) = split /,/, $pair; |
|
1744
|
69
|
|
|
|
|
149
|
my $weighted_energy = $subenergy * $edge; |
|
1745
|
69
|
50
|
|
|
|
135
|
print ' ' x $self->{'depth'}, |
|
1746
|
|
|
|
|
|
|
" edge $edge ($node, $neighbor)\n" |
|
1747
|
|
|
|
|
|
|
if $self->{'debug'} > 1; |
|
1748
|
69
|
|
|
|
|
140
|
$self->_energize( $neighbor, $weighted_energy ); |
|
1749
|
|
|
|
|
|
|
} |
|
1750
|
|
|
|
|
|
|
} |
|
1751
|
81
|
|
|
|
|
101
|
$self->{'depth'}--; |
|
1752
|
81
|
|
|
|
|
153
|
return 1; |
|
1753
|
|
|
|
|
|
|
} |
|
1754
|
|
|
|
|
|
|
|
|
1755
|
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
# Given an array, normalize using cosine normalization |
|
1757
|
|
|
|
|
|
|
|
|
1758
|
|
|
|
|
|
|
sub __normalize { |
|
1759
|
5630
|
|
|
5630
|
|
7511
|
my ( $arr ) = @_; |
|
1760
|
|
|
|
|
|
|
|
|
1761
|
5630
|
50
|
33
|
|
|
41861
|
croak "Must provide array ref to __normalize" unless |
|
|
|
|
33
|
|
|
|
|
|
1762
|
|
|
|
|
|
|
defined $arr and |
|
1763
|
|
|
|
|
|
|
ref $arr and |
|
1764
|
|
|
|
|
|
|
ref $arr eq 'ARRAY'; |
|
1765
|
|
|
|
|
|
|
|
|
1766
|
5630
|
|
|
|
|
6060
|
my $sum; |
|
1767
|
5630
|
|
|
|
|
6009
|
$sum += $_->[2] foreach @{$arr}; |
|
|
5630
|
|
|
|
|
81879
|
|
|
1768
|
5630
|
|
|
|
|
8577
|
$_->[2]/= $sum foreach @{$arr}; |
|
|
5630
|
|
|
|
|
68705
|
|
|
1769
|
5630
|
|
|
|
|
15044
|
return 1; |
|
1770
|
|
|
|
|
|
|
} |
|
1771
|
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
|
|
1773
|
|
|
|
|
|
|
|
|
1774
|
|
|
|
|
|
|
|
|
1775
|
|
|
|
|
|
|
sub DESTROY { |
|
1776
|
15
|
|
|
15
|
|
35508
|
undef $_[0]->{Graph} |
|
1777
|
|
|
|
|
|
|
} |
|
1778
|
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
1; |
|
1780
|
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
__END__ |