File Coverage

blib/lib/Search/ContextGraph.pm

Criterion	Covered	Total	%
statement	383	649	59.0
branch	100	250	40.0
condition	25	66	37.8
subroutine	46	71	64.7
pod	35	50	70.0
total	589	1086	54.2

line	stmt	bran	cond	sub	pod	time	code
1							package Search::ContextGraph;
2
3	13			13		325052	use strict;
	13					32
	13					445
4	13			13		71	use warnings;
	13					40
	13					383
5	13			13		69	use Carp;
	13					27
	13					1223
6	13			13		94	use base "Storable";
	13					22
	13					16539
7	13			13		62118	use File::Find;
	13					29
	13					1383
8	13			13		27029	use IO::Socket;
	13					523789
	13					73
9
10							our $VERSION = '0.15';
11
12
13							my $count = 0;
14
15
16							=head1 NAME
17
18							Search::ContextGraph - spreading activation search engine
19
20							=head1 SYNOPSIS
21
22							use Search::ContextGraph;
23
24							my $cg = Search::ContextGraph->new();
25
26							# first you add some documents, perhaps all at once...
27
28							my %docs = (
29							'first' => [ 'elephant', 'snake' ],
30							'second' => [ 'camel', 'pony' ],
31							'third' => { 'snake' => 2, 'constrictor' => 1 },
32							);
33
34							$cg->bulk_add( %docs );
35
36							# or in a loop...
37
38							foreach my $title ( keys %docs ) {
39							$cg->add( $title, $docs{$title} );
40							}
41
42							# or from a file...
43
44							my $cg = Search::ContextGraph->load_from_dir( "./myfiles" );
45
46							# you can store a graph object for later use
47
48							$cg->store( "stored.cng" );
49
50							# and retrieve it later...
51
52							my $cg = ContextGraph->retrieve( "stored.cng" );
53
54
55							# SEARCHING
56
57							# the easiest way
58
59							my @ranked_docs = $cg->simple_search( 'peanuts' );
60
61
62							# get back both related terms and docs for more power
63
64							my ( $docs, $words ) = $cg->search('snake');
65
66
67							# you can use a document as your query
68
69							my ( $docs, $words ) = $cg->find_similar('First Document');
70
71
72							# Or you can query on a combination of things
73
74							my ( $docs, $words ) =
75							$cg->mixed_search( { docs => [ 'First Document' ],
76							terms => [ 'snake', 'pony' ]
77							);
78
79
80							# Print out result set of returned documents
81							foreach my $k ( sort { $docs->{$b} <=> $docs->{$a} }
82							keys %{ $docs } ) {
83							print "Document $k had relevance ", $docs->{$k}, "\n";
84							}
85
86
87
88							# Reload it
89							my $new = Search::ContextGraph->retrieve( "filename" );
90
91
92
93							=head1 DESCRIPTION
94
95							Spreading activation is a neat technique for building search engines that
96							return accurate results for a query even when there is no exact keyword match.
97							The engine works by building a data structure called a B, which
98							is a giant network of document and term nodes. All document nodes are connected
99							to the terms that occur in that document; similarly, every term node is connected
100							to all of the document nodes that term occurs in. We search the graph by
101							starting at a query node and distributing a set amount of energy to its neighbor
102							nodes. Then we recurse, diminishing the energy at each stage, until this
103							spreading energy falls below a given threshold. Each node keeps track of
104							accumulated energy, and this serves as our measure of relevance.
105
106							This means that documents that have many words in common will appear similar to the
107							search engine. Likewise, words that occur together in many documents will be
108							perceived as semantically related. Especially with larger, coherent document
109							collections, the search engine can be quite effective at recognizing synonyms
110							and finding useful relationships between documents. You can read a full
111							description of the algorithm at L.
112
113							The search engine gives expanded recall (relevant results even when there is no
114							keyword match) without incurring the kind of computational and patent issues
115							posed by latent semantic indexing (LSI). The technique used here was originally
116							described in a 1981 dissertation by Scott Preece.
117
118							=head1 CONSTRUCTORS
119
120							=over
121
122							=item new %PARAMS
123
124							Object constructor. Possible parameters:
125
126							=over
127
128							=item auto_reweight
129
130							Rebalance the graph every time a change occurs. Default is true.
131							Disable and do by hand using L for better performance in
132							graphs with frequent updates/additions/deletions.
133
134
135							=item debug LEVEL
136
137							Set this to 1 or 2 to turn on verbose debugging output
138
139							=item max_depth
140
141							Set the maximum distance to spread energy out from the start
142							node. Default is effectively unlimited. You can tweak it using L.
143							Comes in handy if you find searches are too slow.
144
145							=item xs
146
147							When true, tells the module to use compiled C internals. This reduces
148							memory requirements by about 60%, but actually runs a little slower than the
149							pure Perl version. Don't bother to turn it on unless you have a huge graph.
150							Default is pure Perl.
151
152							=over
153
154							=item * using the compiled version makes it impossible to store the graph to disk.
155
156							=item * xs is B in version 0.09. But it will return in triumph!
157
158							=back
159
160							=item START_ENERGY
161
162							Initial energy to assign to a query node. Default is 100.
163
164							=item ACTIVATE_THRESHOLD
165
166							Minimal energy needed to propagate search along the graph. Default is 1.
167
168							=item COLLECT_THRESHOLD
169
170							Minimal energy needed for a node to enter the result set. Default is 1.
171
172							=back
173
174							=cut
175
176
177							sub new {
178	15			15	1	212	my ( $class, %params) = @_;
179
180							# backwards compatible...
181	15					75	*add_document = \&add;
182	15					44	*add_documents = \&bulk_add;
183
184							# plucene friendly
185	15					42	*optimize = \&reweight_graph;
186	15					66	*is_indexed = \&has_doc;
187
188							# fail on all unknown paramters (helps fight typos)
189	15					82	my @allowed = qw/debug auto_reweight use_global_weights max_depth START_ENERGY ACTIVATE_THRESHOLD COLLECT_THRESHOLD use_file xs/;
190	15					28	my %check;
191	15					161	$check{$_}++ foreach @allowed;
192
193	15					34	my @forbidden;
194	15					55	foreach my $k ( keys %params ) {
195	12	50				52	push @forbidden, $k unless exists $check{$k};
196							}
197	15	50				57	if ( @forbidden ) {
198	0					0	croak "The following unrecognized parameters were detected: ",
199							join ", ", @forbidden;
200							}
201
202
203	15					167	my $obj = bless
204							{ debug => 0,
205							auto_reweight => 1,
206							use_global_weights => 1,
207							max_depth => 100000000,
208							START_ENERGY => 100,
209							ACTIVATE_THRESHOLD => 1,
210							COLLECT_THRESHOLD => .2,
211							%params,
212
213							depth => 0,
214							neighbors => {},
215
216							},
217							$class;
218
219
220	15	100				180	if ( $obj->{use_file} ) {
221	1					3	my %neighbors;
222	13			13		24279	use MLDBM qw/DB_File Storable/;
	13					51331
	13					100
223	13			13		518	use Fcntl;
	13					31
	13					138808
224	1					165	warn "Using MLDBM: $obj->{use_file}";
225	1	50				12	$obj->{neighbors} = tie %neighbors, 'MLDBM', $obj->{use_file} or die $!;
226							#$obj->{neighbors} = \%neighbors;
227
228
229							}
230
231	14					79	return $obj;
232
233							}
234
235
236							=item load_from_dir DIR [, \&PARSE ]
237
238							Load documents from a directory. Takes two arguments, a directory path
239							and an optional parsing subroutine. If the parsing subroutine is passed
240							an argument, it will use it to extract term tokens from the file.
241							By default, the file is split on whitespace and stripped of numbers and
242							punctuation.
243
244							=cut
245
246							{
247							my $parse_sub;
248
249							sub load_from_dir {
250	0			0	1	0	my ( $class, $dir, $code ) = @_;
251
252	0	0				0	croak "$dir is not a directory" unless -d $dir;
253
254	0					0	require File::Find;
255	0	0	0			0	unless ( defined $code
			0
256							and ref $code
257							and ref $code eq 'CODE' ) {
258							$code = sub {
259	0			0		0	my $text = shift;
260	0					0	$text =~ s/[^\w]/ /gs;
261	0					0	my @toks = split /\s+/m, $text;
262	0					0	return grep { length($_) > 1 } @toks;
	0					0
263	0					0	};
264							}
265
266	0					0	$parse_sub = $code;
267	0					0	my %docs;
268
269							# Recursively open every file and provide the contents
270							# to whatever parsing subroutine we're using
271
272							my $reader =
273
274							sub {
275	0			0		0	my ( $parse ) = @_;
276	0	0				0	return if /^\./;
277	0	0				0	return unless -f $_;
278	0	0				0	open my $fh, $_ or
279							croak "Could not open file $File::Find::name: $!";
280	0					0	local $/;
281	0					0	my $contents = <$fh>;
282	0	0				0	close $fh or croak "failed to close filehandle";
283	0					0	my @words = $parse_sub->($contents);
284	0					0	$docs{ $File::Find::name } = \@words;
285	0					0	};
286
287
288	0					0	find( $reader , $dir );
289	0					0	my $self = __PACKAGE__->new();
290	0					0	$self->bulk_add( %docs );
291	0					0	return $self;
292							}
293							}
294
295
296
297							=item load_from_tdm FILENAME
298
299							Opens and loads a term-document matrix (TDM) file to initialize the graph.
300							The TDM encodes information about term-to-document links.
301							This is a legacy method mainly for the convenience of the module author.
302							For notes on the proper file format, see the README file.
303							=cut
304
305							sub load_from_tdm {
306	0			0	1	0	my ( $self, $file ) = @_;
307	0	0				0	croak "TDM file $file does not exist" unless -f $file;
308	0	0				0	return if $self->{'loaded'};
309	0					0	$self->_read_tdm( $file );
310	0					0	$self->{'loaded'} = 1;
311	0					0	$self->reweight_graph();
312							}
313
314
315							=item rename OLD, NEW
316
317							Renames a document. Will return undef if the new name is already in use.
318
319							=cut
320							sub rename {
321
322	0			0	1	0	my ( $self, $old, $new ) = @_;
323	0	0	0			0	croak "rename method needs two arguments" unless
324							defined $old and defined $new;
325	0	0				0	croak "document $old not found" unless
326							exists $self->{neighbors}{ _nodeify('D', $old ) };
327
328	0					0	my $bad = _nodeify( 'D', $old );
329	0					0	my $good = _nodeify( 'D', $new );
330
331	0	0				0	return if exists $self->{neighbors}{$good};
332
333	0					0	my $s = $self->{neighbors};
334	0					0	foreach my $n ( keys %{ $s->{$bad} } ) {
	0					0
335	0					0	$s->{$good}{$n} =
336							$s->{$n}{$good} =
337							$s->{$bad}{$n};
338	0					0	delete $s->{$bad}{$n};
339	0					0	delete $s->{$n}{$bad};
340							}
341	0					0	delete $s->{$bad};
342	0					0	return 1;
343
344							}
345
346
347
348							=item retrieve FILENAME
349
350							Loads a previously stored graph from disk, using Storable.
351
352							=cut
353
354							sub retrieve {
355	0			0	1	0	my ( $self, $file ) = @_;
356	0	0				0	croak "Must provide a filename to retrieve graph"
357							unless $file;
358	0	0				0	croak "'$file' is not a file" unless
359							-f $file;
360
361	0					0	Storable::retrieve( $file );
362							}
363
364
365							=back
366
367							=head1 ACCESSORS
368
369							=over
370
371							=item [get\|set]_activate_threshold
372
373							Accessor for node activation threshold value. This value determines how far
374							energy can spread in the graph. Lower it to increase the number of results.
375							Default is 1.
376
377							=cut
378
379	2			2	0	1147	sub get_activate_threshold { $_[0]->{'ACTIVATE_THRESHOLD'} }
380							sub set_activate_threshold {
381	3			3	0	818	my ( $self, $threshold ) = @_;
382	3	100				123	croak "Can't set activate threshold to zero"
383							unless $threshold;
384	2	100				110	croak "Can't set activate threshold to negative value"
385							unless $threshold > 0;
386	1					6	$self->{'ACTIVATE_THRESHOLD'} = $_[1];
387							}
388
389
390							=item [get\|set]_auto_reweight
391
392							Accessor for auto reweight flag. If true, edge weights will be recalculated
393							every time a document is added, updated or removed. This can significantly slow
394							down large graphs. On by default.
395
396							=cut
397
398	0			0	0	0	sub get_auto_reweight{ $_[0]->{auto_reweight} }
399	0			0	0	0	sub set_auto_reweight{ $_[0]->{auto_reweight} = $_[0]->[1]; }
400
401
402							=item [get\|set]_collect_threshold
403
404							Accessor for collection threshold value. This determines how much energy a
405							node must have to make it into the result set. Lower it to increase the
406							number of results. Default is 1.
407
408							=cut
409
410							sub get_collect_threshold {
411	2	50		2	0	21	return ( $_[0]->{'xs'} ?
412							$_[0]->{Graph}->collectionThreshold :
413							$_[0]->{'COLLECT_THRESHOLD'})
414							}
415
416							sub set_collect_threshold {
417	2			2	0	5	my ( $self, $newval ) = @_;
418
419	2		100			10	$newval \|\|=0;
420
421	2	50				7	$self->{Graph}->collectionThreshold( $newval )
422							if $self->{'xs'};
423
424	2		100			10	$self->{'COLLECT_THRESHOLD'} = $newval \|\| 0;
425	2					6	return 1;
426							}
427
428							=item [get\|set]_debug_mode LEVEL
429
430							Turns debugging on or off. 1 is verbose, 2 is very verbose, 0 is off.
431
432							=cut
433
434	0			0	0	0	sub get_debug_mode { $_[0]->{debug} }
435							sub set_debug_mode {
436	0			0	0	0	my ( $self, $mode ) = @_;
437	0					0	$self->{'debug'} = $mode;
438							}
439
440
441
442							=item [get\|set]_initial_energy
443
444							Accessor for initial energy value at the query node. This controls how
445							much energy gets poured into the graph at the start of the search.
446							Increase this value to get more results from your queries.
447
448							=cut
449
450	2			2	0	15	sub get_initial_energy { $_[0]->{'START_ENERGY'} }
451							sub set_initial_energy {
452	2			2	0	6	my ( $self, $start_energy ) = @_;
453	2	50				9	croak "Can't set initial energy to zero"
454							unless $start_energy;
455	2	100				115	croak "Can't set initial energy to negative value"
456							unless $start_energy > 0;
457	1					30	$self->{'START_ENERGY'} = $start_energy ;
458							}
459
460							=item [get\|set]_max_depth LEVEL
461
462							You can tell the graph to cut off searches after a certain distance from
463							the query node. This can speed up searches on very large graphs, and has
464							little adverse effect, especially if you are interested in just the first
465							few search results. Set this value to undef to restore the default (10^8).
466
467							=cut
468
469	3			3	0	450	sub get_max_depth { $_[0]->{max_depth} }
470	3	100		3	0	215	sub set_max_depth { croak "Tried to set maximum depth to an undefined value"
471							unless defined $_[1];
472	2		100			19	$_[0]->{max_depth} = $_[1] \|\| 100000000
473							}
474
475
476
477
478							=back
479
480							=head1 METHODS
481
482							=over
483
484							=item add DOC, WORDS
485
486							Add a document to the search engine. Takes as arguments a unique doc
487							identifier and a reference to an array or hash of words in the
488							document.
489							For example:
490
491							TITLE => { WORD1 => COUNT1, WORD2 => COUNT2 ... }
492
493							or
494
495							TITLE => [ WORD1, WORD2, WORD3 ]
496
497							Use L if you want to pass in a bunch of docs all at once.
498
499							=cut
500
501
502							sub add {
503
504	420			420	1	28619	my ( $self, $title, $words ) = @_;
505
506
507	420	50				1228	croak "Please provide a word list" unless defined $words;
508	420	50	66			3263	croak "Word list is not a reference to an array or hash"
			66
509							unless ref $words and ref $words eq "HASH" or ref $words eq "ARRAY";
510
511	420	50				934	croak "Please provide a document identifier" unless defined $title;
512
513	420					853	my $dnode = _nodeify( 'D', $title );
514	420	50				1461	croak "Tried to add document with duplicate identifier: '$title'\n"
515							if exists $self->{neighbors}{$dnode};
516
517	420					668	my @list;
518	420	100				1104	if ( ref $words eq 'ARRAY' ) {
519	419					489	@list = @{$words};
	419					3733
520							} else {
521	1					3	@list = keys %{$words};
	1					46
522							}
523
524	420	50				1048	croak "Tried to add a document with no content" unless scalar @list;
525
526	420					493	my @edges;
527	420					905	foreach my $term ( @list ) {
528	13489					27818	my $tnode = _nodeify( 'T', lc( $term ) );
529
530							# Local weight for the document
531	13489	100				27162	my $lcount = ( ref $words eq 'HASH' ? $words->{$term} : 1 );
532
533							# Update number of docs this word occurs in
534	13489					35123	my $gcount = ++$self->{term_count}{lc( $term )};
535
536	13489					13877	my $final_weight = 1;
537	13489					33581	push @edges, [ $dnode, $tnode, $final_weight, $lcount ];
538
539							}
540	420					787	$self->{reweight_flag} = 1;
541	420					908	__normalize( \@edges );
542
543
544							=cut
545
546							DEVELOPMENT
547
548							if ( $self->{supersize} ) {
549							my $n = $self->{neighbors};
550							foreach my $e ( @edges ) {
551							#warn "adding edge $e->[0], $e->[1]\n";
552
553							$n->{$e->[0]} = {} unless exists $n->{$e->[0]};
554							$n->{$e->[1]} = {} unless exists $n->{$e->[1]};
555
556							my $tmp = $n->{$e->[0]};
557							$tmp->{$e->[1]} = join ',', $e->[2], $e->[3];
558							$tmp = $n->{$e->[1]};
559							$tmp->{$e->[0]} = join ',', $e->[2], $e->[3];
560							}
561							=cut
562
563
564							# PURE PERL VERSION
565							#} else {
566	420					698	foreach my $e ( @edges ) {
567	13489					72348	$self->{neighbors}{$e->[0]}{$e->[1]} = join ',', $e->[2], $e->[3];
568	13489					75829	$self->{neighbors}{$e->[1]}{$e->[0]} = join ',', $e->[2], $e->[3];
569							}
570							#}
571
572
573							#print "Reweighting graph\n";
574	420	100				2151	$self->reweight_graph() if $self->{auto_reweight};
575	420					15729	return 1;
576
577							}
578
579
580							=item add_file PATH [, name => NAME, parse => CODE]
581
582							Adds a document from a file. By default, uses the PATH provided as the document
583							identifier, and parses the file by splitting on whitespace. If a fancier title,
584							or more elegant parsing behavior is desired, pass in named arguments as indicated.
585							NAME can be any string, CODE should be a reference to a subroutine that takes one
586							argument (the contents of the file) and returns an array of tokens, or a hash in the
587							form TOKEN => COUNT, or a reference to the same.
588
589							=cut
590
591							sub add_file {
592	0			0	1	0	my ( $self, $path, %params ) = @_;
593
594	0	0	0			0	croak "Invalid file '$path' provided to add_file method."
595							unless defined $path and -f $path;
596
597	0	0				0	my $title = ( exists $params{name} ? $params{name} : $path );
598
599	0					0	local $/;
600	0	0				0	open my $fh, $path or croak "Unable to open $path: $!";
601	0					0	my $content = <$fh>;
602
603	0					0	my $ref;
604
605	0	0				0	if ( exists $params{parse} ) {
606	0	0				0	croak "code provided is not a reference" unless
607							ref $params{parse};
608	0	0				0	croak "code provided is not a subroutine" unless
609							ref $params{parse} eq 'CODE';
610
611	0					0	$ref = $params{parse}->( $content );
612	0	0	0			0	croak "did not get an appropriate reference back after parsing"
613							unless ref $ref and ref $ref =~ /(HASH\|ARRAY)/;
614
615
616							} else {
617
618							my $code = sub {
619	0			0		0	my $txt = shift;
620	0					0	$txt =~ s/\W/ /g;
621	0					0	my @toks = split m/\s+/, $txt;
622	0					0	\@toks;
623	0					0	};
624	0					0	$ref = $code->($content);
625							}
626
627	0	0				0	return unless $ref;
628	0					0	$self->add( $title, $ref );
629
630							}
631
632							=item bulk_add DOCS
633
634							Add documents to the graph in bulk. Takes as an argument a hash
635							whose keys are document identifiers, and values are references
636							to hashes in the form { WORD1 => COUNT, WORD2 => COUNT...}
637							This method is faster than adding in documents one by one if
638							you have auto_rebalance turned on.
639
640							=cut
641
642							sub bulk_add {
643
644	0			0	1	0	my ( $self, %incoming_docs ) = @_;
645
646							# Disable graph rebalancing until we've added everything
647							{
648	0					0	local $self->{auto_reweight} = 0;
	0					0
649
650	0					0	foreach my $doc ( keys %incoming_docs ) {
651	0					0	$self->add( $doc, $incoming_docs{$doc});
652							}
653							}
654	0	0				0	$self->reweight_graph() if $self->{auto_reweight};
655							}
656
657
658							=item degree NODE
659
660							Given a raw node, returns the degree (raw node means the node must
661							be prefixed with 'D:' or 'T:' depending on type )
662
663							=cut
664
665	5			5	1	3434	sub degree { scalar keys %{$_[0]->{neighbors}{$_[1]}} }
	5					46
666
667
668							=item delete DOC
669
670							Remove a document from the graph. Takes a document identifier
671							as an argument. Returns 1 if successful, undef otherwise.
672
673							=cut
674
675							sub delete {
676
677	37			37	1	282	my ( $self, $type, $name ) = @_;
678
679	37	50				153	croak "Must provide a node type to delete() method" unless defined $type;
680	37	50				215	croak "Invalid type $type passed to delete method. Must be one of [TD]"
681							unless $type =~ /^[TD]$/io;
682	37	50				84	croak "Please provide a node name" unless defined $name;
683
684	37	50				97	return unless defined $name;
685	37					87	my $node = _nodeify( $type, $name);
686
687	37					87	my $n = $self->{neighbors};
688	37	50				148	croak "Found a neighborless node $node"
689							unless exists $n->{$node};
690
691	37					60	my @terms = keys %{ $n->{$node} };
	37					462
692
693	37	50				179	warn "found ", scalar @terms, " neighbors attached to $node\n"
694							if $self->{debug};
695							# Check to see if we have orphaned any terms
696	37					61	foreach my $t ( @terms ) {
697
698	1205					2144	delete $n->{$node}{$t};
699	1205					2124	delete $n->{$t}{$node};
700
701	1205	100				1260	if ( scalar keys %{ $n->{$t} } == 0 ) {
	1205					2994
702	976	50				1922	warn "\tdeleting orphaned node $t" if $self->{debug};
703	976					3124	my ( $subtype, $name ) = $t =~ /^(.):(.*)$/;
704							#$self->delete( $subtype, $name );
705	976					2199	delete $n->{$t};
706							}
707							}
708
709	37					113	delete $n->{$node};
710	37					156	$self->check_consistency();
711	37					921	$self->{reweight_flag} = 1;
712	37	50				308	$self->reweight_graph if $self->{auto_reweight};
713	37					570	1;
714							}
715
716
717
718							=item has_doc DOC
719
720							Returns true if the document with identifier DOC is in the collection
721
722							=cut
723
724							sub has_doc {
725	4			4	1	15	my ( $self, $doc ) = @_;
726	4	50				10	carp "Received undefined value for has_doc" unless defined $doc;
727	4					9	my $node = _nodeify( 'D', $doc );
728	4		100			34	return exists $self->{neighbors}{$node} \|\| undef;
729							}
730
731							=item has_term TERM
732
733							Returns true if the term TERM is in the collection
734
735							=cut
736
737							sub has_term {
738	4			4	1	6	my ( $self, $term ) = @_;
739	4	50				11	carp "Received undefined value for has_term" unless defined $term;
740	4					8	my $node = _nodeify( 'T', $term );
741	4		100			31	return exists $self->{neighbors}{$node} \|\| undef;
742							}
743
744
745
746							=item distance NODE1, NODE2, TYPE
747
748							Calculates the distance between two nodes of the same type (D or T)
749							using the formula:
750
751							distance = ...
752							=cut
753
754							sub distance {
755	0			0	1	0	my ( $self, $n1, $n2, $type ) = @_;
756	0	0				0	croak unless $type;
757	0					0	$type = lc( $type );
758	0	0				0	croak unless $type =~ /^[dt]$/;
759	0	0				0	my $key = ( $type eq 't' ? 'terms' : 'documents' );
760	0					0	my @shared = $self->intersection( $key => [ $n1, $n2 ] );
761	0	0				0	return 0 unless @shared;
762							#warn "Found ", scalar @shared, " nodes shared between $n1 and $n2\n";
763
764	0					0	my $node1 = _nodeify( $type, $n1 );
765	0					0	my $node2 = _nodeify( $type, $n2 );
766							# formula is w(t1,d1)/deg(d1) + w(t1,d2)/deg(d2) ... ) /deg( t1 )
767
768							#warn "Calculating distance\n";
769	0					0	my $sum1 = 0;
770	0					0	my $sum2 = 0;
771	0					0	foreach my $next ( @shared ) {
772	0					0	my ( undef, $lcount1) = split m/,/, $self->{neighbors}{$node1}{$next};
773	0					0	my ( undef, $lcount2) = split m/,/, $self->{neighbors}{$node2}{$next};
774
775	0					0	my $degree = $self->degree( $next );
776							#warn "\t degree of $next is $degree\n";
777	0					0	my $elem1 = $lcount1 / $degree;
778	0					0	$sum1 += $elem1;
779	0					0	my $elem2 = $lcount2 / $degree;
780	0					0	$sum2 += $elem2;
781							}
782							#warn "sum is $sum1, $sum2\n";
783	0					0	my $final = ($sum1 / $self->degree( $node1 )) + ( $sum2 / $self->degree( $node2 ));
784							#warn "final is $final\n";
785	0					0	return $final;
786
787
788							}
789
790							=item distance_matrix TYPE LIMIT
791
792							Used for clustering using linear local embedding. Produces a similarity matrix
793							in a format I'm too tired to document right now. LIMIT is the maximum number
794							of neighbors to keep for each node.
795
796							=cut
797
798							sub distance_matrix {
799	0			0	1	0	my ( $self, $type, $limit ) = @_;
800	0	0				0	croak "Must provide type argument to distance_matrix()"
801							unless defined $type;
802	0	0				0	croak "must provide limit" unless $limit;
803	0					0	my @nodes;
804	0	0				0	if ( lc( $type ) eq 'd' ) {
		0
805	0					0	@nodes = $self->doc_list();
806							} elsif ( lc( $type ) eq 't' ) {
807	0					0	@nodes = $self->term_list();
808							} else {
809	0					0	croak "Unsupported type $type";
810							}
811
812	0					0	my @ret;
813	0					0	my $count = 0;
814	0					0	foreach my $from ( @nodes ) {
815	0					0	warn $from, " - $count\n";
816	0					0	$count++;
817	0					0	my $index = -1;
818	0					0	my @found;
819	0					0	foreach my $to ( @nodes ) {
820	0					0	$index++;
821	0	0				0	next if $from eq $to;
822	0					0	my $dist = $self->distance( $from, $to, $type );
823	0	0				0	push @found, [ $index, $dist ] if $dist;
824							#print( $index++, ' ', $dist, " " ) if $dist;
825							}
826	0					0	my @sorted = sort { $b->[1] <=> $a->[1] } @found;
	0					0
827	0					0	my @final = splice ( @sorted, 0, $limit );
828	0					0	push @ret, join " ", ( map { join ' ', $_->[0], substr($_->[1], 0, 7) }
	0					0
829	0					0	sort { $a->[0] <=> $b->[0] }
830							@final), "\n";
831							#print "\n";
832							}
833	0					0	return join "\n", @ret;
834
835							}
836
837							=item intersection @NODES
838
839							Returns a list of neighbor nodes that all the given nodes share in common
840
841							=cut
842
843							sub intersection {
844	2			2	1	1324	my ( $self, %nodes ) = @_;
845	2					3	my @nodes;
846	2	100				17	if ( exists $nodes{documents} ) {
847	1					3	push @nodes, map { _nodeify( 'D', $_ ) } @{ $nodes{documents}};
	2					7
	1					4
848							}
849	2	100				33	if ( exists $nodes{terms} ) {
850	1					1	push @nodes, map { _nodeify( 'T', $_ ) } @{ $nodes{terms}};
	2					7
	1					3
851							}
852
853	2					4	my %seen;
854	2					4	foreach my $n ( @nodes ) {
855	4					9	my @neighbors = $self->_neighbors( $n );
856	4					101	$seen{ $_ }++ foreach @neighbors;
857							}
858	4					12	return map { s/^[DT]://; $_ }
	4					23
	131					169
859	2					14	grep { $seen{$_} == scalar @nodes }
860							keys %seen;
861							}
862
863							=item raw_search @NODES
864
865							Given a list of nodes, returns a hash of nearest nodes with relevance values,
866							in the format NODE => RELEVANCE, for all nodes above the threshold value.
867							(You probably want one of L, L, or L instead).
868
869							=cut
870
871							sub raw_search {
872	12			12	1	27	my ( $self, @query ) = @_;
873
874	12					46	$self->_clear();
875	12					37	foreach ( @query ) {
876	12					51	$self->_energize( $_, $self->{'START_ENERGY'});
877							}
878	12					36	my $results_ref = $self->_collect();
879
880
881	12					30	return $results_ref;
882							}
883
884
885
886
887							=item reweight_graph
888
889							Iterates through the graph, calculating edge weights and normalizing
890							around nodes. This method is automatically called every time a
891							document is added, removed, or updated, unless you turn the option
892							off with auto_reweight(0). When adding a lot of docs, this can be
893							time consuming, so either set auto_reweight to off or use the
894							L method to add lots of docs at once
895
896							=cut
897
898							sub reweight_graph {
899	294			294	1	1615	my ( $self ) = @_;
900
901	294					543	my $n = $self->{neighbors}; #shortcut
902	294					766	my $doc_count = $self->doc_count();
903							#print "Renormalizing for doc count $doc_count\n" if $self->{debug};
904	294					7166	foreach my $node ( keys %{$n} ) {
	294					16117
905
906	150286	100				325623	next unless $node =~ /^D:/o;
907	5210	50				24072	warn "reweighting at node $node\n" if $self->{debug} > 1;
908	5210					6133	my @terms = keys %{ $n->{$node} };
	5210					73395
909	5210					19343	my @edges;
910	5210					7402	foreach my $t ( @terms ) {
911
912	173163					357962	my $pair = $n->{$node}{$t};
913	173163					411221	my ( undef, $lcount ) = split /,/, $pair;
914	173163					639016	( my $term = $t ) =~ s/^T://;
915	173163	50				375967	croak "did not receive a local count" unless $lcount;
916	173163					174757	my $weight;
917	173163	50				349496	if ( $self->{use_global_weights} ) {
918
919	173163					349136	my $gweight = log( $doc_count / $self->doc_count( $term ) ) + 1;
920	173163					384076	my $lweight = log( $lcount ) + 1;
921	173163					265720	$weight = ( $gweight * $lweight );
922
923							} else {
924
925	0					0	$weight = log( $lcount ) + 1;
926							}
927	173163					574606	push @edges, [ $node, $t, $weight, $lcount ];
928							}
929
930	5210					13666	__normalize( \@edges );
931
932	5210					9523	foreach my $e ( @edges ) {
933	173163					571156	my $pair = join ',', $e->[2], $e->[3];
934	173163					566737	$n->{$node}{$e->[1]} = $n->{$e->[1]}{$node} = $pair;
935							}
936							}
937	294					17793	$self->{reweight_flag} = 0;
938	294					648	return 1;
939							}
940
941
942
943
944							=item update ID, WORDS
945
946							Given a document identifier and a word list, updates the information for
947							that document in the graph. Returns the number of changes made
948
949							=cut
950
951							sub update {
952
953	2			2	1	68	my ( $self, $id, $words ) = @_;
954
955	2	50				10	croak "update not implemented in XS" if $self->{xs};
956	2	50				7	croak "Must provide a document identifier to update_document" unless defined $id;
957	2					8	my $dnode = _nodeify( 'D', $id );
958
959	2	100				13	return unless exists $self->{neighbors}{$dnode};
960	1	50	33			19	croak "must provide a word list "
			33
			33
961							unless defined $words and
962							ref $words and
963							( ref $words eq 'HASH' or
964							ref $words eq 'ARRAY' );
965
966	1					32	my $n = $self->{neighbors}{$dnode};
967
968							# Get the current word list
969	1					2	my @terms = keys %{ $n };
	1					6
970
971	1	50				6	if ( ref $words eq 'ARRAY' ) {
972	1					2	my %words;
973	1					8	$words{$_}++ foreach @$words;
974	1					4	$words = \%words;
975							}
976
977	1					4	local $self->{auto_reweight} = 0;
978
979	1					2	my $must_reweight = 0;
980	1					2	my %seen;
981
982	1					2	foreach my $term ( keys %{$words} ) {
	1					4
983
984	4					10	my $t = _nodeify( 'T', $term );
985
986	4	100				11	if ( exists $n->{$t} ){
987
988							# Update the local count, if necessary
989	3					7	my $curr_val = $n->{$t};
990	3					9	my ( undef, $loc ) = split m/,/, $curr_val;
991
992	3	50				12	unless ( $loc == $words->{$term} ) {
993	0					0	$n->{$t} = join ',', 1, $words->{$term};
994	0					0	$must_reweight++;
995							}
996							}
997
998							else {
999
1000	1					8	$n->{$t} =
1001							$self->{neighbors}{$t}{$dnode} =
1002							join ',', 1, $words->{$term};
1003	1					2	$must_reweight++;
1004							}
1005
1006	4					12	$seen{$t}++;
1007							}
1008
1009							# Check for deleted words
1010	1					5	foreach my $t ( @terms ) {
1011	3	50				9	$must_reweight++
1012							unless exists $seen{$t};
1013							}
1014
1015	1	50				16	$self->reweight_graph() if
1016							$must_reweight;
1017
1018	1					17	return $must_reweight;
1019
1020							}
1021
1022
1023							=item doc_count [TERM]
1024
1025							Returns a count of all documents that TERM occurs in.
1026							If no argument is provided, returns a document count
1027							for the entire collection.
1028
1029							=cut
1030
1031							sub doc_count {
1032	173490			173490	1	254700	my ( $self, $term ) = @_;
1033	173490	100				301573	if ( defined $term ) {
1034	173186	50				498440	$term = _nodeify( 'T', $term ) unless $term =~ /^T:/;
1035	173186					395011	my $node = $self->{neighbors}{$term};
1036	173186	100				326908	return 0 unless defined $node;
1037	173183					199690	return scalar keys %{$node};
	173183					568622
1038							} else {
1039	304					61002	return scalar grep /D:/,
1040	304					414	keys %{ $self->{'neighbors'} };
1041							}
1042							}
1043
1044
1045							=item doc_list [TERM]
1046
1047							Returns a sorted list of document identifiers that contain
1048							TERM, in ASCII-betical order. If no argument is given,
1049							returns a sorted document list for the whole collection.
1050
1051							=cut
1052
1053							sub doc_list {
1054	4			4	1	74	my ( $self, $term ) = @_;
1055	4					5	my $t;
1056	4	50	33			16	if ( defined $term and $term !~ /T:/) {
1057	0					0	$t = _nodeify( 'T', $term );
1058							}
1059	4	50				14	my $hash = ( defined $term ?
1060							$self->{neighbors}{$t} :
1061							$self->{neighbors} );
1062
1063	136					216	sort map { s/^D://o; $_ }
	136					235
	4					862
1064	4					6	grep /^D:/, keys %{ $hash };
1065							}
1066
1067
1068							sub dump {
1069	0			0	0	0	my ( $self ) = @_;
1070	0					0	my @docs = $self->doc_list();
1071
1072	0					0	foreach my $d ( @docs ) {
1073	0					0	print $self->dump_node( $d );
1074							}
1075							}
1076
1077							=item dump_node NODE
1078
1079							Lists all of the neighbors of a node, together with edge
1080							weights connecting to them
1081
1082							=cut
1083
1084							sub dump_node {
1085	2			2	1	10	my ( $self, $node ) = @_;
1086
1087	2					4	my @lines;
1088	2					8	push @lines, join "\t", "COUNT", "WEIGHT", "NEIGHBOR";
1089
1090	2					5	foreach my $n ( keys %{ $self->{neighbors}{$node} } ) {
	2					33
1091	116					180	my $v = $self->{neighbors}{$node}{$n};
1092	116					188	my ( $weight, $count ) = split /,/, $v;
1093	116					237	push @lines, join "\t", $count, substr( $weight, 0, 8 ), $n;
1094							}
1095	2					31	return @lines;
1096							}
1097
1098
1099
1100							=item dump_tdm [FILE]
1101
1102							Dumps internal state in term-document matrix (TDM) format, which looks
1103							like this:
1104
1105							A B C B C B C
1106							A B C B C B C
1107							A B C B C B C
1108
1109							Where each row represents a document, A is the number of terms in the
1110							document, B is the term node and C is the edge weight between the doc
1111							node and B. Mostly used as a legacy format by the module author.
1112							Doc and term nodes are printed in ASCII-betical sorted order, zero-based
1113							indexing. Up to you to keep track of the ID => title mappings, neener-neener!
1114							Use doc_list and term_list to get an equivalently sorted list
1115
1116							=cut
1117
1118							sub dump_tdm {
1119	0			0	1	0	my ( $self, $file ) = @_;
1120
1121	0					0	my $counter = 0;
1122	0					0	my %lookup;
1123	0					0	$lookup{$_} = $counter++ foreach $self->term_list;
1124
1125	0					0	my @docs = $self->doc_list;
1126
1127	0					0	my $fh;
1128	0	0				0	if ( defined $file ) {
1129	0	0				0	open $fh, "> $file" or croak
1130							"Could not open TDM output file: $!";
1131							} else {
1132	0					0	fh = STDOUT;
1133							}
1134	0					0	foreach my $doc ( @docs ) {
1135	0					0	my $n = $self->{neighbors}{$doc};
1136
1137	0					0	my $row_count = scalar keys %{$n};
	0					0
1138	0					0	print $fh $row_count;
1139
1140	0					0	foreach my $t ( sort keys %{$doc} ) {
	0					0
1141	0					0	my $index = $lookup{$t};
1142	0					0	my ( $weight, undef ) = split m/,/, $n->{$t};
1143	0					0	print $fh ' ', $index, ' ', $weight;
1144							}
1145	0					0	print $fh "\n";
1146							}
1147							}
1148
1149
1150
1151							=item near_neighbors [NODE]
1152
1153							Returns a list of neighbor nodes of the same type (doc/doc, or term/term) two
1154							hops away.
1155
1156							=cut
1157
1158							sub near_neighbors {
1159	0			0	1	0	my ( $self, $name, $type ) = @_;
1160
1161	0					0	my $node = _nodeify( $type, $name );
1162
1163	0					0	my $n = $self->{neighbors}{$node};
1164
1165	0					0	my %found;
1166	0					0	foreach my $next ( keys %{$n} ) {
	0					0
1167	0					0	foreach my $mynext ( keys %{ $self->{neighbors}{$next} }){
	0					0
1168	0					0	$found{$mynext}++;
1169							}
1170							}
1171	0					0	delete $found{$node};
1172	0					0	return keys %found;
1173							}
1174
1175
1176							=item term_count [DOC]
1177
1178							Returns the number of unique terms in a document or,
1179							if no document is specified, in the entire collection.
1180
1181							=cut
1182
1183							sub term_count {
1184	9			9	1	2187	my ( $self, $doc ) = @_;
1185	9	50				34	if ( defined $doc ) {
1186	0					0	my $node = $self->{neighbors}{ _nodeify( 'D', $doc) };
1187	0	0				0	return 0 unless defined $node;
1188	0					0	return scalar keys %{$node};
	0					0
1189							} else {
1190	9					8036	return scalar grep /T:/,
1191	9					17	keys %{ $self->{neighbors} };
1192							}
1193							}
1194
1195
1196							=item term_list [DOC]
1197
1198							Returns a sorted list of unique terms appearing in the document
1199							with identifier DOC, in ASCII-betical order. If no argument is
1200							given, returns a sorted term list for the whole collection.
1201
1202							=cut
1203
1204							sub term_list {
1205	3			3	1	1345	my ( $self, $doc ) = @_;
1206
1207	3	100				16	my $node = ( defined $doc ?
1208							$self->{neighbors}{ _nodeify( 'D', $doc) } :
1209							$self->{neighbors}
1210							);
1211
1212	1782					3740	sort map { s/^T://o; $_ }
	1782					4800
	3					1120
1213	3					218	grep /^T:/, keys %{ $node };
1214							}
1215
1216
1217
1218							=item word_count [TERM]
1219
1220							Returns the total occurence count for a term, or if no argument is given,
1221							a word count for the entire collection. The word count is always greater than
1222							or equal to the term count.
1223
1224							=cut
1225
1226							sub word_count {
1227
1228	4			4	1	13	my ( $self, $term ) = @_;
1229
1230	4					12	my $n = $self->{neighbors}; # shortcut
1231
1232	4					9	my $count = 0;
1233	4					9	my @terms;
1234	4	100				17	if ( defined $term ) {
1235	3					8	push @terms, $term;
1236							} else {
1237	1					3	@terms = $self->term_list();
1238							}
1239
1240	4					138	foreach my $term (@terms ) {
1241	879	50				2252	$term = _nodeify( 'T', $term) unless $term =~/^T:/o;
1242	879					930	foreach my $doc ( keys %{ $n->{$term} } ) {
	879					2124
1243	1092					2615	( undef, my $lcount ) = split /,/, $n->{$term}{$doc};
1244	1092					2603	$count += $lcount;
1245							}
1246							}
1247
1248	4					94	return $count;
1249							}
1250
1251
1252
1253
1254
1255							=item search @QUERY
1256
1257							Searches the graph for all of the words in @QUERY. Use find_similar if you
1258							want to do a document similarity instead, or mixed_search if you want
1259							to search on any combination of words and documents. Returns a pair of hashrefs:
1260							the first a reference to a hash of docs and relevance values, the second to
1261							a hash of words and relevance values.
1262
1263							=cut
1264
1265							sub search {
1266	9			9	1	2906	my ( $self, @query ) = @_;
1267	9					27	my @nodes = _nodeify( 'T', @query );
1268	9					40	my $results = $self->raw_search( @nodes );
1269	9					28	my ($docs, $words) = _partition( $results );
1270	9					50	return ( $docs, $words);
1271							}
1272
1273
1274
1275							=item simple_search QUERY
1276
1277							This is the DWIM method - takes a query string as its argument, and returns an array
1278							of documents, sorted by relevance.
1279
1280							=cut
1281
1282							sub simple_search {
1283	1			1	1	12	my ( $self, $query ) = @_;
1284	1					3	my @words = map { s/\W+//g; lc($_) }
	1					3
	1					5
1285							split m/\s+/, $query;
1286	1					4	my @nodes = _nodeify( 'T', @words );
1287	1					4	my $results = $self->raw_search( @nodes );
1288	1					3	my ($docs, $words) = _partition( $results );
1289	1					3	my @sorted_docs = sort { $docs->{$b} <=> $docs->{$a} } keys %{$docs};
	1					4
	1					4
1290	1					7	return @sorted_docs;
1291							}
1292
1293							=item find_by_title @TITLES
1294
1295							Given a list of patterns, searches for documents with matching titles
1296
1297							=cut
1298
1299							sub find_by_title {
1300	3			3	1	1035	my ( $self, @titles ) = @_;
1301	3					4	my @found;
1302	3					9	my @docs = $self->doc_list();
1303	3					109	my $pattern = join '\|', @titles;
1304	3					66	my $match_me = qr/$pattern/i;
1305							#warn $match_me, "\n";
1306	3					6	foreach my $d ( @docs ) {
1307							# warn $d, "\n";
1308	102	100				318	push @found, $d if $d =~ $match_me;
1309							}
1310	3					27	return @found;
1311							}
1312
1313
1314							=item find_similar @DOCS
1315
1316							Given an array of document identifiers, performs a similarity search
1317							and returns a pair of hashrefs. First hashref is to a hash of docs and relevance
1318							values, second is to a hash of words and relevance values.
1319
1320							=cut
1321
1322							sub find_similar {
1323	2			2	1	2082	my ( $self, @docs ) = @_;
1324	2					6	my @nodes = _nodeify( 'D', @docs );
1325	2					6	my $results = $self->raw_search( @nodes );
1326	2					5	my ($docs, $words) = _partition( $results );
1327	2					12	return ( $docs, $words);
1328							}
1329
1330
1331							=item merge TYPE, GOOD, @BAD
1332
1333							Combine all the nodes in @BAD into the node with identifier GOOD.
1334							First argument must be one of 'T' or 'D' to indicate term or
1335							document nodes. Used to combine synonyms in the graph.
1336
1337							=cut
1338
1339							sub merge {
1340	2			2	1	1349	my ( $self, $type, $good, @bad ) = @_;
1341	2	50				14	croak "must provide a type argument to merge"
1342							unless defined $type;
1343	2	50				11	croak "Invalid type argument $type to merge [must be one of (D,T)]"
1344							unless $type =~ /^[DT]/io;
1345
1346	2					6	my $target = _nodeify( $type, $good );
1347	2					5	my @sources = _nodeify( $type, @bad );
1348
1349	2					7	my $tnode = $self->{neighbors}{$target};
1350
1351
1352	2					3	foreach my $bad_node ( @sources ) {
1353							#print "Examining $bad_node\n";
1354	2	50				6	next if $bad_node eq $target;
1355	2					4	my %neighbors = %{$self->{neighbors}{$bad_node}};
	2					20
1356
1357	2					9	foreach my $n ( keys %neighbors ) {
1358
1359							#print "\t $target ($bad_node) neighbor $n\n";
1360	14	100				35	if ( exists $self->{neighbors}{$target}{$n} ) {
1361							#print "\t\t$n has link to $bad_node\n";
1362							# combine the local counts for the term members of the edge
1363	5					9	my $curr_val = $tnode->{$n};
1364	5					11	my $aug_val = $self->{neighbors}{$bad_node}{$n};
1365	5					14	my ($w1, $c1) = split m/,/, $curr_val;
1366	5					11	my ($w2, $c2) = split m/,/, $aug_val;
1367	5					8	my $new_count = $c1 + $c2;
1368	5					25	$curr_val =~ s/,\d+$/,$new_count/;
1369	5					14	$tnode->{$n} = $curr_val;
1370
1371
1372							} else {
1373
1374	9	50				21	die "sanity check failed for existence test"
1375							if exists $self->{neighbors}{$target}{$n};
1376
1377	9					15	my $val = $self->{neighbors}{$bad_node}{$n};
1378
1379							#print "\tno existing link -- reassigning $target -- $n\n";
1380							# reassign the current value of this edge
1381
1382	9					22	$self->{neighbors}{$n}{$target} = $val;
1383	9					20	$self->{neighbors}{$target}{$n} = $val;
1384							}
1385
1386	14					23	delete $self->{neighbors}{$bad_node}{$n};
1387	14					35	delete $self->{neighbors}{$n}{$bad_node};
1388							}
1389	2					14	delete $self->{neighbors}{$bad_node};
1390							}
1391							}
1392
1393							=item mixed_search @DOCS
1394
1395							Given a hashref in the form:
1396							{ docs => [ 'Title 1', 'Title 2' ],
1397							terms => ['buffalo', 'fox' ], }
1398							}
1399							Runs a combined search on the terms and documents provided, and
1400							returns a pair of hashrefs. The first hashref is to a hash of docs
1401							and relevance values, second is to a hash of words and relevance values.
1402
1403							=cut
1404
1405							sub mixed_search {
1406	0			0	1	0	my ( $self, $incoming ) = @_;
1407
1408	0	0	0			0	croak "must provide hash ref to mixed_search method"
			0
1409							unless defined $incoming &&
1410							ref( $incoming ) &&
1411							ref( $incoming ) eq 'HASH';
1412
1413	0		0			0	my $tref = $incoming->{'terms'} \|\| [];
1414	0		0			0	my $dref = $incoming->{'docs'} \|\| [];
1415
1416	0					0	my @dnodes = _nodeify( 'D', @{$dref} );
	0					0
1417	0					0	my @tnodes = _nodeify( 'T', @{$tref} );
	0					0
1418
1419	0					0	my $results = $self->raw_search( @dnodes, @tnodes );
1420	0					0	my ($docs, $words) = _partition( $results );
1421	0					0	return ( $docs, $words);
1422							}
1423
1424
1425							=item store FILENAME
1426
1427							Stores the object to a file for later use. Not compatible (yet)
1428							with compiled XS version, which will give a fatal error.
1429
1430							=cut
1431
1432							sub store {
1433	0			0	1	0	my ( $self, @args ) = @_;
1434	0	0				0	if ( $self->{'xs'} ) {
1435	0					0	croak "Cannot store object when running in XS mode.";
1436							} else {
1437	0					0	$self->SUPER::nstore(@args);
1438							}
1439							}
1440
1441
1442							# Partition - internal method.
1443							# Takes a result set and splits it into two hashrefs - one for
1444							# words and one for documents
1445
1446							sub _partition {
1447	12			12		19	my ( $e ) = @_;
1448	12					17	my ( $docs, $words );
1449	12					16	foreach my $k ( sort { $e->{$b} <=> $e->{$a} }
	209					277
	12					83
1450							keys %{ $e } ) {
1451
1452	81					219	(my $name = $k ) =~ s/^[DT]://o;
1453	81	100				285	$k =~ /^D:/ ?
1454							$docs->{$name} = $e->{$k} :
1455							$words->{$name} = $e->{$k} ;
1456							}
1457	12					37	return ( $docs, $words );
1458							}
1459
1460							# return a list of all neighbor nodes
1461							sub _neighbors {
1462	4			4		8	my ( $self, $node ) = @_;
1463	4	50				13	return unless exists $self->{neighbors}{$node};
1464	4					5	return keys %{ $self->{neighbors}{$node} };
	4					44
1465							}
1466
1467
1468							sub _nodeify {
1469	188046			188046		383716	my ( $prefix, @list ) = @_;
1470	188046					206997	my @nodes;
1471	188046					275645	foreach my $item ( @list ) {
1472	188046					531772	push @nodes, uc($prefix).':'.$item;
1473							}
1474	188046	100				578801	( wantarray ? @nodes : $nodes[0] );
1475							}
1476
1477
1478
1479							sub _read_tdm {
1480	0			0		0	my ( $self, $file ) = @_;
1481	0	0				0	print "Loading TDM...\n" if $self->{'debug'} > 1;
1482
1483	0	0				0	croak "File does not exist" unless -f $file;
1484	0	0				0	open my $fh, $file or croak "Could not open $file: $!";
1485	0					0	for ( 1..4 ){
1486	0					0	my $skip = <$fh>;
1487							}
1488	0					0	my %neighbors;
1489	0					0	my $doc = 0;
1490
1491
1492							######### XS VERSION ##############
1493	0	0				0	if ( $self->{'xs'} ) {
1494
1495	0					0	my $map = $self->{'node_map'}; # shortcut alias
1496	0					0	while (<$fh>) {
1497	0					0	chomp;
1498	0					0	my $dindex = $self->_add_node( "D:$doc", 2 );
1499							#warn "Added node $doc\n";
1500	0					0	my ( $count, %vals ) = split;
1501	0					0	while ( my ( $term, $edge ) = each %vals ) {
1502	0					0	$self->{'term_count'}{$term}++;
1503	0					0	my $tnode = "T:$term";
1504
1505	0	0				0	my $tindex = ( defined $map->{$tnode} ?
1506							$map->{$tnode} :
1507							$self->_add_node( $tnode, 1 )
1508							);
1509	0					0	$self->{Graph}->set_edge( $dindex, $tindex, $edge );
1510							}
1511	0					0	$doc++;
1512							}
1513
1514							####### PURE PERL VERSION ##########
1515							} else {
1516	0					0	while (<$fh>) {
1517	0					0	chomp;
1518	0					0	my $dnode = "D:$doc";
1519	0					0	my ( $count, %vals ) = split;
1520	0					0	while ( my ( $term, $edge ) = each %vals ) {
1521	0					0	$self->{'term_count'}{$term}++;
1522	0					0	my $tnode = "T:$term";
1523
1524	0					0	$neighbors{$dnode}{$tnode} = $edge.',1';
1525	0					0	$neighbors{$tnode}{$dnode} = $edge.',1';
1526							}
1527	0					0	$doc++;
1528							}
1529	0					0	$self->{'neighbors'} = \%neighbors;
1530							}
1531
1532	0	0				0	print "Loaded.\n" if $self->{'debug'} > 1;
1533	0					0	$self->{'from_TDM'} = 1;
1534	0					0	$self->{'doc_count'} = $doc;
1535							}
1536
1537
1538
1539							# XS version only
1540							#
1541							# This sub maintains a mapping between node names and integer index
1542							# values.
1543
1544							sub _add_node {
1545	0			0		0	my ( $self, $node_name, $type ) = @_;
1546	0	0				0	croak "Must provide a type" unless $type;
1547	0	0				0	croak "Must provide a node name" unless $node_name;
1548	0	0				0	croak "This node already exists" if
1549							$self->{'node_map'}{$node_name};
1550
1551	0					0	my $new_id = $self->{'next_free_id'}++;
1552	0					0	$self->{'node_map'}{$node_name} = $new_id;
1553	0					0	$self->{'id_map'}[$new_id] = $node_name;
1554	0					0	$self->{'Graph'}->add_node( $new_id, $type );
1555
1556	0					0	return $new_id;
1557							}
1558
1559
1560
1561							#
1562							# INTERNAL METHODS
1563							#
1564
1565							# each node should have the same number of inbound
1566							# and outbound links
1567
1568							sub check_consistency {
1569
1570	37			37	0	60	my ( $self ) = @_;
1571	37					57	my %inbound;
1572							my %outbound;
1573
1574
1575	37					62	foreach my $node ( keys %{$self->{neighbors}} ) {
	37					3748
1576	17194	50				48162	next unless $node =~ /^[DT]:/; # for MLDBM compatibility
1577	17194					18401	$outbound{$node} = scalar keys %{$self->{neighbors}{$node}};
	17194					40965
1578	17194					19908	foreach my $neighbor ( keys %{ $self->{neighbors}{$node} } ) {
	17194					43491
1579	39868					74583	$inbound{$neighbor}++;
1580							}
1581							}
1582
1583	37					1361	my $in = scalar keys %inbound;
1584	37					76	my $out = scalar keys %outbound;
1585	37	50				128	carp "number of nodes with inbound links ($in) does not match number of nodes with outbound links ( $out )"
1586							unless scalar keys %inbound == scalar keys %outbound;
1587
1588	37					2195	foreach my $node ( keys %inbound ) {
1589	17194		50			28993	$outbound{$node} \|\|= 0;
1590	17194	50				37030	carp "$node has $inbound{$node} inbound links, $outbound{$node} outbound links\n"
1591							unless $inbound{$node} == $outbound{$node};
1592							}
1593
1594							}
1595
1596
1597							=item have_edge RAWNODE1, RAWNODE2
1598
1599							Returns true if the nodes share an edge. Node names must be prefixed with 'D' or 'T'
1600							as appropriate.
1601
1602							=cut
1603
1604							sub have_edge {
1605	0			0	1	0	my ( $self, $node1, $node2 ) = @_;
1606	0					0	return exists $self->{neighbors}{$node1}{$node2};
1607							}
1608
1609
1610							{
1611
1612							my %visited;
1613							my %component;
1614							my $depth;
1615
1616							=item connected_components
1617
1618							Returns an array of connected components in the graph. Each component is a list
1619							of nodes that are mutually accessible by traveling along edges.
1620
1621							=cut
1622
1623							sub connected_components {
1624	0			0	1	0	my ( $self ) = @_;
1625
1626	0					0	%visited = (); # clear any old info
1627	0					0	%component = ();
1628
1629
1630	0					0	my $n = $self->{neighbors};
1631
1632
1633	0					0	my @node_list = keys %{$n};
	0					0
1634	0					0	my @components;
1635
1636	0					0	while ( @node_list ) {
1637	0					0	my $start = shift @node_list;
1638	0	0				0	next if exists $visited{$start};
1639
1640	0	0				0	last unless $start;
1641	0					0	warn "Visiting neighbors for $start\n";
1642	0					0	visit_neighbors( $n, $start );
1643	0					0	push @components, [ keys %component ];
1644	0					0	%component = ();
1645							}
1646
1647	0					0	warn "Found ", scalar @components, " connected components\n";
1648	0					0	return @components;
1649
1650
1651							}
1652
1653							sub visit_neighbors {
1654	0			0	0	0	my ( $g, $l ) = @_;
1655	0	0				0	return if $visited{$l};
1656	0					0	$depth++;
1657	0					0	$visited{$l}++; $component{$l}++;
	0					0
1658	0					0	warn $depth, " $l\n";
1659	0					0	my @neigh = keys %{ $g->{$l} };
	0					0
1660	0					0	foreach my $n ( @neigh ) {
1661	0					0	visit_neighbors( $g, $n );
1662							}
1663	0					0	$depth--;
1664							}
1665							}
1666
1667
1668							# Wipe the graph free of stored energies
1669
1670							sub _clear {
1671	12			12		18	my ( $self ) = @_;
1672	12					31	$self->{'energy'} = undef;
1673							}
1674
1675
1676							# Gather the stored energy values from the graph
1677
1678							sub _collect {
1679	12			12		15	my ( $self ) = @_;
1680	12					17	my $e = $self->{'energy'};
1681	12					20	my $result = {};
1682	12					18	foreach my $k ( keys %{$self->{'energy'}} ) {
	12					40
1683	81	50				159	next unless $e->{$k} > $self->{'COLLECT_THRESHOLD'};
1684	81					130	$result->{$k} = $e->{$k};
1685							}
1686	12					28	return $result;
1687							}
1688
1689
1690
1691
1692							# Assign a starting energy ENERGY to NODE, and recursively distribute the
1693							# energy to neighbor nodes. Singleton nodes get special treatment
1694
1695							sub _energize {
1696
1697	81			81		113	my ( $self, $node, $energy ) = @_;
1698
1699
1700	81	50				206	return unless defined $self->{neighbors}{$node};
1701	81		50			340	my $orig = $self->{energy}{$node} \|\| 0;
1702	81					154	$self->{energy}->{$node} += $energy;
1703	81	50				180	return if ( $self->{depth} == $self->{max_depth} );
1704	81					83	$self->{depth}++;
1705
1706	81	50				189	if ( $self->{'debug'} > 1 ) {
1707	0					0	print ' ' x $self->{'depth'};
1708	0					0	print "$node: energizing $orig + $energy\n";
1709							}
1710
1711
1712	81					98	my $n = $self->{neighbors};
1713
1714							#sleep 1;
1715	81					78	my $degree = scalar keys %{ $n->{$node} };
	81					138
1716
1717
1718	81	50				147	if ( $degree == 0 ) {
1719
1720	0					0	carp "WARNING: reached a node without neighbors: $node at search depth $self->{depth}\n";
1721	0					0	$self->{depth}--;
1722	0					0	return;
1723							}
1724
1725
1726	81					152	my $subenergy = $energy / (log($degree)+1);
1727
1728
1729							# At singleton nodes (words that appear in only one document, for example)
1730							# Don't spread energy any further. This avoids a "reflection" back and
1731							# forth from singleton nodes to their neighbors.
1732
1733	81	100	100			298	if ( $degree == 1 and $energy < $self->{'START_ENERGY'} ) {
		100
1734
1735							#do nothing
1736
1737							} elsif ( $subenergy > $self->{ACTIVATE_THRESHOLD} ) {
1738	12	50				33	print ' ' x $self->{'depth'},
1739							"$node: propagating subenergy $subenergy to $degree neighbors\n"
1740							if $self->{'debug'} > 1;
1741	12					15	foreach my $neighbor ( keys %{ $n->{$node} } ) {
	12					44
1742	69					148	my $pair = $n->{$node}{$neighbor};
1743	69					166	my ( $edge, undef ) = split /,/, $pair;
1744	69					149	my $weighted_energy = $subenergy * $edge;
1745	69	50				135	print ' ' x $self->{'depth'},
1746							" edge $edge ($node, $neighbor)\n"
1747							if $self->{'debug'} > 1;
1748	69					140	$self->_energize( $neighbor, $weighted_energy );
1749							}
1750							}
1751	81					101	$self->{'depth'}--;
1752	81					153	return 1;
1753							}
1754
1755
1756							# Given an array, normalize using cosine normalization
1757
1758							sub __normalize {
1759	5630			5630		7511	my ( $arr ) = @_;
1760
1761	5630	50	33			41861	croak "Must provide array ref to __normalize" unless
			33
1762							defined $arr and
1763							ref $arr and
1764							ref $arr eq 'ARRAY';
1765
1766	5630					6060	my $sum;
1767	5630					6009	$sum += $_->[2] foreach @{$arr};
	5630					81879
1768	5630					8577	$_->[2]/= $sum foreach @{$arr};
	5630					68705
1769	5630					15044	return 1;
1770							}
1771
1772
1773
1774
1775							sub DESTROY {
1776	15			15		35508	undef $_[0]->{Graph}
1777							}
1778
1779							1;
1780
1781							__END__