File Coverage

blib/lib/Search/ContextGraph.pm
Criterion Covered Total %
statement 383 649 59.0
branch 100 250 40.0
condition 25 66 37.8
subroutine 46 71 64.7
pod 35 50 70.0
total 589 1086 54.2


line stmt bran cond sub pod time code
1             package Search::ContextGraph;
2              
3 13     13   325052 use strict;
  13         32  
  13         445  
4 13     13   71 use warnings;
  13         40  
  13         383  
5 13     13   69 use Carp;
  13         27  
  13         1223  
6 13     13   94 use base "Storable";
  13         22  
  13         16539  
7 13     13   62118 use File::Find;
  13         29  
  13         1383  
8 13     13   27029 use IO::Socket;
  13         523789  
  13         73  
9              
10             our $VERSION = '0.15';
11              
12              
13             my $count = 0;
14              
15              
16             =head1 NAME
17              
18             Search::ContextGraph - spreading activation search engine
19              
20             =head1 SYNOPSIS
21              
22             use Search::ContextGraph;
23              
24             my $cg = Search::ContextGraph->new();
25              
26             # first you add some documents, perhaps all at once...
27            
28             my %docs = (
29             'first' => [ 'elephant', 'snake' ],
30             'second' => [ 'camel', 'pony' ],
31             'third' => { 'snake' => 2, 'constrictor' => 1 },
32             );
33              
34             $cg->bulk_add( %docs );
35            
36             # or in a loop...
37            
38             foreach my $title ( keys %docs ) {
39             $cg->add( $title, $docs{$title} );
40             }
41              
42             # or from a file...
43              
44             my $cg = Search::ContextGraph->load_from_dir( "./myfiles" );
45              
46             # you can store a graph object for later use
47            
48             $cg->store( "stored.cng" );
49            
50             # and retrieve it later...
51            
52             my $cg = ContextGraph->retrieve( "stored.cng" );
53            
54            
55             # SEARCHING
56            
57             # the easiest way
58              
59             my @ranked_docs = $cg->simple_search( 'peanuts' );
60              
61              
62             # get back both related terms and docs for more power
63              
64             my ( $docs, $words ) = $cg->search('snake');
65              
66              
67             # you can use a document as your query
68              
69             my ( $docs, $words ) = $cg->find_similar('First Document');
70              
71              
72             # Or you can query on a combination of things
73              
74             my ( $docs, $words ) =
75             $cg->mixed_search( { docs => [ 'First Document' ],
76             terms => [ 'snake', 'pony' ]
77             );
78              
79              
80             # Print out result set of returned documents
81             foreach my $k ( sort { $docs->{$b} <=> $docs->{$a} }
82             keys %{ $docs } ) {
83             print "Document $k had relevance ", $docs->{$k}, "\n";
84             }
85              
86              
87              
88             # Reload it
89             my $new = Search::ContextGraph->retrieve( "filename" );
90              
91              
92              
93             =head1 DESCRIPTION
94              
95             Spreading activation is a neat technique for building search engines that
96             return accurate results for a query even when there is no exact keyword match.
97             The engine works by building a data structure called a B, which
98             is a giant network of document and term nodes. All document nodes are connected
99             to the terms that occur in that document; similarly, every term node is connected
100             to all of the document nodes that term occurs in. We search the graph by
101             starting at a query node and distributing a set amount of energy to its neighbor
102             nodes. Then we recurse, diminishing the energy at each stage, until this
103             spreading energy falls below a given threshold. Each node keeps track of
104             accumulated energy, and this serves as our measure of relevance.
105              
106             This means that documents that have many words in common will appear similar to the
107             search engine. Likewise, words that occur together in many documents will be
108             perceived as semantically related. Especially with larger, coherent document
109             collections, the search engine can be quite effective at recognizing synonyms
110             and finding useful relationships between documents. You can read a full
111             description of the algorithm at L.
112              
113             The search engine gives expanded recall (relevant results even when there is no
114             keyword match) without incurring the kind of computational and patent issues
115             posed by latent semantic indexing (LSI). The technique used here was originally
116             described in a 1981 dissertation by Scott Preece.
117              
118             =head1 CONSTRUCTORS
119              
120             =over
121              
122             =item new %PARAMS
123              
124             Object constructor. Possible parameters:
125              
126             =over
127              
128             =item auto_reweight
129              
130             Rebalance the graph every time a change occurs. Default is true.
131             Disable and do by hand using L for better performance in
132             graphs with frequent updates/additions/deletions.
133              
134              
135             =item debug LEVEL
136              
137             Set this to 1 or 2 to turn on verbose debugging output
138              
139             =item max_depth
140              
141             Set the maximum distance to spread energy out from the start
142             node. Default is effectively unlimited. You can tweak it using L.
143             Comes in handy if you find searches are too slow.
144              
145             =item xs
146              
147             When true, tells the module to use compiled C internals. This reduces
148             memory requirements by about 60%, but actually runs a little slower than the
149             pure Perl version. Don't bother to turn it on unless you have a huge graph.
150             Default is pure Perl.
151              
152             =over
153              
154             =item * using the compiled version makes it impossible to store the graph to disk.
155              
156             =item * xs is B in version 0.09. But it will return in triumph!
157              
158             =back
159              
160             =item START_ENERGY
161              
162             Initial energy to assign to a query node. Default is 100.
163              
164             =item ACTIVATE_THRESHOLD
165              
166             Minimal energy needed to propagate search along the graph. Default is 1.
167              
168             =item COLLECT_THRESHOLD
169              
170             Minimal energy needed for a node to enter the result set. Default is 1.
171              
172             =back
173              
174             =cut
175              
176              
177             sub new {
178 15     15 1 212 my ( $class, %params) = @_;
179              
180             # backwards compatible...
181 15         75 *add_document = \&add;
182 15         44 *add_documents = \&bulk_add;
183            
184             # plucene friendly
185 15         42 *optimize = \&reweight_graph;
186 15         66 *is_indexed = \&has_doc;
187            
188             # fail on all unknown paramters (helps fight typos)
189 15         82 my @allowed = qw/debug auto_reweight use_global_weights max_depth START_ENERGY ACTIVATE_THRESHOLD COLLECT_THRESHOLD use_file xs/;
190 15         28 my %check;
191 15         161 $check{$_}++ foreach @allowed;
192            
193 15         34 my @forbidden;
194 15         55 foreach my $k ( keys %params ) {
195 12 50       52 push @forbidden, $k unless exists $check{$k};
196             }
197 15 50       57 if ( @forbidden ) {
198 0         0 croak "The following unrecognized parameters were detected: ",
199             join ", ", @forbidden;
200             }
201              
202              
203 15         167 my $obj = bless
204             { debug => 0,
205             auto_reweight => 1,
206             use_global_weights => 1,
207             max_depth => 100000000,
208             START_ENERGY => 100,
209             ACTIVATE_THRESHOLD => 1,
210             COLLECT_THRESHOLD => .2,
211             %params,
212              
213             depth => 0,
214             neighbors => {},
215              
216             },
217             $class;
218            
219            
220 15 100       180 if ( $obj->{use_file} ) {
221 1         3 my %neighbors;
222 13     13   24279 use MLDBM qw/DB_File Storable/;
  13         51331  
  13         100  
223 13     13   518 use Fcntl;
  13         31  
  13         138808  
224 1         165 warn "Using MLDBM: $obj->{use_file}";
225 1 50       12 $obj->{neighbors} = tie %neighbors, 'MLDBM', $obj->{use_file} or die $!;
226             #$obj->{neighbors} = \%neighbors;
227            
228            
229             }
230              
231 14         79 return $obj;
232              
233             }
234              
235              
236             =item load_from_dir DIR [, \&PARSE ]
237              
238             Load documents from a directory. Takes two arguments, a directory path
239             and an optional parsing subroutine. If the parsing subroutine is passed
240             an argument, it will use it to extract term tokens from the file.
241             By default, the file is split on whitespace and stripped of numbers and
242             punctuation.
243              
244             =cut
245              
246             {
247             my $parse_sub;
248              
249             sub load_from_dir {
250 0     0 1 0 my ( $class, $dir, $code ) = @_;
251              
252 0 0       0 croak "$dir is not a directory" unless -d $dir;
253              
254 0         0 require File::Find;
255 0 0 0     0 unless ( defined $code
      0        
256             and ref $code
257             and ref $code eq 'CODE' ) {
258             $code = sub {
259 0     0   0 my $text = shift;
260 0         0 $text =~ s/[^\w]/ /gs;
261 0         0 my @toks = split /\s+/m, $text;
262 0         0 return grep { length($_) > 1 } @toks;
  0         0  
263 0         0 };
264             }
265              
266 0         0 $parse_sub = $code;
267 0         0 my %docs;
268              
269             # Recursively open every file and provide the contents
270             # to whatever parsing subroutine we're using
271              
272             my $reader =
273              
274             sub {
275 0     0   0 my ( $parse ) = @_;
276 0 0       0 return if /^\./;
277 0 0       0 return unless -f $_;
278 0 0       0 open my $fh, $_ or
279             croak "Could not open file $File::Find::name: $!";
280 0         0 local $/;
281 0         0 my $contents = <$fh>;
282 0 0       0 close $fh or croak "failed to close filehandle";
283 0         0 my @words = $parse_sub->($contents);
284 0         0 $docs{ $File::Find::name } = \@words;
285 0         0 };
286              
287              
288 0         0 find( $reader , $dir );
289 0         0 my $self = __PACKAGE__->new();
290 0         0 $self->bulk_add( %docs );
291 0         0 return $self;
292             }
293             }
294              
295              
296              
297             =item load_from_tdm FILENAME
298              
299             Opens and loads a term-document matrix (TDM) file to initialize the graph.
300             The TDM encodes information about term-to-document links.
301             This is a legacy method mainly for the convenience of the module author.
302             For notes on the proper file format, see the README file.
303             =cut
304              
305             sub load_from_tdm {
306 0     0 1 0 my ( $self, $file ) = @_;
307 0 0       0 croak "TDM file $file does not exist" unless -f $file;
308 0 0       0 return if $self->{'loaded'};
309 0         0 $self->_read_tdm( $file );
310 0         0 $self->{'loaded'} = 1;
311 0         0 $self->reweight_graph();
312             }
313              
314              
315             =item rename OLD, NEW
316              
317             Renames a document. Will return undef if the new name is already in use.
318              
319             =cut
320             sub rename {
321              
322 0     0 1 0 my ( $self, $old, $new ) = @_;
323 0 0 0     0 croak "rename method needs two arguments" unless
324             defined $old and defined $new;
325 0 0       0 croak "document $old not found" unless
326             exists $self->{neighbors}{ _nodeify('D', $old ) };
327            
328 0         0 my $bad = _nodeify( 'D', $old );
329 0         0 my $good = _nodeify( 'D', $new );
330            
331 0 0       0 return if exists $self->{neighbors}{$good};
332            
333 0         0 my $s = $self->{neighbors};
334 0         0 foreach my $n ( keys %{ $s->{$bad} } ) {
  0         0  
335 0         0 $s->{$good}{$n} =
336             $s->{$n}{$good} =
337             $s->{$bad}{$n};
338 0         0 delete $s->{$bad}{$n};
339 0         0 delete $s->{$n}{$bad};
340             }
341 0         0 delete $s->{$bad};
342 0         0 return 1;
343              
344             }
345              
346              
347              
348             =item retrieve FILENAME
349              
350             Loads a previously stored graph from disk, using Storable.
351              
352             =cut
353              
354             sub retrieve {
355 0     0 1 0 my ( $self, $file ) = @_;
356 0 0       0 croak "Must provide a filename to retrieve graph"
357             unless $file;
358 0 0       0 croak "'$file' is not a file" unless
359             -f $file;
360              
361 0         0 Storable::retrieve( $file );
362             }
363              
364              
365             =back
366              
367             =head1 ACCESSORS
368              
369             =over
370              
371             =item [get|set]_activate_threshold
372              
373             Accessor for node activation threshold value. This value determines how far
374             energy can spread in the graph. Lower it to increase the number of results.
375             Default is 1.
376              
377             =cut
378              
379 2     2 0 1147 sub get_activate_threshold { $_[0]->{'ACTIVATE_THRESHOLD'} }
380             sub set_activate_threshold {
381 3     3 0 818 my ( $self, $threshold ) = @_;
382 3 100       123 croak "Can't set activate threshold to zero"
383             unless $threshold;
384 2 100       110 croak "Can't set activate threshold to negative value"
385             unless $threshold > 0;
386 1         6 $self->{'ACTIVATE_THRESHOLD'} = $_[1];
387             }
388              
389              
390             =item [get|set]_auto_reweight
391              
392             Accessor for auto reweight flag. If true, edge weights will be recalculated
393             every time a document is added, updated or removed. This can significantly slow
394             down large graphs. On by default.
395              
396             =cut
397              
398 0     0 0 0 sub get_auto_reweight{ $_[0]->{auto_reweight} }
399 0     0 0 0 sub set_auto_reweight{ $_[0]->{auto_reweight} = $_[0]->[1]; }
400              
401              
402             =item [get|set]_collect_threshold
403              
404             Accessor for collection threshold value. This determines how much energy a
405             node must have to make it into the result set. Lower it to increase the
406             number of results. Default is 1.
407              
408             =cut
409              
410             sub get_collect_threshold {
411 2 50   2 0 21 return ( $_[0]->{'xs'} ?
412             $_[0]->{Graph}->collectionThreshold :
413             $_[0]->{'COLLECT_THRESHOLD'})
414             }
415              
416             sub set_collect_threshold {
417 2     2 0 5 my ( $self, $newval ) = @_;
418              
419 2   100     10 $newval ||=0;
420              
421 2 50       7 $self->{Graph}->collectionThreshold( $newval )
422             if $self->{'xs'};
423              
424 2   100     10 $self->{'COLLECT_THRESHOLD'} = $newval || 0;
425 2         6 return 1;
426             }
427              
428             =item [get|set]_debug_mode LEVEL
429              
430             Turns debugging on or off. 1 is verbose, 2 is very verbose, 0 is off.
431              
432             =cut
433              
434 0     0 0 0 sub get_debug_mode { $_[0]->{debug} }
435             sub set_debug_mode {
436 0     0 0 0 my ( $self, $mode ) = @_;
437 0         0 $self->{'debug'} = $mode;
438             }
439              
440              
441              
442             =item [get|set]_initial_energy
443              
444             Accessor for initial energy value at the query node. This controls how
445             much energy gets poured into the graph at the start of the search.
446             Increase this value to get more results from your queries.
447              
448             =cut
449              
450 2     2 0 15 sub get_initial_energy { $_[0]->{'START_ENERGY'} }
451             sub set_initial_energy {
452 2     2 0 6 my ( $self, $start_energy ) = @_;
453 2 50       9 croak "Can't set initial energy to zero"
454             unless $start_energy;
455 2 100       115 croak "Can't set initial energy to negative value"
456             unless $start_energy > 0;
457 1         30 $self->{'START_ENERGY'} = $start_energy ;
458             }
459              
460             =item [get|set]_max_depth LEVEL
461              
462             You can tell the graph to cut off searches after a certain distance from
463             the query node. This can speed up searches on very large graphs, and has
464             little adverse effect, especially if you are interested in just the first
465             few search results. Set this value to undef to restore the default (10^8).
466              
467             =cut
468              
469 3     3 0 450 sub get_max_depth { $_[0]->{max_depth} }
470 3 100   3 0 215 sub set_max_depth { croak "Tried to set maximum depth to an undefined value"
471             unless defined $_[1];
472 2   100     19 $_[0]->{max_depth} = $_[1] || 100000000
473             }
474              
475              
476              
477              
478             =back
479              
480             =head1 METHODS
481              
482             =over
483              
484             =item add DOC, WORDS
485              
486             Add a document to the search engine. Takes as arguments a unique doc
487             identifier and a reference to an array or hash of words in the
488             document.
489             For example:
490              
491             TITLE => { WORD1 => COUNT1, WORD2 => COUNT2 ... }
492              
493             or
494              
495             TITLE => [ WORD1, WORD2, WORD3 ]
496              
497             Use L if you want to pass in a bunch of docs all at once.
498              
499             =cut
500              
501              
502             sub add {
503              
504 420     420 1 28619 my ( $self, $title, $words ) = @_;
505              
506              
507 420 50       1228 croak "Please provide a word list" unless defined $words;
508 420 50 66     3263 croak "Word list is not a reference to an array or hash"
      66        
509             unless ref $words and ref $words eq "HASH" or ref $words eq "ARRAY";
510              
511 420 50       934 croak "Please provide a document identifier" unless defined $title;
512              
513 420         853 my $dnode = _nodeify( 'D', $title );
514 420 50       1461 croak "Tried to add document with duplicate identifier: '$title'\n"
515             if exists $self->{neighbors}{$dnode};
516              
517 420         668 my @list;
518 420 100       1104 if ( ref $words eq 'ARRAY' ) {
519 419         489 @list = @{$words};
  419         3733  
520             } else {
521 1         3 @list = keys %{$words};
  1         46  
522             }
523              
524 420 50       1048 croak "Tried to add a document with no content" unless scalar @list;
525              
526 420         493 my @edges;
527 420         905 foreach my $term ( @list ) {
528 13489         27818 my $tnode = _nodeify( 'T', lc( $term ) );
529              
530             # Local weight for the document
531 13489 100       27162 my $lcount = ( ref $words eq 'HASH' ? $words->{$term} : 1 );
532              
533             # Update number of docs this word occurs in
534 13489         35123 my $gcount = ++$self->{term_count}{lc( $term )};
535              
536 13489         13877 my $final_weight = 1;
537 13489         33581 push @edges, [ $dnode, $tnode, $final_weight, $lcount ];
538              
539             }
540 420         787 $self->{reweight_flag} = 1;
541 420         908 __normalize( \@edges );
542              
543              
544             =cut
545              
546             DEVELOPMENT
547              
548             if ( $self->{supersize} ) {
549             my $n = $self->{neighbors};
550             foreach my $e ( @edges ) {
551             #warn "adding edge $e->[0], $e->[1]\n";
552            
553             $n->{$e->[0]} = {} unless exists $n->{$e->[0]};
554             $n->{$e->[1]} = {} unless exists $n->{$e->[1]};
555            
556             my $tmp = $n->{$e->[0]};
557             $tmp->{$e->[1]} = join ',', $e->[2], $e->[3];
558             $tmp = $n->{$e->[1]};
559             $tmp->{$e->[0]} = join ',', $e->[2], $e->[3];
560             }
561             =cut
562              
563            
564             # PURE PERL VERSION
565             #} else {
566 420         698 foreach my $e ( @edges ) {
567 13489         72348 $self->{neighbors}{$e->[0]}{$e->[1]} = join ',', $e->[2], $e->[3];
568 13489         75829 $self->{neighbors}{$e->[1]}{$e->[0]} = join ',', $e->[2], $e->[3];
569             }
570             #}
571            
572            
573             #print "Reweighting graph\n";
574 420 100       2151 $self->reweight_graph() if $self->{auto_reweight};
575 420         15729 return 1;
576              
577             }
578              
579              
580             =item add_file PATH [, name => NAME, parse => CODE]
581              
582             Adds a document from a file. By default, uses the PATH provided as the document
583             identifier, and parses the file by splitting on whitespace. If a fancier title,
584             or more elegant parsing behavior is desired, pass in named arguments as indicated.
585             NAME can be any string, CODE should be a reference to a subroutine that takes one
586             argument (the contents of the file) and returns an array of tokens, or a hash in the
587             form TOKEN => COUNT, or a reference to the same.
588              
589             =cut
590              
591             sub add_file {
592 0     0 1 0 my ( $self, $path, %params ) = @_;
593            
594 0 0 0     0 croak "Invalid file '$path' provided to add_file method."
595             unless defined $path and -f $path;
596            
597 0 0       0 my $title = ( exists $params{name} ? $params{name} : $path );
598              
599 0         0 local $/;
600 0 0       0 open my $fh, $path or croak "Unable to open $path: $!";
601 0         0 my $content = <$fh>;
602            
603 0         0 my $ref;
604            
605 0 0       0 if ( exists $params{parse} ) {
606 0 0       0 croak "code provided is not a reference" unless
607             ref $params{parse};
608 0 0       0 croak "code provided is not a subroutine" unless
609             ref $params{parse} eq 'CODE';
610            
611 0         0 $ref = $params{parse}->( $content );
612 0 0 0     0 croak "did not get an appropriate reference back after parsing"
613             unless ref $ref and ref $ref =~ /(HASH|ARRAY)/;
614            
615            
616             } else {
617            
618             my $code = sub {
619 0     0   0 my $txt = shift;
620 0         0 $txt =~ s/\W/ /g;
621 0         0 my @toks = split m/\s+/, $txt;
622 0         0 \@toks;
623 0         0 };
624 0         0 $ref = $code->($content);
625             }
626            
627 0 0       0 return unless $ref;
628 0         0 $self->add( $title, $ref );
629            
630             }
631              
632             =item bulk_add DOCS
633              
634             Add documents to the graph in bulk. Takes as an argument a hash
635             whose keys are document identifiers, and values are references
636             to hashes in the form { WORD1 => COUNT, WORD2 => COUNT...}
637             This method is faster than adding in documents one by one if
638             you have auto_rebalance turned on.
639              
640             =cut
641              
642             sub bulk_add {
643              
644 0     0 1 0 my ( $self, %incoming_docs ) = @_;
645              
646             # Disable graph rebalancing until we've added everything
647             {
648 0         0 local $self->{auto_reweight} = 0;
  0         0  
649              
650 0         0 foreach my $doc ( keys %incoming_docs ) {
651 0         0 $self->add( $doc, $incoming_docs{$doc});
652             }
653             }
654 0 0       0 $self->reweight_graph() if $self->{auto_reweight};
655             }
656              
657              
658             =item degree NODE
659              
660             Given a raw node, returns the degree (raw node means the node must
661             be prefixed with 'D:' or 'T:' depending on type )
662              
663             =cut
664              
665 5     5 1 3434 sub degree { scalar keys %{$_[0]->{neighbors}{$_[1]}} }
  5         46  
666              
667              
668             =item delete DOC
669              
670             Remove a document from the graph. Takes a document identifier
671             as an argument. Returns 1 if successful, undef otherwise.
672              
673             =cut
674              
675             sub delete {
676              
677 37     37 1 282 my ( $self, $type, $name ) = @_;
678            
679 37 50       153 croak "Must provide a node type to delete() method" unless defined $type;
680 37 50       215 croak "Invalid type $type passed to delete method. Must be one of [TD]"
681             unless $type =~ /^[TD]$/io;
682 37 50       84 croak "Please provide a node name" unless defined $name;
683            
684 37 50       97 return unless defined $name;
685 37         87 my $node = _nodeify( $type, $name);
686              
687 37         87 my $n = $self->{neighbors};
688 37 50       148 croak "Found a neighborless node $node"
689             unless exists $n->{$node};
690              
691 37         60 my @terms = keys %{ $n->{$node} };
  37         462  
692              
693 37 50       179 warn "found ", scalar @terms, " neighbors attached to $node\n"
694             if $self->{debug};
695             # Check to see if we have orphaned any terms
696 37         61 foreach my $t ( @terms ) {
697            
698 1205         2144 delete $n->{$node}{$t};
699 1205         2124 delete $n->{$t}{$node};
700              
701 1205 100       1260 if ( scalar keys %{ $n->{$t} } == 0 ) {
  1205         2994  
702 976 50       1922 warn "\tdeleting orphaned node $t" if $self->{debug};
703 976         3124 my ( $subtype, $name ) = $t =~ /^(.):(.*)$/;
704             #$self->delete( $subtype, $name );
705 976         2199 delete $n->{$t};
706             }
707             }
708              
709 37         113 delete $n->{$node};
710 37         156 $self->check_consistency();
711 37         921 $self->{reweight_flag} = 1;
712 37 50       308 $self->reweight_graph if $self->{auto_reweight};
713 37         570 1;
714             }
715              
716              
717              
718             =item has_doc DOC
719              
720             Returns true if the document with identifier DOC is in the collection
721              
722             =cut
723              
724             sub has_doc {
725 4     4 1 15 my ( $self, $doc ) = @_;
726 4 50       10 carp "Received undefined value for has_doc" unless defined $doc;
727 4         9 my $node = _nodeify( 'D', $doc );
728 4   100     34 return exists $self->{neighbors}{$node} || undef;
729             }
730              
731             =item has_term TERM
732              
733             Returns true if the term TERM is in the collection
734              
735             =cut
736              
737             sub has_term {
738 4     4 1 6 my ( $self, $term ) = @_;
739 4 50       11 carp "Received undefined value for has_term" unless defined $term;
740 4         8 my $node = _nodeify( 'T', $term );
741 4   100     31 return exists $self->{neighbors}{$node} || undef;
742             }
743              
744              
745              
746             =item distance NODE1, NODE2, TYPE
747              
748             Calculates the distance between two nodes of the same type (D or T)
749             using the formula:
750              
751             distance = ...
752             =cut
753              
754             sub distance {
755 0     0 1 0 my ( $self, $n1, $n2, $type ) = @_;
756 0 0       0 croak unless $type;
757 0         0 $type = lc( $type );
758 0 0       0 croak unless $type =~ /^[dt]$/;
759 0 0       0 my $key = ( $type eq 't' ? 'terms' : 'documents' );
760 0         0 my @shared = $self->intersection( $key => [ $n1, $n2 ] );
761 0 0       0 return 0 unless @shared;
762             #warn "Found ", scalar @shared, " nodes shared between $n1 and $n2\n";
763            
764 0         0 my $node1 = _nodeify( $type, $n1 );
765 0         0 my $node2 = _nodeify( $type, $n2 );
766             # formula is w(t1,d1)/deg(d1) + w(t1,d2)/deg(d2) ... ) /deg( t1 )
767            
768             #warn "Calculating distance\n";
769 0         0 my $sum1 = 0;
770 0         0 my $sum2 = 0;
771 0         0 foreach my $next ( @shared ) {
772 0         0 my ( undef, $lcount1) = split m/,/, $self->{neighbors}{$node1}{$next};
773 0         0 my ( undef, $lcount2) = split m/,/, $self->{neighbors}{$node2}{$next};
774              
775 0         0 my $degree = $self->degree( $next );
776             #warn "\t degree of $next is $degree\n";
777 0         0 my $elem1 = $lcount1 / $degree;
778 0         0 $sum1 += $elem1;
779 0         0 my $elem2 = $lcount2 / $degree;
780 0         0 $sum2 += $elem2;
781             }
782             #warn "sum is $sum1, $sum2\n";
783 0         0 my $final = ($sum1 / $self->degree( $node1 )) + ( $sum2 / $self->degree( $node2 ));
784             #warn "final is $final\n";
785 0         0 return $final;
786            
787            
788             }
789              
790             =item distance_matrix TYPE LIMIT
791              
792             Used for clustering using linear local embedding. Produces a similarity matrix
793             in a format I'm too tired to document right now. LIMIT is the maximum number
794             of neighbors to keep for each node.
795              
796             =cut
797              
798             sub distance_matrix {
799 0     0 1 0 my ( $self, $type, $limit ) = @_;
800 0 0       0 croak "Must provide type argument to distance_matrix()"
801             unless defined $type;
802 0 0       0 croak "must provide limit" unless $limit;
803 0         0 my @nodes;
804 0 0       0 if ( lc( $type ) eq 'd' ) {
    0          
805 0         0 @nodes = $self->doc_list();
806             } elsif ( lc( $type ) eq 't' ) {
807 0         0 @nodes = $self->term_list();
808             } else {
809 0         0 croak "Unsupported type $type";
810             }
811            
812 0         0 my @ret;
813 0         0 my $count = 0;
814 0         0 foreach my $from ( @nodes ) {
815 0         0 warn $from, " - $count\n";
816 0         0 $count++;
817 0         0 my $index = -1;
818 0         0 my @found;
819 0         0 foreach my $to ( @nodes ) {
820 0         0 $index++;
821 0 0       0 next if $from eq $to;
822 0         0 my $dist = $self->distance( $from, $to, $type );
823 0 0       0 push @found, [ $index, $dist ] if $dist;
824             #print( $index++, ' ', $dist, " " ) if $dist;
825             }
826 0         0 my @sorted = sort { $b->[1] <=> $a->[1] } @found;
  0         0  
827 0         0 my @final = splice ( @sorted, 0, $limit );
828 0         0 push @ret, join " ", ( map { join ' ', $_->[0], substr($_->[1], 0, 7) }
  0         0  
829 0         0 sort { $a->[0] <=> $b->[0] }
830             @final), "\n";
831             #print "\n";
832             }
833 0         0 return join "\n", @ret;
834              
835             }
836              
837             =item intersection @NODES
838              
839             Returns a list of neighbor nodes that all the given nodes share in common
840              
841             =cut
842              
843             sub intersection {
844 2     2 1 1324 my ( $self, %nodes ) = @_;
845 2         3 my @nodes;
846 2 100       17 if ( exists $nodes{documents} ) {
847 1         3 push @nodes, map { _nodeify( 'D', $_ ) } @{ $nodes{documents}};
  2         7  
  1         4  
848             }
849 2 100       33 if ( exists $nodes{terms} ) {
850 1         1 push @nodes, map { _nodeify( 'T', $_ ) } @{ $nodes{terms}};
  2         7  
  1         3  
851             }
852            
853 2         4 my %seen;
854 2         4 foreach my $n ( @nodes ) {
855 4         9 my @neighbors = $self->_neighbors( $n );
856 4         101 $seen{ $_ }++ foreach @neighbors;
857             }
858 4         12 return map { s/^[DT]://; $_ }
  4         23  
  131         169  
859 2         14 grep { $seen{$_} == scalar @nodes }
860             keys %seen;
861             }
862              
863             =item raw_search @NODES
864              
865             Given a list of nodes, returns a hash of nearest nodes with relevance values,
866             in the format NODE => RELEVANCE, for all nodes above the threshold value.
867             (You probably want one of L, L, or L instead).
868              
869             =cut
870              
871             sub raw_search {
872 12     12 1 27 my ( $self, @query ) = @_;
873              
874 12         46 $self->_clear();
875 12         37 foreach ( @query ) {
876 12         51 $self->_energize( $_, $self->{'START_ENERGY'});
877             }
878 12         36 my $results_ref = $self->_collect();
879              
880              
881 12         30 return $results_ref;
882             }
883              
884              
885              
886              
887             =item reweight_graph
888              
889             Iterates through the graph, calculating edge weights and normalizing
890             around nodes. This method is automatically called every time a
891             document is added, removed, or updated, unless you turn the option
892             off with auto_reweight(0). When adding a lot of docs, this can be
893             time consuming, so either set auto_reweight to off or use the
894             L method to add lots of docs at once
895              
896             =cut
897              
898             sub reweight_graph {
899 294     294 1 1615 my ( $self ) = @_;
900              
901 294         543 my $n = $self->{neighbors}; #shortcut
902 294         766 my $doc_count = $self->doc_count();
903             #print "Renormalizing for doc count $doc_count\n" if $self->{debug};
904 294         7166 foreach my $node ( keys %{$n} ) {
  294         16117  
905              
906 150286 100       325623 next unless $node =~ /^D:/o;
907 5210 50       24072 warn "reweighting at node $node\n" if $self->{debug} > 1;
908 5210         6133 my @terms = keys %{ $n->{$node} };
  5210         73395  
909 5210         19343 my @edges;
910 5210         7402 foreach my $t ( @terms ) {
911              
912 173163         357962 my $pair = $n->{$node}{$t};
913 173163         411221 my ( undef, $lcount ) = split /,/, $pair;
914 173163         639016 ( my $term = $t ) =~ s/^T://;
915 173163 50       375967 croak "did not receive a local count" unless $lcount;
916 173163         174757 my $weight;
917 173163 50       349496 if ( $self->{use_global_weights} ) {
918              
919 173163         349136 my $gweight = log( $doc_count / $self->doc_count( $term ) ) + 1;
920 173163         384076 my $lweight = log( $lcount ) + 1;
921 173163         265720 $weight = ( $gweight * $lweight );
922            
923             } else {
924              
925 0         0 $weight = log( $lcount ) + 1;
926             }
927 173163         574606 push @edges, [ $node, $t, $weight, $lcount ];
928             }
929              
930 5210         13666 __normalize( \@edges );
931              
932 5210         9523 foreach my $e ( @edges ) {
933 173163         571156 my $pair = join ',', $e->[2], $e->[3];
934 173163         566737 $n->{$node}{$e->[1]} = $n->{$e->[1]}{$node} = $pair;
935             }
936             }
937 294         17793 $self->{reweight_flag} = 0;
938 294         648 return 1;
939             }
940              
941              
942              
943              
944             =item update ID, WORDS
945              
946             Given a document identifier and a word list, updates the information for
947             that document in the graph. Returns the number of changes made
948              
949             =cut
950              
951             sub update {
952              
953 2     2 1 68 my ( $self, $id, $words ) = @_;
954              
955 2 50       10 croak "update not implemented in XS" if $self->{xs};
956 2 50       7 croak "Must provide a document identifier to update_document" unless defined $id;
957 2         8 my $dnode = _nodeify( 'D', $id );
958              
959 2 100       13 return unless exists $self->{neighbors}{$dnode};
960 1 50 33     19 croak "must provide a word list "
      33        
      33        
961             unless defined $words and
962             ref $words and
963             ( ref $words eq 'HASH' or
964             ref $words eq 'ARRAY' );
965              
966 1         32 my $n = $self->{neighbors}{$dnode};
967            
968             # Get the current word list
969 1         2 my @terms = keys %{ $n };
  1         6  
970              
971 1 50       6 if ( ref $words eq 'ARRAY' ) {
972 1         2 my %words;
973 1         8 $words{$_}++ foreach @$words;
974 1         4 $words = \%words;
975             }
976              
977 1         4 local $self->{auto_reweight} = 0;
978              
979 1         2 my $must_reweight = 0;
980 1         2 my %seen;
981              
982 1         2 foreach my $term ( keys %{$words} ) {
  1         4  
983              
984 4         10 my $t = _nodeify( 'T', $term );
985              
986 4 100       11 if ( exists $n->{$t} ){
987              
988             # Update the local count, if necessary
989 3         7 my $curr_val = $n->{$t};
990 3         9 my ( undef, $loc ) = split m/,/, $curr_val;
991              
992 3 50       12 unless ( $loc == $words->{$term} ) {
993 0         0 $n->{$t} = join ',', 1, $words->{$term};
994 0         0 $must_reweight++;
995             }
996             }
997              
998             else {
999              
1000 1         8 $n->{$t} =
1001             $self->{neighbors}{$t}{$dnode} =
1002             join ',', 1, $words->{$term};
1003 1         2 $must_reweight++;
1004             }
1005              
1006 4         12 $seen{$t}++;
1007             }
1008              
1009             # Check for deleted words
1010 1         5 foreach my $t ( @terms ) {
1011 3 50       9 $must_reweight++
1012             unless exists $seen{$t};
1013             }
1014              
1015 1 50       16 $self->reweight_graph() if
1016             $must_reweight;
1017              
1018 1         17 return $must_reweight;
1019              
1020             }
1021              
1022              
1023             =item doc_count [TERM]
1024              
1025             Returns a count of all documents that TERM occurs in.
1026             If no argument is provided, returns a document count
1027             for the entire collection.
1028              
1029             =cut
1030              
1031             sub doc_count {
1032 173490     173490 1 254700 my ( $self, $term ) = @_;
1033 173490 100       301573 if ( defined $term ) {
1034 173186 50       498440 $term = _nodeify( 'T', $term ) unless $term =~ /^T:/;
1035 173186         395011 my $node = $self->{neighbors}{$term};
1036 173186 100       326908 return 0 unless defined $node;
1037 173183         199690 return scalar keys %{$node};
  173183         568622  
1038             } else {
1039 304         61002 return scalar grep /D:/,
1040 304         414 keys %{ $self->{'neighbors'} };
1041             }
1042             }
1043              
1044              
1045             =item doc_list [TERM]
1046              
1047             Returns a sorted list of document identifiers that contain
1048             TERM, in ASCII-betical order. If no argument is given,
1049             returns a sorted document list for the whole collection.
1050              
1051             =cut
1052              
1053             sub doc_list {
1054 4     4 1 74 my ( $self, $term ) = @_;
1055 4         5 my $t;
1056 4 50 33     16 if ( defined $term and $term !~ /T:/) {
1057 0         0 $t = _nodeify( 'T', $term );
1058             }
1059 4 50       14 my $hash = ( defined $term ?
1060             $self->{neighbors}{$t} :
1061             $self->{neighbors} );
1062              
1063 136         216 sort map { s/^D://o; $_ }
  136         235  
  4         862  
1064 4         6 grep /^D:/, keys %{ $hash };
1065             }
1066              
1067              
1068             sub dump {
1069 0     0 0 0 my ( $self ) = @_;
1070 0         0 my @docs = $self->doc_list();
1071              
1072 0         0 foreach my $d ( @docs ) {
1073 0         0 print $self->dump_node( $d );
1074             }
1075             }
1076              
1077             =item dump_node NODE
1078              
1079             Lists all of the neighbors of a node, together with edge
1080             weights connecting to them
1081              
1082             =cut
1083              
1084             sub dump_node {
1085 2     2 1 10 my ( $self, $node ) = @_;
1086              
1087 2         4 my @lines;
1088 2         8 push @lines, join "\t", "COUNT", "WEIGHT", "NEIGHBOR";
1089              
1090 2         5 foreach my $n ( keys %{ $self->{neighbors}{$node} } ) {
  2         33  
1091 116         180 my $v = $self->{neighbors}{$node}{$n};
1092 116         188 my ( $weight, $count ) = split /,/, $v;
1093 116         237 push @lines, join "\t", $count, substr( $weight, 0, 8 ), $n;
1094             }
1095 2         31 return @lines;
1096             }
1097              
1098              
1099              
1100             =item dump_tdm [FILE]
1101              
1102             Dumps internal state in term-document matrix (TDM) format, which looks
1103             like this:
1104              
1105             A B C B C B C
1106             A B C B C B C
1107             A B C B C B C
1108              
1109             Where each row represents a document, A is the number of terms in the
1110             document, B is the term node and C is the edge weight between the doc
1111             node and B. Mostly used as a legacy format by the module author.
1112             Doc and term nodes are printed in ASCII-betical sorted order, zero-based
1113             indexing. Up to you to keep track of the ID => title mappings, neener-neener!
1114             Use doc_list and term_list to get an equivalently sorted list
1115              
1116             =cut
1117              
1118             sub dump_tdm {
1119 0     0 1 0 my ( $self, $file ) = @_;
1120              
1121 0         0 my $counter = 0;
1122 0         0 my %lookup;
1123 0         0 $lookup{$_} = $counter++ foreach $self->term_list;
1124              
1125 0         0 my @docs = $self->doc_list;
1126              
1127 0         0 my $fh;
1128 0 0       0 if ( defined $file ) {
1129 0 0       0 open $fh, "> $file" or croak
1130             "Could not open TDM output file: $!";
1131             } else {
1132 0         0 *fh = *STDOUT;
1133             }
1134 0         0 foreach my $doc ( @docs ) {
1135 0         0 my $n = $self->{neighbors}{$doc};
1136              
1137 0         0 my $row_count = scalar keys %{$n};
  0         0  
1138 0         0 print $fh $row_count;
1139              
1140 0         0 foreach my $t ( sort keys %{$doc} ) {
  0         0  
1141 0         0 my $index = $lookup{$t};
1142 0         0 my ( $weight, undef ) = split m/,/, $n->{$t};
1143 0         0 print $fh ' ', $index, ' ', $weight;
1144             }
1145 0         0 print $fh "\n";
1146             }
1147             }
1148              
1149              
1150              
1151             =item near_neighbors [NODE]
1152              
1153             Returns a list of neighbor nodes of the same type (doc/doc, or term/term) two
1154             hops away.
1155              
1156             =cut
1157              
1158             sub near_neighbors {
1159 0     0 1 0 my ( $self, $name, $type ) = @_;
1160            
1161 0         0 my $node = _nodeify( $type, $name );
1162            
1163 0         0 my $n = $self->{neighbors}{$node};
1164            
1165 0         0 my %found;
1166 0         0 foreach my $next ( keys %{$n} ) {
  0         0  
1167 0         0 foreach my $mynext ( keys %{ $self->{neighbors}{$next} }){
  0         0  
1168 0         0 $found{$mynext}++;
1169             }
1170             }
1171 0         0 delete $found{$node};
1172 0         0 return keys %found;
1173             }
1174              
1175              
1176             =item term_count [DOC]
1177              
1178             Returns the number of unique terms in a document or,
1179             if no document is specified, in the entire collection.
1180              
1181             =cut
1182              
1183             sub term_count {
1184 9     9 1 2187 my ( $self, $doc ) = @_;
1185 9 50       34 if ( defined $doc ) {
1186 0         0 my $node = $self->{neighbors}{ _nodeify( 'D', $doc) };
1187 0 0       0 return 0 unless defined $node;
1188 0         0 return scalar keys %{$node};
  0         0  
1189             } else {
1190 9         8036 return scalar grep /T:/,
1191 9         17 keys %{ $self->{neighbors} };
1192             }
1193             }
1194              
1195              
1196             =item term_list [DOC]
1197              
1198             Returns a sorted list of unique terms appearing in the document
1199             with identifier DOC, in ASCII-betical order. If no argument is
1200             given, returns a sorted term list for the whole collection.
1201              
1202             =cut
1203              
1204             sub term_list {
1205 3     3 1 1345 my ( $self, $doc ) = @_;
1206              
1207 3 100       16 my $node = ( defined $doc ?
1208             $self->{neighbors}{ _nodeify( 'D', $doc) } :
1209             $self->{neighbors}
1210             );
1211              
1212 1782         3740 sort map { s/^T://o; $_ }
  1782         4800  
  3         1120  
1213 3         218 grep /^T:/, keys %{ $node };
1214             }
1215              
1216              
1217              
1218             =item word_count [TERM]
1219              
1220             Returns the total occurence count for a term, or if no argument is given,
1221             a word count for the entire collection. The word count is always greater than
1222             or equal to the term count.
1223              
1224             =cut
1225              
1226             sub word_count {
1227              
1228 4     4 1 13 my ( $self, $term ) = @_;
1229              
1230 4         12 my $n = $self->{neighbors}; # shortcut
1231              
1232 4         9 my $count = 0;
1233 4         9 my @terms;
1234 4 100       17 if ( defined $term ) {
1235 3         8 push @terms, $term;
1236             } else {
1237 1         3 @terms = $self->term_list();
1238             }
1239              
1240 4         138 foreach my $term (@terms ) {
1241 879 50       2252 $term = _nodeify( 'T', $term) unless $term =~/^T:/o;
1242 879         930 foreach my $doc ( keys %{ $n->{$term} } ) {
  879         2124  
1243 1092         2615 ( undef, my $lcount ) = split /,/, $n->{$term}{$doc};
1244 1092         2603 $count += $lcount;
1245             }
1246             }
1247              
1248 4         94 return $count;
1249             }
1250              
1251              
1252              
1253              
1254              
1255             =item search @QUERY
1256              
1257             Searches the graph for all of the words in @QUERY. Use find_similar if you
1258             want to do a document similarity instead, or mixed_search if you want
1259             to search on any combination of words and documents. Returns a pair of hashrefs:
1260             the first a reference to a hash of docs and relevance values, the second to
1261             a hash of words and relevance values.
1262              
1263             =cut
1264              
1265             sub search {
1266 9     9 1 2906 my ( $self, @query ) = @_;
1267 9         27 my @nodes = _nodeify( 'T', @query );
1268 9         40 my $results = $self->raw_search( @nodes );
1269 9         28 my ($docs, $words) = _partition( $results );
1270 9         50 return ( $docs, $words);
1271             }
1272              
1273              
1274              
1275             =item simple_search QUERY
1276              
1277             This is the DWIM method - takes a query string as its argument, and returns an array
1278             of documents, sorted by relevance.
1279              
1280             =cut
1281              
1282             sub simple_search {
1283 1     1 1 12 my ( $self, $query ) = @_;
1284 1         3 my @words = map { s/\W+//g; lc($_) }
  1         3  
  1         5  
1285             split m/\s+/, $query;
1286 1         4 my @nodes = _nodeify( 'T', @words );
1287 1         4 my $results = $self->raw_search( @nodes );
1288 1         3 my ($docs, $words) = _partition( $results );
1289 1         3 my @sorted_docs = sort { $docs->{$b} <=> $docs->{$a} } keys %{$docs};
  1         4  
  1         4  
1290 1         7 return @sorted_docs;
1291             }
1292              
1293             =item find_by_title @TITLES
1294              
1295             Given a list of patterns, searches for documents with matching titles
1296              
1297             =cut
1298              
1299             sub find_by_title {
1300 3     3 1 1035 my ( $self, @titles ) = @_;
1301 3         4 my @found;
1302 3         9 my @docs = $self->doc_list();
1303 3         109 my $pattern = join '|', @titles;
1304 3         66 my $match_me = qr/$pattern/i;
1305             #warn $match_me, "\n";
1306 3         6 foreach my $d ( @docs ) {
1307             # warn $d, "\n";
1308 102 100       318 push @found, $d if $d =~ $match_me;
1309             }
1310 3         27 return @found;
1311             }
1312              
1313              
1314             =item find_similar @DOCS
1315              
1316             Given an array of document identifiers, performs a similarity search
1317             and returns a pair of hashrefs. First hashref is to a hash of docs and relevance
1318             values, second is to a hash of words and relevance values.
1319              
1320             =cut
1321              
1322             sub find_similar {
1323 2     2 1 2082 my ( $self, @docs ) = @_;
1324 2         6 my @nodes = _nodeify( 'D', @docs );
1325 2         6 my $results = $self->raw_search( @nodes );
1326 2         5 my ($docs, $words) = _partition( $results );
1327 2         12 return ( $docs, $words);
1328             }
1329              
1330              
1331             =item merge TYPE, GOOD, @BAD
1332              
1333             Combine all the nodes in @BAD into the node with identifier GOOD.
1334             First argument must be one of 'T' or 'D' to indicate term or
1335             document nodes. Used to combine synonyms in the graph.
1336              
1337             =cut
1338              
1339             sub merge {
1340 2     2 1 1349 my ( $self, $type, $good, @bad ) = @_;
1341 2 50       14 croak "must provide a type argument to merge"
1342             unless defined $type;
1343 2 50       11 croak "Invalid type argument $type to merge [must be one of (D,T)]"
1344             unless $type =~ /^[DT]/io;
1345            
1346 2         6 my $target = _nodeify( $type, $good );
1347 2         5 my @sources = _nodeify( $type, @bad );
1348            
1349 2         7 my $tnode = $self->{neighbors}{$target};
1350            
1351              
1352 2         3 foreach my $bad_node ( @sources ) {
1353             #print "Examining $bad_node\n";
1354 2 50       6 next if $bad_node eq $target;
1355 2         4 my %neighbors = %{$self->{neighbors}{$bad_node}};
  2         20  
1356            
1357 2         9 foreach my $n ( keys %neighbors ) {
1358            
1359             #print "\t $target ($bad_node) neighbor $n\n";
1360 14 100       35 if ( exists $self->{neighbors}{$target}{$n} ) {
1361             #print "\t\t$n has link to $bad_node\n";
1362             # combine the local counts for the term members of the edge
1363 5         9 my $curr_val = $tnode->{$n};
1364 5         11 my $aug_val = $self->{neighbors}{$bad_node}{$n};
1365 5         14 my ($w1, $c1) = split m/,/, $curr_val;
1366 5         11 my ($w2, $c2) = split m/,/, $aug_val;
1367 5         8 my $new_count = $c1 + $c2;
1368 5         25 $curr_val =~ s/,\d+$/,$new_count/;
1369 5         14 $tnode->{$n} = $curr_val;
1370            
1371            
1372             } else {
1373            
1374 9 50       21 die "sanity check failed for existence test"
1375             if exists $self->{neighbors}{$target}{$n};
1376            
1377 9         15 my $val = $self->{neighbors}{$bad_node}{$n};
1378            
1379             #print "\tno existing link -- reassigning $target -- $n\n";
1380             # reassign the current value of this edge
1381            
1382 9         22 $self->{neighbors}{$n}{$target} = $val;
1383 9         20 $self->{neighbors}{$target}{$n} = $val;
1384             }
1385            
1386 14         23 delete $self->{neighbors}{$bad_node}{$n};
1387 14         35 delete $self->{neighbors}{$n}{$bad_node};
1388             }
1389 2         14 delete $self->{neighbors}{$bad_node};
1390             }
1391             }
1392              
1393             =item mixed_search @DOCS
1394              
1395             Given a hashref in the form:
1396             { docs => [ 'Title 1', 'Title 2' ],
1397             terms => ['buffalo', 'fox' ], }
1398             }
1399             Runs a combined search on the terms and documents provided, and
1400             returns a pair of hashrefs. The first hashref is to a hash of docs
1401             and relevance values, second is to a hash of words and relevance values.
1402              
1403             =cut
1404              
1405             sub mixed_search {
1406 0     0 1 0 my ( $self, $incoming ) = @_;
1407              
1408 0 0 0     0 croak "must provide hash ref to mixed_search method"
      0        
1409             unless defined $incoming &&
1410             ref( $incoming ) &&
1411             ref( $incoming ) eq 'HASH';
1412              
1413 0   0     0 my $tref = $incoming->{'terms'} || [];
1414 0   0     0 my $dref = $incoming->{'docs'} || [];
1415              
1416 0         0 my @dnodes = _nodeify( 'D', @{$dref} );
  0         0  
1417 0         0 my @tnodes = _nodeify( 'T', @{$tref} );
  0         0  
1418              
1419 0         0 my $results = $self->raw_search( @dnodes, @tnodes );
1420 0         0 my ($docs, $words) = _partition( $results );
1421 0         0 return ( $docs, $words);
1422             }
1423              
1424              
1425             =item store FILENAME
1426              
1427             Stores the object to a file for later use. Not compatible (yet)
1428             with compiled XS version, which will give a fatal error.
1429              
1430             =cut
1431              
1432             sub store {
1433 0     0 1 0 my ( $self, @args ) = @_;
1434 0 0       0 if ( $self->{'xs'} ) {
1435 0         0 croak "Cannot store object when running in XS mode.";
1436             } else {
1437 0         0 $self->SUPER::nstore(@args);
1438             }
1439             }
1440              
1441              
1442             # Partition - internal method.
1443             # Takes a result set and splits it into two hashrefs - one for
1444             # words and one for documents
1445              
1446             sub _partition {
1447 12     12   19 my ( $e ) = @_;
1448 12         17 my ( $docs, $words );
1449 12         16 foreach my $k ( sort { $e->{$b} <=> $e->{$a} }
  209         277  
  12         83  
1450             keys %{ $e } ) {
1451              
1452 81         219 (my $name = $k ) =~ s/^[DT]://o;
1453 81 100       285 $k =~ /^D:/ ?
1454             $docs->{$name} = $e->{$k} :
1455             $words->{$name} = $e->{$k} ;
1456             }
1457 12         37 return ( $docs, $words );
1458             }
1459              
1460             # return a list of all neighbor nodes
1461             sub _neighbors {
1462 4     4   8 my ( $self, $node ) = @_;
1463 4 50       13 return unless exists $self->{neighbors}{$node};
1464 4         5 return keys %{ $self->{neighbors}{$node} };
  4         44  
1465             }
1466              
1467              
1468             sub _nodeify {
1469 188046     188046   383716 my ( $prefix, @list ) = @_;
1470 188046         206997 my @nodes;
1471 188046         275645 foreach my $item ( @list ) {
1472 188046         531772 push @nodes, uc($prefix).':'.$item;
1473             }
1474 188046 100       578801 ( wantarray ? @nodes : $nodes[0] );
1475             }
1476              
1477              
1478              
1479             sub _read_tdm {
1480 0     0   0 my ( $self, $file ) = @_;
1481 0 0       0 print "Loading TDM...\n" if $self->{'debug'} > 1;
1482              
1483 0 0       0 croak "File does not exist" unless -f $file;
1484 0 0       0 open my $fh, $file or croak "Could not open $file: $!";
1485 0         0 for ( 1..4 ){
1486 0         0 my $skip = <$fh>;
1487             }
1488 0         0 my %neighbors;
1489 0         0 my $doc = 0;
1490              
1491              
1492             ######### XS VERSION ##############
1493 0 0       0 if ( $self->{'xs'} ) {
1494              
1495 0         0 my $map = $self->{'node_map'}; # shortcut alias
1496 0         0 while (<$fh>) {
1497 0         0 chomp;
1498 0         0 my $dindex = $self->_add_node( "D:$doc", 2 );
1499             #warn "Added node $doc\n";
1500 0         0 my ( $count, %vals ) = split;
1501 0         0 while ( my ( $term, $edge ) = each %vals ) {
1502 0         0 $self->{'term_count'}{$term}++;
1503 0         0 my $tnode = "T:$term";
1504              
1505 0 0       0 my $tindex = ( defined $map->{$tnode} ?
1506             $map->{$tnode} :
1507             $self->_add_node( $tnode, 1 )
1508             );
1509 0         0 $self->{Graph}->set_edge( $dindex, $tindex, $edge );
1510             }
1511 0         0 $doc++;
1512             }
1513              
1514             ####### PURE PERL VERSION ##########
1515             } else {
1516 0         0 while (<$fh>) {
1517 0         0 chomp;
1518 0         0 my $dnode = "D:$doc";
1519 0         0 my ( $count, %vals ) = split;
1520 0         0 while ( my ( $term, $edge ) = each %vals ) {
1521 0         0 $self->{'term_count'}{$term}++;
1522 0         0 my $tnode = "T:$term";
1523              
1524 0         0 $neighbors{$dnode}{$tnode} = $edge.',1';
1525 0         0 $neighbors{$tnode}{$dnode} = $edge.',1';
1526             }
1527 0         0 $doc++;
1528             }
1529 0         0 $self->{'neighbors'} = \%neighbors;
1530             }
1531              
1532 0 0       0 print "Loaded.\n" if $self->{'debug'} > 1;
1533 0         0 $self->{'from_TDM'} = 1;
1534 0         0 $self->{'doc_count'} = $doc;
1535             }
1536              
1537              
1538              
1539             # XS version only
1540             #
1541             # This sub maintains a mapping between node names and integer index
1542             # values.
1543              
1544             sub _add_node {
1545 0     0   0 my ( $self, $node_name, $type ) = @_;
1546 0 0       0 croak "Must provide a type" unless $type;
1547 0 0       0 croak "Must provide a node name" unless $node_name;
1548 0 0       0 croak "This node already exists" if
1549             $self->{'node_map'}{$node_name};
1550              
1551 0         0 my $new_id = $self->{'next_free_id'}++;
1552 0         0 $self->{'node_map'}{$node_name} = $new_id;
1553 0         0 $self->{'id_map'}[$new_id] = $node_name;
1554 0         0 $self->{'Graph'}->add_node( $new_id, $type );
1555              
1556 0         0 return $new_id;
1557             }
1558              
1559              
1560              
1561             #
1562             # INTERNAL METHODS
1563             #
1564              
1565             # each node should have the same number of inbound
1566             # and outbound links
1567              
1568             sub check_consistency {
1569              
1570 37     37 0 60 my ( $self ) = @_;
1571 37         57 my %inbound;
1572             my %outbound;
1573            
1574            
1575 37         62 foreach my $node ( keys %{$self->{neighbors}} ) {
  37         3748  
1576 17194 50       48162 next unless $node =~ /^[DT]:/; # for MLDBM compatibility
1577 17194         18401 $outbound{$node} = scalar keys %{$self->{neighbors}{$node}};
  17194         40965  
1578 17194         19908 foreach my $neighbor ( keys %{ $self->{neighbors}{$node} } ) {
  17194         43491  
1579 39868         74583 $inbound{$neighbor}++;
1580             }
1581             }
1582            
1583 37         1361 my $in = scalar keys %inbound;
1584 37         76 my $out = scalar keys %outbound;
1585 37 50       128 carp "number of nodes with inbound links ($in) does not match number of nodes with outbound links ( $out )"
1586             unless scalar keys %inbound == scalar keys %outbound;
1587            
1588 37         2195 foreach my $node ( keys %inbound ) {
1589 17194   50     28993 $outbound{$node} ||= 0;
1590 17194 50       37030 carp "$node has $inbound{$node} inbound links, $outbound{$node} outbound links\n"
1591             unless $inbound{$node} == $outbound{$node};
1592             }
1593              
1594             }
1595              
1596              
1597             =item have_edge RAWNODE1, RAWNODE2
1598              
1599             Returns true if the nodes share an edge. Node names must be prefixed with 'D' or 'T'
1600             as appropriate.
1601              
1602             =cut
1603              
1604             sub have_edge {
1605 0     0 1 0 my ( $self, $node1, $node2 ) = @_;
1606 0         0 return exists $self->{neighbors}{$node1}{$node2};
1607             }
1608              
1609              
1610             {
1611              
1612             my %visited;
1613             my %component;
1614             my $depth;
1615            
1616             =item connected_components
1617              
1618             Returns an array of connected components in the graph. Each component is a list
1619             of nodes that are mutually accessible by traveling along edges.
1620              
1621             =cut
1622              
1623             sub connected_components {
1624 0     0 1 0 my ( $self ) = @_;
1625            
1626 0         0 %visited = (); # clear any old info
1627 0         0 %component = ();
1628            
1629            
1630 0         0 my $n = $self->{neighbors};
1631            
1632            
1633 0         0 my @node_list = keys %{$n};
  0         0  
1634 0         0 my @components;
1635            
1636 0         0 while ( @node_list ) {
1637 0         0 my $start = shift @node_list;
1638 0 0       0 next if exists $visited{$start};
1639            
1640 0 0       0 last unless $start;
1641 0         0 warn "Visiting neighbors for $start\n";
1642 0         0 visit_neighbors( $n, $start );
1643 0         0 push @components, [ keys %component ];
1644 0         0 %component = ();
1645             }
1646            
1647 0         0 warn "Found ", scalar @components, " connected components\n";
1648 0         0 return @components;
1649            
1650            
1651             }
1652              
1653             sub visit_neighbors {
1654 0     0 0 0 my ( $g, $l ) = @_;
1655 0 0       0 return if $visited{$l};
1656 0         0 $depth++;
1657 0         0 $visited{$l}++; $component{$l}++;
  0         0  
1658 0         0 warn $depth, " $l\n";
1659 0         0 my @neigh = keys %{ $g->{$l} };
  0         0  
1660 0         0 foreach my $n ( @neigh ) {
1661 0         0 visit_neighbors( $g, $n );
1662             }
1663 0         0 $depth--;
1664             }
1665             }
1666              
1667              
1668             # Wipe the graph free of stored energies
1669              
1670             sub _clear {
1671 12     12   18 my ( $self ) = @_;
1672 12         31 $self->{'energy'} = undef;
1673             }
1674              
1675              
1676             # Gather the stored energy values from the graph
1677              
1678             sub _collect {
1679 12     12   15 my ( $self ) = @_;
1680 12         17 my $e = $self->{'energy'};
1681 12         20 my $result = {};
1682 12         18 foreach my $k ( keys %{$self->{'energy'}} ) {
  12         40  
1683 81 50       159 next unless $e->{$k} > $self->{'COLLECT_THRESHOLD'};
1684 81         130 $result->{$k} = $e->{$k};
1685             }
1686 12         28 return $result;
1687             }
1688              
1689              
1690              
1691              
1692             # Assign a starting energy ENERGY to NODE, and recursively distribute the
1693             # energy to neighbor nodes. Singleton nodes get special treatment
1694              
1695             sub _energize {
1696              
1697 81     81   113 my ( $self, $node, $energy ) = @_;
1698              
1699              
1700 81 50       206 return unless defined $self->{neighbors}{$node};
1701 81   50     340 my $orig = $self->{energy}{$node} || 0;
1702 81         154 $self->{energy}->{$node} += $energy;
1703 81 50       180 return if ( $self->{depth} == $self->{max_depth} );
1704 81         83 $self->{depth}++;
1705              
1706 81 50       189 if ( $self->{'debug'} > 1 ) {
1707 0         0 print ' ' x $self->{'depth'};
1708 0         0 print "$node: energizing $orig + $energy\n";
1709             }
1710              
1711              
1712 81         98 my $n = $self->{neighbors};
1713            
1714             #sleep 1;
1715 81         78 my $degree = scalar keys %{ $n->{$node} };
  81         138  
1716              
1717              
1718 81 50       147 if ( $degree == 0 ) {
1719            
1720 0         0 carp "WARNING: reached a node without neighbors: $node at search depth $self->{depth}\n";
1721 0         0 $self->{depth}--;
1722 0         0 return;
1723             }
1724            
1725            
1726 81         152 my $subenergy = $energy / (log($degree)+1);
1727              
1728              
1729             # At singleton nodes (words that appear in only one document, for example)
1730             # Don't spread energy any further. This avoids a "reflection" back and
1731             # forth from singleton nodes to their neighbors.
1732              
1733 81 100 100     298 if ( $degree == 1 and $energy < $self->{'START_ENERGY'} ) {
    100          
1734              
1735             #do nothing
1736              
1737             } elsif ( $subenergy > $self->{ACTIVATE_THRESHOLD} ) {
1738 12 50       33 print ' ' x $self->{'depth'},
1739             "$node: propagating subenergy $subenergy to $degree neighbors\n"
1740             if $self->{'debug'} > 1;
1741 12         15 foreach my $neighbor ( keys %{ $n->{$node} } ) {
  12         44  
1742 69         148 my $pair = $n->{$node}{$neighbor};
1743 69         166 my ( $edge, undef ) = split /,/, $pair;
1744 69         149 my $weighted_energy = $subenergy * $edge;
1745 69 50       135 print ' ' x $self->{'depth'},
1746             " edge $edge ($node, $neighbor)\n"
1747             if $self->{'debug'} > 1;
1748 69         140 $self->_energize( $neighbor, $weighted_energy );
1749             }
1750             }
1751 81         101 $self->{'depth'}--;
1752 81         153 return 1;
1753             }
1754              
1755              
1756             # Given an array, normalize using cosine normalization
1757              
1758             sub __normalize {
1759 5630     5630   7511 my ( $arr ) = @_;
1760              
1761 5630 50 33     41861 croak "Must provide array ref to __normalize" unless
      33        
1762             defined $arr and
1763             ref $arr and
1764             ref $arr eq 'ARRAY';
1765              
1766 5630         6060 my $sum;
1767 5630         6009 $sum += $_->[2] foreach @{$arr};
  5630         81879  
1768 5630         8577 $_->[2]/= $sum foreach @{$arr};
  5630         68705  
1769 5630         15044 return 1;
1770             }
1771              
1772              
1773              
1774              
1775             sub DESTROY {
1776 15     15   35508 undef $_[0]->{Graph}
1777             }
1778              
1779             1;
1780              
1781             __END__