File Coverage

blib/lib/Search/Tools/HeatMap.pm
Criterion Covered Total %
statement 192 219 87.6
branch 64 104 61.5
condition 36 47 76.6
subroutine 10 10 100.0
pod 2 2 100.0
total 304 382 79.5


line stmt bran cond sub pod time code
1             package Search::Tools::HeatMap;
2 17     17   61 use Moo;
  17         57  
  17         76  
3 17     17   3115 use Carp;
  17         20  
  17         790  
4 17     17   62 use Data::Dump qw( dump );
  17         17  
  17         657  
5             extends 'Search::Tools::Object';
6              
7 17     17   58 use namespace::autoclean;
  17         19  
  17         93  
8              
9             our $VERSION = '1.004';
10              
11             # debugging only
12             my $OPEN = '[';
13             my $CLOSE = ']';
14             eval { require Term::ANSIColor; };
15             if ( !$@ ) {
16             $OPEN .= Term::ANSIColor::color('bold red');
17             $CLOSE = Term::ANSIColor::color('reset') . $CLOSE;
18             }
19              
20             my @attrs = qw( window_size
21             tokens
22             spans
23             as_sentences
24             _treat_phrases_as_singles
25             _qre
26             _query
27             _stemmer
28             );
29              
30             for my $attr (@attrs) {
31             has $attr => ( is => 'rw' );
32             }
33              
34             =head1 NAME
35              
36             Search::Tools::HeatMap - locate the best matches in a snippet extract
37              
38             =head1 SYNOPSIS
39              
40             use Search::Tools::Tokenizer;
41             use Search::Tools::HeatMap;
42            
43             my $tokens = $self->tokenizer->tokenize( $my_string, qr/^(interesting)$/ );
44             my $heatmap = Search::Tools::HeatMap->new(
45             tokens => $tokens,
46             window_size => 20, # default
47             as_sentences => 0, # default
48             );
49              
50             if ( $heatmap->has_spans ) {
51            
52             my $tokens_arr = $tokens->as_array;
53              
54             # stringify positions
55             my @snips;
56             for my $span ( @{ $heatmap->spans } ) {
57             push( @snips, $span->{str} );
58             }
59             my $occur_index = $self->occur - 1;
60             if ( $#snips > $occur_index ) {
61             @snips = @snips[ 0 .. $occur_index ];
62             }
63             printf("%s\n", join( ' ... ', @snips ));
64            
65             }
66              
67             =head1 DESCRIPTION
68              
69             Search::Tools::HeatMap implements a simple algorithm for locating
70             the densest clusters of unique, hot terms in a TokenList.
71              
72             HeatMap is used internally by Snipper but documented here in case
73             someone wants to abuse and/or improve it.
74              
75             =head1 METHODS
76              
77             =head2 new( tokens => I )
78              
79             Create a new HeatMap. The I object may be either a
80             Search::Tools::TokenList or Search::Tools::TokenListPP object.
81              
82             =head2 BUILD
83              
84             Builds the HeatMap object. Called internally by new().
85              
86             =cut
87              
88             sub BUILD {
89 30     30 1 485 my $self = shift;
90 30         96 $self->_build;
91 30         741 return $self;
92             }
93              
94             =head2 window_size
95              
96             The max width of a span. Defaults to 20 tokens, including the
97             matches.
98              
99             Set this in new(). Access it later if you need to, but the spans
100             will have already been created by new().
101              
102             =head2 as_sentences
103              
104             Try to match clusters at sentence boundaries. Default is false.
105              
106             Set this in new().
107              
108             =head2 spans
109              
110             Returns an array ref of matching clusters. Each span in the array
111             is a hash ref with the following keys:
112              
113             =over
114              
115             =item cluster
116              
117             =item pos
118              
119             =item heat
120              
121             =item str
122              
123             =item str_w_pos
124              
125             This item is available only if debug() is true.
126              
127             =item unique
128              
129             =back
130              
131             =cut
132              
133             # TODO this is mostly integer math and might be much
134             # faster if rewritten in XS once the algorithm is "final".
135             sub _build {
136 30     30   39 my $self = shift;
137 30 50       544 my $tokens = $self->tokens or croak "tokens required";
138 30   50     126 my $window = $self->window_size || 20;
139 30   100     152 my $as_sentences = $self->as_sentences || 0;
140 30 100       131 return $as_sentences
141             ? $self->_as_sentences( $tokens, $window )
142             : $self->_no_sentences( $tokens, $window );
143             }
144              
145             # currently _as_sentences() is mostly identical to _no_sentences()
146             # with slightly fewer gymnastics.
147             # Since we already know via sentence_starts where our boundaries are,
148             # we do not have to call $tokens->get_window().
149             # Who knows how we might improve the sentence algorithm in future,
150             # so already having it in its own method seems like a win.
151             sub _as_sentences {
152 13     13   24 my ( $self, $tokens, $window ) = @_;
153 13   50     265 my $debug = $self->debug || 0;
154 13         121 my $sentence_length = $window * 2;
155              
156             # build heatmap with sentence starts
157 13         51 my $num_tokens = $tokens->len;
158 13         41 my $tokens_arr = $tokens->as_array;
159 13         26 my %heatmap = ();
160 13         44 my $token_list_heat = $tokens->get_heat;
161 13         43 my $heat_sentence_starts = $tokens->get_sentence_starts;
162              
163             # this regex is a sanity check for phrases. we replace the \ with a
164             # more promiscuous check because the single space is too naive
165             # for real text (e.g. st. john's)
166 13         24 my $qre = $self->{_qre};
167 13         20 my @phrases = @{ $self->{_query}->phrases };
  13         62  
168 13         53 my $n_terms = $self->{_query}->num_terms;
169 13         80 my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
170              
171 13 50       41 if ($debug) {
172 0         0 warn "heat_sentence_starts: " . dump($heat_sentence_starts);
173 0         0 warn "token_list_heat: " . dump($token_list_heat);
174 0         0 warn "n_terms: $n_terms";
175 0         0 warn "phrases: " . dump( \@phrases );
176 0         0 warn "query_has_phrase: $query_has_phrase";
177             }
178              
179             # find the "sentence" that each hot token appears in.
180 13         14 my @starts_ends;
181 13         15 my $i = 0;
182 13         23 my %heat_sentence_ends = (); # cache
183 13         29 for (@$token_list_heat) {
184 42         111 my $token = $tokens->get_token($_);
185 42         100 my $token_pos = $token->pos;
186 42         54 my $start = $heat_sentence_starts->[ $i++ ];
187 42         138 $heatmap{$token_pos} = $token->is_hot;
188              
189             # a little optimization for when we've got
190             # multiple hot tokens in the same sentence
191 42 100       84 if ( exists $heat_sentence_ends{$start} ) {
192 22 50       40 $debug
193             and warn "found cached end $heat_sentence_ends{$start} "
194             . "for start $start token $token_pos\n";
195              
196             push( @starts_ends,
197 22         47 [ $start, $token_pos, $heat_sentence_ends{$start} ] );
198 22         35 next;
199             }
200              
201             # find the outermost limit of where this sentence might end
202 20         19 my $max_end;
203              
204             # is there a "next" start?
205 20 100 100     98 if ( defined $heat_sentence_starts->[$i]
206             and $heat_sentence_starts->[$i] != $start )
207             {
208              
209             # this token is unique in this non-final sentence
210 3         8 $max_end = $heat_sentence_starts->[$i] - 1;
211             }
212             else {
213              
214             # this is the final sentence
215 17         30 $max_end = $num_tokens - 1;
216             }
217 20         29 my $end = $start;
218              
219             # find the nearest sentence end to the start
220 20         47 while ( $end < $max_end ) {
221 1990         2299 my $tok = $tokens->get_token( $end++ );
222 1990 50       2868 if ( !$tok ) {
223 0 0       0 $debug and warn "No token at end=$end";
224 0         0 last;
225             }
226 1990 100       4422 if ( $tok->is_sentence_end ) {
227 10         17 $end--; # move back one position
228 10 50       25 if ($debug) {
229 0         0 warn "tok $_ is_sentence_end end=$end";
230 0         0 $tok->dump;
231             }
232 10         23 last;
233             }
234             }
235              
236             # back up if we've exceeded the 0-based tokens array.
237 20 50       50 $end = $num_tokens if $end > $num_tokens;
238              
239 20 50       45 $debug
240             and warn "start=$start max_end=$max_end "
241             . "sentence_length=$sentence_length end=$end "
242             . "token_pos=$token_pos\n";
243              
244             # if we didn't yet set the actual hot token,
245             # include everything up to it.
246 20 50       45 if ( $end < $token_pos ) {
247 0 0       0 $debug
248             and warn "resetting end=$token_pos\n";
249              
250 0         0 $end = $token_pos;
251             }
252 20         57 push( @starts_ends, [ $start, $token_pos, $end ] );
253              
254             # cache
255 20         61 $heat_sentence_ends{$start} = $end;
256             }
257              
258 13 50       36 $debug and warn "starts_ends: " . dump( \@starts_ends );
259              
260 13         19 my @spans;
261             my %seen_pos;
262             START_END:
263 13         44 for my $start_end (@starts_ends) {
264              
265             # get full window, ignoring positions we've already seen.
266 42         42 my $heat = 0;
267 42         37 my %span;
268             my @cluster_tokens;
269              
270 42         85 my ( $start, $hot_pos, $end ) = @$start_end;
271 42         89 POS: for my $pos ( $start .. $end ) {
272 5075 100       9889 next POS if $seen_pos{$pos}++;
273 2000 100       2064 $heat += ( exists $heatmap{$pos} ? $heatmap{$pos} : 0 );
274 2000         2426 push( @cluster_tokens, $tokens->get_token($pos) );
275             }
276              
277             # if we had already seen_pos all positions.
278 42 100       109 next START_END unless @cluster_tokens;
279              
280             # sanity: make sure we still have something hot
281 20         29 my $has_hot = 0;
282 20         24 my @cluster_pos;
283             my @strings;
284 20         34 TOK: for (@cluster_tokens) {
285 2000         1855 my $pos = $_->pos;
286 2000 100       2246 $has_hot++ if exists $heatmap{$pos};
287 2000         2235 push @strings, $_->str;
288 2000         1799 push @cluster_pos, $pos;
289             }
290 20 50       43 next START_END unless $has_hot;
291              
292             # the final string is a sentence end,
293             # but we only want the first char in it,
294             # and not any whitespace, stray punctuation or other
295             # non-word noise.
296 20         122 $strings[$#strings] =~ s/^([\.\?\!]).*/$1/;
297              
298 20         48 $span{start_end} = $start_end;
299 20         32 $span{heat} = $heat;
300 20         33 $span{pos} = \@cluster_pos;
301 20         30 $span{tokens} = \@cluster_tokens;
302 20         130 $span{str} = join( '', @strings );
303              
304             # spans with more *unique* hot tokens in a single span rank higher
305             # spans with more *proximate* hot tokens in a single span rank higher
306 20         40 my %uniq = ();
307 20         19 my $i = 0;
308 20         23 my $num_proximate = 1; # one for the single hot token
309 20         36 for (@cluster_pos) {
310 2000 100       2124 if ( exists $heatmap{$_} ) {
311 42         107 $uniq{ lc $strings[$i] } += $heatmap{$_};
312 42 100 100     171 if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
313 10         11 $num_proximate++;
314             }
315             }
316 2000         1282 $i++;
317             }
318 20         45 $span{unique} = scalar keys %uniq;
319 20         30 $span{proximate} = $num_proximate;
320              
321             # no false phrase matches if !_treat_phrases_as_singles
322             # stemmer check because regex will likely fail
323             # when stemmer is on
324 20 100 66     80 if ( $query_has_phrase
325             and !$self->{_treat_phrases_as_singles} )
326             {
327 7 100       16 if ( !$self->{_stemmer} ) {
328              
329             #warn "_treat_phrases_as_singles NOT true";
330 3 50       106 if ( $span{str} !~ m/$qre/ ) {
331 0 0       0 $debug
332             and warn
333             "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
334 0         0 next START_END;
335             }
336             }
337             else {
338              
339             # if stemmer was on, we cannot rely on the regex,
340             # but we assume that number of uniq terms must match query
341              
342 4 50 66     16 if ( $n_terms == $query_has_phrase
343             && $n_terms > $span{unique} )
344             {
345              
346 0 0       0 $debug
347             and warn
348             "treat_phrases_as_singles=FALSE and '$span{str}' "
349             . "expected $n_terms unique terms, got $span{unique}\n";
350 0         0 next START_END;
351             }
352              
353             }
354             }
355              
356             # just for debug
357 20 50       53 if ($debug) {
358 0         0 my $i = 0;
359             $span{str_w_pos} = join(
360             '',
361             map {
362 0         0 $strings[ $i++ ]
363             . ( exists $heatmap{$_} ? $OPEN : '[' )
364             . $_
365 0 0       0 . ( exists $heatmap{$_} ? $CLOSE : ']' )
    0          
366             } @cluster_pos
367             );
368             }
369              
370 20         130 push @spans, \%span;
371              
372             }
373              
374 13         50 $self->{spans} = $self->_sort_spans( \@spans );
375 13         30 $self->{heatmap} = \%heatmap;
376              
377 13         195 return $self;
378             }
379              
380             sub _sort_spans {
381             return [
382              
383             # sort by unique,
384             # then by proximity
385             # then by heat
386             # then by pos
387              
388             sort {
389             $b->{unique} <=> $a->{unique}
390             || $b->{proximate} <=> $a->{proximate}
391             || $b->{heat} <=> $a->{heat}
392 51 50 66     267 || $a->{pos}->[0] <=> $b->{pos}->[0]
      100        
393 30     30   49 } @{ $_[1] }
  30         126  
394              
395             ];
396             }
397              
398             sub _no_sentences {
399 17     17   31 my ( $self, $tokens, $window ) = @_;
400 17         52 my $lhs_window = int( $window / 2 );
401 17   50     299 my $debug = $self->debug || 0;
402              
403 17         154 my $num_tokens = $tokens->len;
404 17         46 my $tokens_arr = $tokens->as_array;
405 17         29 my %heatmap = ();
406 17         50 my $token_list_heat = $tokens->get_heat;
407              
408             # this regex is a sanity check for phrases. we replace the \ with a
409             # more promiscuous check because the single space is too naive
410             # for real text (e.g. st. john's)
411 17         30 my $qre = $self->{_qre};
412 17         21 my @phrases = @{ $self->{_query}->phrases };
  17         65  
413 17         64 my $n_terms = $self->{_query}->num_terms;
414 17         70 my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
415              
416 17 50       43 if ($debug) {
417 0         0 warn "token_list_heat: " . dump($token_list_heat);
418 0         0 warn "n_terms: $n_terms";
419 0         0 warn "phrases: " . dump( \@phrases );
420 0         0 warn "query_has_phrase: $query_has_phrase";
421             }
422              
423             # build heatmap
424 17         39 for (@$token_list_heat) {
425 77         139 my $token = $tokens->get_token($_);
426 77         247 $heatmap{ $token->pos } = $token->is_hot;
427             }
428              
429             # make clusters
430              
431             # $proximity == (1/4 of $window)+1 is somewhat arbitrary,
432             # but since we want to err in having too much context,
433             # we aim high. Worst case scenario is where there are
434             # multiple hot spots in a cluster and each is a full
435             # $proximity length apart, which will grow the
436             # eventual span far beyond $window size. We rely
437             # on max_chars in Snipper to catch that worst case.
438 17         45 my $proximity = int( $lhs_window / 2 ) + 1;
439 17         76 my @positions = sort { $a <=> $b } keys %heatmap;
  151         142  
440 17         38 my @clusters = ( [] );
441 17         21 my $i = 0;
442 17         30 for my $pos (@positions) {
443              
444             # if we have advanced past the first position
445             # and the previous position is not "close" to this one,
446             # start a new cluster
447 77 100 100     214 if ( $i && ( $pos - $positions[ $i - 1 ] ) > $proximity ) {
448 33         46 push( @clusters, [$pos] );
449             }
450             else {
451 44         33 push( @{ $clusters[-1] }, $pos );
  44         60  
452             }
453 77         72 $i++;
454             }
455              
456             $debug
457 17 50       36 and warn "proximity: $proximity clusters: " . dump \@clusters;
458              
459             # create spans from each cluster, each with a weight.
460             # we do the initial sort so that clusters that overlap
461             # other clusters via get_window() are weeded out via %seen_pos.
462 17         26 my @spans;
463             my %seen_pos;
464             CLUSTER:
465 17         32 for my $cluster (
466             sort {
467             scalar(@$b) <=> scalar(@$a)
468 61 50 66     206 || $heatmap{ $b->[0] } <=> $heatmap{ $a->[0] }
469             || $a->[0] <=> $b->[0]
470             } @clusters
471             )
472             {
473              
474             # get full window, ignoring positions we've already seen.
475 50         45 my $heat = 0;
476 50         36 my %span;
477             my @cluster_tokens;
478 50         59 POS: for my $pos (@$cluster) {
479 77         198 my ( $start, $end ) = $tokens->get_window( $pos, $window );
480 77         122 POS_TWO: for my $pos2 ( $start .. $end ) {
481 3357 100       5728 next if $seen_pos{$pos2}++;
482 1513 100       1431 $heat += ( exists $heatmap{$pos2} ? $heatmap{$pos2} : 0 );
483 1513         1851 push( @cluster_tokens, $tokens->get_token($pos2) );
484             }
485             }
486              
487             # we may have skipped a $seen_pos from the $slice above
488             # so make sure we still start/end on a match
489 50   66     226 while ( @cluster_tokens && !$cluster_tokens[0]->is_match ) {
490 11         49 shift @cluster_tokens;
491             }
492 50   66     2075 while ( @cluster_tokens && !$cluster_tokens[-1]->is_match ) {
493 6         25 pop @cluster_tokens;
494             }
495              
496 50 50       79 next CLUSTER unless @cluster_tokens;
497              
498             # sanity: make sure we still have something hot
499 50         43 my $has_hot = 0;
500 50         38 my @cluster_pos;
501             my @strings;
502 50         62 for (@cluster_tokens) {
503 1496         1378 my $pos = $_->pos;
504 1496 100       1711 $has_hot++ if exists $heatmap{$pos};
505 1496         1715 push @strings, $_->str;
506 1496         1366 push @cluster_pos, $pos;
507             }
508 50 100       111 next CLUSTER unless $has_hot;
509              
510 39         52 $span{cluster} = $cluster;
511 39         51 $span{heat} = $heat;
512 39         44 $span{pos} = \@cluster_pos;
513 39         48 $span{tokens} = \@cluster_tokens;
514 39         125 $span{str} = join( '', @strings );
515              
516             # spans with more *unique* hot tokens in a single span rank higher
517             # spans with more *proximate* hot tokens in a single span rank higher
518 39         53 my %uniq = ();
519 39         34 my $i = 0;
520 39         30 my $num_proximate = 1; # one for the single hot token
521 39         46 for (@cluster_pos) {
522 1341 100       1454 if ( exists $heatmap{$_} ) {
523 77         154 $uniq{ lc $strings[$i] } += $heatmap{$_};
524 77 100 100     251 if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
525 23         22 $num_proximate++;
526             }
527             }
528 1341         849 $i++;
529             }
530 39         55 $span{unique} = scalar keys %uniq;
531 39         45 $span{proximate} = $num_proximate;
532              
533             # no false phrase matches if !_treat_phrases_as_singles
534             # stemmer check because regex will likely fail when stemmer is on
535 39 100 66     98 if ( $query_has_phrase
536             and !$self->{_treat_phrases_as_singles} )
537             {
538 3 100       11 if ( !$self->{_stemmer} ) {
539              
540             #warn "_treat_phrases_as_singles NOT true";
541 1 50       37 if ( $span{str} !~ m/$qre/ ) {
542 1 50       3 $debug
543             and warn
544             "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
545 1         6 next CLUSTER;
546             }
547             }
548             else {
549              
550             # stemmer used, so check unique term count against n_terms
551 2 50 66     13 if ( $n_terms == $query_has_phrase
552             && $n_terms > $span{unique} )
553             {
554 0 0       0 $debug
555             and warn
556             "treat_phrases_as_singles=FALSE and '$span{str}' "
557             . "expected $n_terms but got $span{unique}\n";
558 0         0 next CLUSTER;
559             }
560              
561             }
562             }
563              
564             # just for debug
565 38 50       61 if ($debug) {
566 0         0 my $i = 0;
567             $span{str_w_pos} = join(
568             '',
569             map {
570 0         0 $strings[ $i++ ]
571             . ( exists $heatmap{$_} ? $OPEN : '[' )
572             . $_
573 0 0       0 . ( exists $heatmap{$_} ? $CLOSE : ']' )
    0          
574             } @cluster_pos
575             );
576             }
577              
578 38         137 push @spans, \%span;
579              
580             }
581              
582 17         1288 $self->{spans} = $self->_sort_spans( \@spans );
583 17         33 $self->{heatmap} = \%heatmap;
584              
585 17         161 return $self;
586             }
587              
588             =head2 has_spans
589              
590             Returns the number of spans found.
591              
592             =cut
593              
594             sub has_spans {
595 30     30 1 43 return scalar @{ $_[0]->{spans} };
  30         111  
596             }
597              
598             1;
599              
600             __END__