File Coverage

blib/lib/Search/Tools/HeatMap.pm
Criterion Covered Total %
statement 192 219 87.6
branch 64 104 61.5
condition 36 47 76.6
subroutine 10 10 100.0
pod 2 2 100.0
total 304 382 79.5


line stmt bran cond sub pod time code
1             package Search::Tools::HeatMap;
2 16     16   124 use Moo;
  16         37  
  16         107  
3 16     16   5598 use Carp;
  16         46  
  16         1239  
4 16     16   116 use Data::Dump qw( dump );
  16         38  
  16         1297  
5             extends 'Search::Tools::Object';
6              
7 16     16   123 use namespace::autoclean;
  16         215  
  16         164  
8              
9             our $VERSION = '1.007';
10              
11             # debugging only
12             my $OPEN = '[';
13             my $CLOSE = ']';
14             eval { require Term::ANSIColor; };
15             if ( !$@ ) {
16             $OPEN .= Term::ANSIColor::color('bold red');
17             $CLOSE = Term::ANSIColor::color('reset') . $CLOSE;
18             }
19              
20             my @attrs = qw( window_size
21             tokens
22             spans
23             as_sentences
24             _treat_phrases_as_singles
25             _qre
26             _query
27             _stemmer
28             );
29              
30             for my $attr (@attrs) {
31             has $attr => ( is => 'rw' );
32             }
33              
34             =head1 NAME
35              
36             Search::Tools::HeatMap - locate the best matches in a snippet extract
37              
38             =head1 SYNOPSIS
39              
40             use Search::Tools::Tokenizer;
41             use Search::Tools::HeatMap;
42            
43             my $tokens = $self->tokenizer->tokenize( $my_string, qr/^(interesting)$/ );
44             my $heatmap = Search::Tools::HeatMap->new(
45             tokens => $tokens,
46             window_size => 20, # default
47             as_sentences => 0, # default
48             );
49              
50             if ( $heatmap->has_spans ) {
51            
52             my $tokens_arr = $tokens->as_array;
53              
54             # stringify positions
55             my @snips;
56             for my $span ( @{ $heatmap->spans } ) {
57             push( @snips, $span->{str} );
58             }
59             my $occur_index = $self->occur - 1;
60             if ( $#snips > $occur_index ) {
61             @snips = @snips[ 0 .. $occur_index ];
62             }
63             printf("%s\n", join( ' ... ', @snips ));
64            
65             }
66              
67             =head1 DESCRIPTION
68              
69             Search::Tools::HeatMap implements a simple algorithm for locating
70             the densest clusters of unique, hot terms in a TokenList.
71              
72             HeatMap is used internally by Snipper but documented here in case
73             someone wants to abuse and/or improve it.
74              
75             =head1 METHODS
76              
77             =head2 new( tokens => I )
78              
79             Create a new HeatMap. The I object may be either a
80             Search::Tools::TokenList or Search::Tools::TokenListPP object.
81              
82             =head2 BUILD
83              
84             Builds the HeatMap object. Called internally by new().
85              
86             =cut
87              
88             sub BUILD {
89 30     30 1 503 my $self = shift;
90 30         146 $self->_build;
91 30         207 return $self;
92             }
93              
94             =head2 window_size
95              
96             The max width of a span. Defaults to 20 tokens, including the
97             matches.
98              
99             Set this in new(). Access it later if you need to, but the spans
100             will have already been created by new().
101              
102             =head2 as_sentences
103              
104             Try to match clusters at sentence boundaries. Default is false.
105              
106             Set this in new().
107              
108             =head2 spans
109              
110             Returns an array ref of matching clusters. Each span in the array
111             is a hash ref with the following keys:
112              
113             =over
114              
115             =item cluster
116              
117             =item pos
118              
119             =item heat
120              
121             =item str
122              
123             =item str_w_pos
124              
125             This item is available only if debug() is true.
126              
127             =item unique
128              
129             =back
130              
131             =cut
132              
133             # TODO this is mostly integer math and might be much
134             # faster if rewritten in XS once the algorithm is "final".
135             sub _build {
136 30     30   58 my $self = shift;
137 30 50       666 my $tokens = $self->tokens or croak "tokens required";
138 30   50     204 my $window = $self->window_size || 20;
139 30   100     207 my $as_sentences = $self->as_sentences || 0;
140 30 100       179 return $as_sentences
141             ? $self->_as_sentences( $tokens, $window )
142             : $self->_no_sentences( $tokens, $window );
143             }
144              
145             # currently _as_sentences() is mostly identical to _no_sentences()
146             # with slightly fewer gymnastics.
147             # Since we already know via sentence_starts where our boundaries are,
148             # we do not have to call $tokens->get_window().
149             # Who knows how we might improve the sentence algorithm in future,
150             # so already having it in its own method seems like a win.
151             sub _as_sentences {
152 13     13   42 my ( $self, $tokens, $window ) = @_;
153 13   50     292 my $debug = $self->debug || 0;
154 13         145 my $sentence_length = $window * 2;
155              
156             # build heatmap with sentence starts
157 13         65 my $num_tokens = $tokens->len;
158 13         45 my $tokens_arr = $tokens->as_array;
159 13         108 my %heatmap = ();
160 13         56 my $token_list_heat = $tokens->get_heat;
161 13         48 my $heat_sentence_starts = $tokens->get_sentence_starts;
162              
163             # this regex is a sanity check for phrases. we replace the \ with a
164             # more promiscuous check because the single space is too naive
165             # for real text (e.g. st. john's)
166 13         33 my $qre = $self->{_qre};
167 13         25 my @phrases = @{ $self->{_query}->phrases };
  13         65  
168 13         83 my $n_terms = $self->{_query}->num_terms;
169 13         108 my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
170              
171 13 50       47 if ($debug) {
172 0         0 warn "heat_sentence_starts: " . dump($heat_sentence_starts);
173 0         0 warn "token_list_heat: " . dump($token_list_heat);
174 0         0 warn "n_terms: $n_terms";
175 0         0 warn "phrases: " . dump( \@phrases );
176 0         0 warn "query_has_phrase: $query_has_phrase";
177             }
178              
179             # find the "sentence" that each hot token appears in.
180 13         25 my @starts_ends;
181 13         28 my $i = 0;
182 13         27 my %heat_sentence_ends = (); # cache
183 13         37 for (@$token_list_heat) {
184 42         118 my $token = $tokens->get_token($_);
185 42         118 my $token_pos = $token->pos;
186 42         83 my $start = $heat_sentence_starts->[ $i++ ];
187 42         183 $heatmap{$token_pos} = $token->is_hot;
188              
189             # a little optimization for when we've got
190             # multiple hot tokens in the same sentence
191 42 100       134 if ( exists $heat_sentence_ends{$start} ) {
192 22 50       53 $debug
193             and warn "found cached end $heat_sentence_ends{$start} "
194             . "for start $start token $token_pos\n";
195              
196             push( @starts_ends,
197 22         92 [ $start, $token_pos, $heat_sentence_ends{$start} ] );
198 22         56 next;
199             }
200              
201             # find the outermost limit of where this sentence might end
202 20         32 my $max_end;
203              
204             # is there a "next" start?
205 20 100 100     159 if ( defined $heat_sentence_starts->[$i]
206             and $heat_sentence_starts->[$i] != $start )
207             {
208              
209             # this token is unique in this non-final sentence
210 3         7 $max_end = $heat_sentence_starts->[$i] - 1;
211             }
212             else {
213              
214             # this is the final sentence
215 17         43 $max_end = $num_tokens - 1;
216             }
217 20         41 my $end = $start;
218              
219             # find the nearest sentence end to the start
220 20         65 while ( $end < $max_end ) {
221 1990         4200 my $tok = $tokens->get_token( $end++ );
222 1990 50       4827 if ( !$tok ) {
223 0 0       0 $debug and warn "No token at end=$end";
224 0         0 last;
225             }
226 1990 100       5997 if ( $tok->is_sentence_end ) {
227 10         21 $end--; # move back one position
228 10 50       30 if ($debug) {
229 0         0 warn "tok $_ is_sentence_end end=$end";
230 0         0 $tok->dump;
231             }
232 10         36 last;
233             }
234             }
235              
236             # back up if we've exceeded the 0-based tokens array.
237 20 50       59 $end = $num_tokens if $end > $num_tokens;
238              
239 20 50       88 $debug
240             and warn "start=$start max_end=$max_end "
241             . "sentence_length=$sentence_length end=$end "
242             . "token_pos=$token_pos\n";
243              
244             # if we didn't yet set the actual hot token,
245             # include everything up to it.
246 20 50       55 if ( $end < $token_pos ) {
247 0 0       0 $debug
248             and warn "resetting end=$token_pos\n";
249              
250 0         0 $end = $token_pos;
251             }
252 20         72 push( @starts_ends, [ $start, $token_pos, $end ] );
253              
254             # cache
255 20         83 $heat_sentence_ends{$start} = $end;
256             }
257              
258 13 50       49 $debug and warn "starts_ends: " . dump( \@starts_ends );
259              
260 13         31 my @spans;
261             my %seen_pos;
262             START_END:
263 13         31 for my $start_end (@starts_ends) {
264              
265             # get full window, ignoring positions we've already seen.
266 42         69 my $heat = 0;
267 42         75 my %span;
268             my @cluster_tokens;
269              
270 42         121 my ( $start, $hot_pos, $end ) = @$start_end;
271 42         106 POS: for my $pos ( $start .. $end ) {
272 5075 100       10563 next POS if $seen_pos{$pos}++;
273 2000 100       3275 $heat += ( exists $heatmap{$pos} ? $heatmap{$pos} : 0 );
274 2000         4318 push( @cluster_tokens, $tokens->get_token($pos) );
275             }
276              
277             # if we had already seen_pos all positions.
278 42 100       135 next START_END unless @cluster_tokens;
279              
280             # sanity: make sure we still have something hot
281 20         38 my $has_hot = 0;
282 20         39 my @cluster_pos;
283             my @strings;
284 20         53 TOK: for (@cluster_tokens) {
285 2000         3405 my $pos = $_->pos;
286 2000 100       3633 $has_hot++ if exists $heatmap{$pos};
287 2000         4037 push @strings, $_->str;
288 2000         3854 push @cluster_pos, $pos;
289             }
290 20 50       57 next START_END unless $has_hot;
291              
292             # the final string is a sentence end,
293             # but we only want the first char in it,
294             # and not any whitespace, stray punctuation or other
295             # non-word noise.
296 20         146 $strings[$#strings] =~ s/^([\.\?\!]).*/$1/;
297              
298 20         70 $span{start_end} = $start_end;
299 20         52 $span{heat} = $heat;
300 20         44 $span{pos} = \@cluster_pos;
301 20         58 $span{tokens} = \@cluster_tokens;
302 20         185 $span{str} = join( '', @strings );
303              
304             # spans with more *unique* hot tokens in a single span rank higher
305             # spans with more *proximate* hot tokens in a single span rank higher
306 20         45 my %uniq = ();
307 20         37 my $i = 0;
308 20         39 my $num_proximate = 1; # one for the single hot token
309 20         41 for (@cluster_pos) {
310 2000 100       3320 if ( exists $heatmap{$_} ) {
311 42         171 $uniq{ lc $strings[$i] } += $heatmap{$_};
312 42 100 100     219 if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
313 10         15 $num_proximate++;
314             }
315             }
316 2000         2684 $i++;
317             }
318 20         60 $span{unique} = scalar keys %uniq;
319 20         43 $span{proximate} = $num_proximate;
320              
321             # no false phrase matches if !_treat_phrases_as_singles
322             # stemmer check because regex will likely fail
323             # when stemmer is on
324 20 100 66     93 if ( $query_has_phrase
325             and !$self->{_treat_phrases_as_singles} )
326             {
327 7 100       27 if ( !$self->{_stemmer} ) {
328              
329             #warn "_treat_phrases_as_singles NOT true";
330 3 50       128 if ( $span{str} !~ m/$qre/ ) {
331 0 0       0 $debug
332             and warn
333             "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
334 0         0 next START_END;
335             }
336             }
337             else {
338              
339             # if stemmer was on, we cannot rely on the regex,
340             # but we assume that number of uniq terms must match query
341              
342 4 50 66     18 if ( $n_terms == $query_has_phrase
343             && $n_terms > $span{unique} )
344             {
345              
346 0 0       0 $debug
347             and warn
348             "treat_phrases_as_singles=FALSE and '$span{str}' "
349             . "expected $n_terms unique terms, got $span{unique}\n";
350 0         0 next START_END;
351             }
352              
353             }
354             }
355              
356             # just for debug
357 20 50       76 if ($debug) {
358 0         0 my $i = 0;
359             $span{str_w_pos} = join(
360             '',
361             map {
362 0         0 $strings[ $i++ ]
363             . ( exists $heatmap{$_} ? $OPEN : '[' )
364             . $_
365 0 0       0 . ( exists $heatmap{$_} ? $CLOSE : ']' )
    0          
366             } @cluster_pos
367             );
368             }
369              
370 20         189 push @spans, \%span;
371              
372             }
373              
374 13         59 $self->{spans} = $self->_sort_spans( \@spans );
375 13         35 $self->{heatmap} = \%heatmap;
376              
377 13         265 return $self;
378             }
379              
380             sub _sort_spans {
381             return [
382              
383             # sort by unique,
384             # then by proximity
385             # then by heat
386             # then by pos
387              
388             sort {
389             $b->{unique} <=> $a->{unique}
390             || $b->{proximate} <=> $a->{proximate}
391             || $b->{heat} <=> $a->{heat}
392 51 50 66     362 || $a->{pos}->[0] <=> $b->{pos}->[0]
      100        
393 30     30   80 } @{ $_[1] }
  30         170  
394              
395             ];
396             }
397              
398             sub _no_sentences {
399 17     17   57 my ( $self, $tokens, $window ) = @_;
400 17         81 my $lhs_window = int( $window / 2 );
401 17   50     468 my $debug = $self->debug || 0;
402              
403 17         251 my $num_tokens = $tokens->len;
404 17         73 my $tokens_arr = $tokens->as_array;
405 17         96 my %heatmap = ();
406 17         88 my $token_list_heat = $tokens->get_heat;
407              
408             # this regex is a sanity check for phrases. we replace the \ with a
409             # more promiscuous check because the single space is too naive
410             # for real text (e.g. st. john's)
411 17         59 my $qre = $self->{_qre};
412 17         44 my @phrases = @{ $self->{_query}->phrases };
  17         103  
413 17         100 my $n_terms = $self->{_query}->num_terms;
414 17         125 my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
415              
416 17 50       93 if ($debug) {
417 0         0 warn "token_list_heat: " . dump($token_list_heat);
418 0         0 warn "n_terms: $n_terms";
419 0         0 warn "phrases: " . dump( \@phrases );
420 0         0 warn "query_has_phrase: $query_has_phrase";
421             }
422              
423             # build heatmap
424 17         61 for (@$token_list_heat) {
425 77         203 my $token = $tokens->get_token($_);
426 77         353 $heatmap{ $token->pos } = $token->is_hot;
427             }
428              
429             # make clusters
430              
431             # $proximity == (1/4 of $window)+1 is somewhat arbitrary,
432             # but since we want to err in having too much context,
433             # we aim high. Worst case scenario is where there are
434             # multiple hot spots in a cluster and each is a full
435             # $proximity length apart, which will grow the
436             # eventual span far beyond $window size. We rely
437             # on max_chars in Snipper to catch that worst case.
438 17         79 my $proximity = int( $lhs_window / 2 ) + 1;
439 17         114 my @positions = sort { $a <=> $b } keys %heatmap;
  138         234  
440 17         71 my @clusters = ( [] );
441 17         43 my $i = 0;
442 17         62 for my $pos (@positions) {
443              
444             # if we have advanced past the first position
445             # and the previous position is not "close" to this one,
446             # start a new cluster
447 77 100 100     284 if ( $i && ( $pos - $positions[ $i - 1 ] ) > $proximity ) {
448 33         70 push( @clusters, [$pos] );
449             }
450             else {
451 44         73 push( @{ $clusters[-1] }, $pos );
  44         106  
452             }
453 77         136 $i++;
454             }
455              
456             $debug
457 17 50       68 and warn "proximity: $proximity clusters: " . dump \@clusters;
458              
459             # create spans from each cluster, each with a weight.
460             # we do the initial sort so that clusters that overlap
461             # other clusters via get_window() are weeded out via %seen_pos.
462 17         58 my @spans;
463             my %seen_pos;
464             CLUSTER:
465 17         66 for my $cluster (
466             sort {
467             scalar(@$b) <=> scalar(@$a)
468 61 50 66     286 || $heatmap{ $b->[0] } <=> $heatmap{ $a->[0] }
469             || $a->[0] <=> $b->[0]
470             } @clusters
471             )
472             {
473              
474             # get full window, ignoring positions we've already seen.
475 50         100 my $heat = 0;
476 50         99 my %span;
477             my @cluster_tokens;
478 50         146 POS: for my $pos (@$cluster) {
479 77         318 my ( $start, $end ) = $tokens->get_window( $pos, $window );
480 77         234 POS_TWO: for my $pos2 ( $start .. $end ) {
481 3357 100       7163 next if $seen_pos{$pos2}++;
482 1513 100       2650 $heat += ( exists $heatmap{$pos2} ? $heatmap{$pos2} : 0 );
483 1513         3508 push( @cluster_tokens, $tokens->get_token($pos2) );
484             }
485             }
486              
487             # we may have skipped a $seen_pos from the $slice above
488             # so make sure we still start/end on a match
489 50   66     310 while ( @cluster_tokens && !$cluster_tokens[0]->is_match ) {
490 11         45 shift @cluster_tokens;
491             }
492 50   66     272 while ( @cluster_tokens && !$cluster_tokens[-1]->is_match ) {
493 6         24 pop @cluster_tokens;
494             }
495              
496 50 50       133 next CLUSTER unless @cluster_tokens;
497              
498             # sanity: make sure we still have something hot
499 50         94 my $has_hot = 0;
500 50         91 my @cluster_pos;
501             my @strings;
502 50         125 for (@cluster_tokens) {
503 1496         2773 my $pos = $_->pos;
504 1496 100       2847 $has_hot++ if exists $heatmap{$pos};
505 1496         3210 push @strings, $_->str;
506 1496         3048 push @cluster_pos, $pos;
507             }
508 50 100       172 next CLUSTER unless $has_hot;
509              
510 39         110 $span{cluster} = $cluster;
511 39         103 $span{heat} = $heat;
512 39         102 $span{pos} = \@cluster_pos;
513 39         89 $span{tokens} = \@cluster_tokens;
514 39         239 $span{str} = join( '', @strings );
515              
516             # spans with more *unique* hot tokens in a single span rank higher
517             # spans with more *proximate* hot tokens in a single span rank higher
518 39         94 my %uniq = ();
519 39         83 my $i = 0;
520 39         74 my $num_proximate = 1; # one for the single hot token
521 39         92 for (@cluster_pos) {
522 1341 100       2383 if ( exists $heatmap{$_} ) {
523 77         290 $uniq{ lc $strings[$i] } += $heatmap{$_};
524 77 100 100     337 if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
525 23         32 $num_proximate++;
526             }
527             }
528 1341         1911 $i++;
529             }
530 39         117 $span{unique} = scalar keys %uniq;
531 39         98 $span{proximate} = $num_proximate;
532              
533             # no false phrase matches if !_treat_phrases_as_singles
534             # stemmer check because regex will likely fail when stemmer is on
535 39 100 66     149 if ( $query_has_phrase
536             and !$self->{_treat_phrases_as_singles} )
537             {
538 3 100       14 if ( !$self->{_stemmer} ) {
539              
540             #warn "_treat_phrases_as_singles NOT true";
541 1 50       73 if ( $span{str} !~ m/$qre/ ) {
542 1 50       6 $debug
543             and warn
544             "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
545 1         14 next CLUSTER;
546             }
547             }
548             else {
549              
550             # stemmer used, so check unique term count against n_terms
551 2 50 66     12 if ( $n_terms == $query_has_phrase
552             && $n_terms > $span{unique} )
553             {
554 0 0       0 $debug
555             and warn
556             "treat_phrases_as_singles=FALSE and '$span{str}' "
557             . "expected $n_terms but got $span{unique}\n";
558 0         0 next CLUSTER;
559             }
560              
561             }
562             }
563              
564             # just for debug
565 38 50       110 if ($debug) {
566 0         0 my $i = 0;
567             $span{str_w_pos} = join(
568             '',
569             map {
570 0         0 $strings[ $i++ ]
571             . ( exists $heatmap{$_} ? $OPEN : '[' )
572             . $_
573 0 0       0 . ( exists $heatmap{$_} ? $CLOSE : ']' )
    0          
574             } @cluster_pos
575             );
576             }
577              
578 38         257 push @spans, \%span;
579              
580             }
581              
582 17         169 $self->{spans} = $self->_sort_spans( \@spans );
583 17         68 $self->{heatmap} = \%heatmap;
584              
585 17         278 return $self;
586             }
587              
588             =head2 has_spans
589              
590             Returns the number of spans found.
591              
592             =cut
593              
594             sub has_spans {
595 30     30 1 56 return scalar @{ $_[0]->{spans} };
  30         129  
596             }
597              
598             1;
599              
600             __END__