File Coverage

blib/lib/Search/Tools/HeatMap.pm

Criterion	Covered	Total	%
statement	192	219	87.6
branch	64	104	61.5
condition	36	47	76.6
subroutine	10	10	100.0
pod	2	2	100.0
total	304	382	79.5

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::HeatMap;
2	17			17		61	use Moo;
	17					57
	17					76
3	17			17		3115	use Carp;
	17					20
	17					790
4	17			17		62	use Data::Dump qw( dump );
	17					17
	17					657
5							extends 'Search::Tools::Object';
6
7	17			17		58	use namespace::autoclean;
	17					19
	17					93
8
9							our $VERSION = '1.004';
10
11							# debugging only
12							my $OPEN = '[';
13							my $CLOSE = ']';
14							eval { require Term::ANSIColor; };
15							if ( !$@ ) {
16							$OPEN .= Term::ANSIColor::color('bold red');
17							$CLOSE = Term::ANSIColor::color('reset') . $CLOSE;
18							}
19
20							my @attrs = qw( window_size
21							tokens
22							spans
23							as_sentences
24							_treat_phrases_as_singles
25							_qre
26							_query
27							_stemmer
28							);
29
30							for my $attr (@attrs) {
31							has $attr => ( is => 'rw' );
32							}
33
34							=head1 NAME
35
36							Search::Tools::HeatMap - locate the best matches in a snippet extract
37
38							=head1 SYNOPSIS
39
40							use Search::Tools::Tokenizer;
41							use Search::Tools::HeatMap;
42
43							my $tokens = $self->tokenizer->tokenize( $my_string, qr/^(interesting)$/ );
44							my $heatmap = Search::Tools::HeatMap->new(
45							tokens => $tokens,
46							window_size => 20, # default
47							as_sentences => 0, # default
48							);
49
50							if ( $heatmap->has_spans ) {
51
52							my $tokens_arr = $tokens->as_array;
53
54							# stringify positions
55							my @snips;
56							for my $span ( @{ $heatmap->spans } ) {
57							push( @snips, $span->{str} );
58							}
59							my $occur_index = $self->occur - 1;
60							if ( $#snips > $occur_index ) {
61							@snips = @snips[ 0 .. $occur_index ];
62							}
63							printf("%s\n", join( ' ... ', @snips ));
64
65							}
66
67							=head1 DESCRIPTION
68
69							Search::Tools::HeatMap implements a simple algorithm for locating
70							the densest clusters of unique, hot terms in a TokenList.
71
72							HeatMap is used internally by Snipper but documented here in case
73							someone wants to abuse and/or improve it.
74
75							=head1 METHODS
76
77							=head2 new( tokens => I )
78
79							Create a new HeatMap. The I object may be either a
80							Search::Tools::TokenList or Search::Tools::TokenListPP object.
81
82							=head2 BUILD
83
84							Builds the HeatMap object. Called internally by new().
85
86							=cut
87
88							sub BUILD {
89	30			30	1	485	my $self = shift;
90	30					96	$self->_build;
91	30					741	return $self;
92							}
93
94							=head2 window_size
95
96							The max width of a span. Defaults to 20 tokens, including the
97							matches.
98
99							Set this in new(). Access it later if you need to, but the spans
100							will have already been created by new().
101
102							=head2 as_sentences
103
104							Try to match clusters at sentence boundaries. Default is false.
105
106							Set this in new().
107
108							=head2 spans
109
110							Returns an array ref of matching clusters. Each span in the array
111							is a hash ref with the following keys:
112
113							=over
114
115							=item cluster
116
117							=item pos
118
119							=item heat
120
121							=item str
122
123							=item str_w_pos
124
125							This item is available only if debug() is true.
126
127							=item unique
128
129							=back
130
131							=cut
132
133							# TODO this is mostly integer math and might be much
134							# faster if rewritten in XS once the algorithm is "final".
135							sub _build {
136	30			30		39	my $self = shift;
137	30	50				544	my $tokens = $self->tokens or croak "tokens required";
138	30		50			126	my $window = $self->window_size \|\| 20;
139	30		100			152	my $as_sentences = $self->as_sentences \|\| 0;
140	30	100				131	return $as_sentences
141							? $self->_as_sentences( $tokens, $window )
142							: $self->_no_sentences( $tokens, $window );
143							}
144
145							# currently _as_sentences() is mostly identical to _no_sentences()
146							# with slightly fewer gymnastics.
147							# Since we already know via sentence_starts where our boundaries are,
148							# we do not have to call $tokens->get_window().
149							# Who knows how we might improve the sentence algorithm in future,
150							# so already having it in its own method seems like a win.
151							sub _as_sentences {
152	13			13		24	my ( $self, $tokens, $window ) = @_;
153	13		50			265	my $debug = $self->debug \|\| 0;
154	13					121	my $sentence_length = $window * 2;
155
156							# build heatmap with sentence starts
157	13					51	my $num_tokens = $tokens->len;
158	13					41	my $tokens_arr = $tokens->as_array;
159	13					26	my %heatmap = ();
160	13					44	my $token_list_heat = $tokens->get_heat;
161	13					43	my $heat_sentence_starts = $tokens->get_sentence_starts;
162
163							# this regex is a sanity check for phrases. we replace the \ with a
164							# more promiscuous check because the single space is too naive
165							# for real text (e.g. st. john's)
166	13					24	my $qre = $self->{_qre};
167	13					20	my @phrases = @{ $self->{_query}->phrases };
	13					62
168	13					53	my $n_terms = $self->{_query}->num_terms;
169	13					80	my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
170
171	13	50				41	if ($debug) {
172	0					0	warn "heat_sentence_starts: " . dump($heat_sentence_starts);
173	0					0	warn "token_list_heat: " . dump($token_list_heat);
174	0					0	warn "n_terms: $n_terms";
175	0					0	warn "phrases: " . dump( \@phrases );
176	0					0	warn "query_has_phrase: $query_has_phrase";
177							}
178
179							# find the "sentence" that each hot token appears in.
180	13					14	my @starts_ends;
181	13					15	my $i = 0;
182	13					23	my %heat_sentence_ends = (); # cache
183	13					29	for (@$token_list_heat) {
184	42					111	my $token = $tokens->get_token($_);
185	42					100	my $token_pos = $token->pos;
186	42					54	my $start = $heat_sentence_starts->[ $i++ ];
187	42					138	$heatmap{$token_pos} = $token->is_hot;
188
189							# a little optimization for when we've got
190							# multiple hot tokens in the same sentence
191	42	100				84	if ( exists $heat_sentence_ends{$start} ) {
192	22	50				40	$debug
193							and warn "found cached end $heat_sentence_ends{$start} "
194							. "for start $start token $token_pos\n";
195
196							push( @starts_ends,
197	22					47	[ $start, $token_pos, $heat_sentence_ends{$start} ] );
198	22					35	next;
199							}
200
201							# find the outermost limit of where this sentence might end
202	20					19	my $max_end;
203
204							# is there a "next" start?
205	20	100	100			98	if ( defined $heat_sentence_starts->[$i]
206							and $heat_sentence_starts->[$i] != $start )
207							{
208
209							# this token is unique in this non-final sentence
210	3					8	$max_end = $heat_sentence_starts->[$i] - 1;
211							}
212							else {
213
214							# this is the final sentence
215	17					30	$max_end = $num_tokens - 1;
216							}
217	20					29	my $end = $start;
218
219							# find the nearest sentence end to the start
220	20					47	while ( $end < $max_end ) {
221	1990					2299	my $tok = $tokens->get_token( $end++ );
222	1990	50				2868	if ( !$tok ) {
223	0	0				0	$debug and warn "No token at end=$end";
224	0					0	last;
225							}
226	1990	100				4422	if ( $tok->is_sentence_end ) {
227	10					17	$end--; # move back one position
228	10	50				25	if ($debug) {
229	0					0	warn "tok $_ is_sentence_end end=$end";
230	0					0	$tok->dump;
231							}
232	10					23	last;
233							}
234							}
235
236							# back up if we've exceeded the 0-based tokens array.
237	20	50				50	$end = $num_tokens if $end > $num_tokens;
238
239	20	50				45	$debug
240							and warn "start=$start max_end=$max_end "
241							. "sentence_length=$sentence_length end=$end "
242							. "token_pos=$token_pos\n";
243
244							# if we didn't yet set the actual hot token,
245							# include everything up to it.
246	20	50				45	if ( $end < $token_pos ) {
247	0	0				0	$debug
248							and warn "resetting end=$token_pos\n";
249
250	0					0	$end = $token_pos;
251							}
252	20					57	push( @starts_ends, [ $start, $token_pos, $end ] );
253
254							# cache
255	20					61	$heat_sentence_ends{$start} = $end;
256							}
257
258	13	50				36	$debug and warn "starts_ends: " . dump( \@starts_ends );
259
260	13					19	my @spans;
261							my %seen_pos;
262							START_END:
263	13					44	for my $start_end (@starts_ends) {
264
265							# get full window, ignoring positions we've already seen.
266	42					42	my $heat = 0;
267	42					37	my %span;
268							my @cluster_tokens;
269
270	42					85	my ( $start, $hot_pos, $end ) = @$start_end;
271	42					89	POS: for my $pos ( $start .. $end ) {
272	5075	100				9889	next POS if $seen_pos{$pos}++;
273	2000	100				2064	$heat += ( exists $heatmap{$pos} ? $heatmap{$pos} : 0 );
274	2000					2426	push( @cluster_tokens, $tokens->get_token($pos) );
275							}
276
277							# if we had already seen_pos all positions.
278	42	100				109	next START_END unless @cluster_tokens;
279
280							# sanity: make sure we still have something hot
281	20					29	my $has_hot = 0;
282	20					24	my @cluster_pos;
283							my @strings;
284	20					34	TOK: for (@cluster_tokens) {
285	2000					1855	my $pos = $_->pos;
286	2000	100				2246	$has_hot++ if exists $heatmap{$pos};
287	2000					2235	push @strings, $_->str;
288	2000					1799	push @cluster_pos, $pos;
289							}
290	20	50				43	next START_END unless $has_hot;
291
292							# the final string is a sentence end,
293							# but we only want the first char in it,
294							# and not any whitespace, stray punctuation or other
295							# non-word noise.
296	20					122	$strings[$#strings] =~ s/^([\.\?\!]).*/$1/;
297
298	20					48	$span{start_end} = $start_end;
299	20					32	$span{heat} = $heat;
300	20					33	$span{pos} = \@cluster_pos;
301	20					30	$span{tokens} = \@cluster_tokens;
302	20					130	$span{str} = join( '', @strings );
303
304							# spans with more unique hot tokens in a single span rank higher
305							# spans with more proximate hot tokens in a single span rank higher
306	20					40	my %uniq = ();
307	20					19	my $i = 0;
308	20					23	my $num_proximate = 1; # one for the single hot token
309	20					36	for (@cluster_pos) {
310	2000	100				2124	if ( exists $heatmap{$_} ) {
311	42					107	$uniq{ lc $strings[$i] } += $heatmap{$_};
312	42	100	100			171	if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
313	10					11	$num_proximate++;
314							}
315							}
316	2000					1282	$i++;
317							}
318	20					45	$span{unique} = scalar keys %uniq;
319	20					30	$span{proximate} = $num_proximate;
320
321							# no false phrase matches if !_treat_phrases_as_singles
322							# stemmer check because regex will likely fail
323							# when stemmer is on
324	20	100	66			80	if ( $query_has_phrase
325							and !$self->{_treat_phrases_as_singles} )
326							{
327	7	100				16	if ( !$self->{_stemmer} ) {
328
329							#warn "_treat_phrases_as_singles NOT true";
330	3	50				106	if ( $span{str} !~ m/$qre/ ) {
331	0	0				0	$debug
332							and warn
333							"treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
334	0					0	next START_END;
335							}
336							}
337							else {
338
339							# if stemmer was on, we cannot rely on the regex,
340							# but we assume that number of uniq terms must match query
341
342	4	50	66			16	if ( $n_terms == $query_has_phrase
343							&& $n_terms > $span{unique} )
344							{
345
346	0	0				0	$debug
347							and warn
348							"treat_phrases_as_singles=FALSE and '$span{str}' "
349							. "expected $n_terms unique terms, got $span{unique}\n";
350	0					0	next START_END;
351							}
352
353							}
354							}
355
356							# just for debug
357	20	50				53	if ($debug) {
358	0					0	my $i = 0;
359							$span{str_w_pos} = join(
360							'',
361							map {
362	0					0	$strings[ $i++ ]
363							. ( exists $heatmap{$_} ? $OPEN : '[' )
364							. $_
365	0	0				0	. ( exists $heatmap{$_} ? $CLOSE : ']' )
		0
366							} @cluster_pos
367							);
368							}
369
370	20					130	push @spans, \%span;
371
372							}
373
374	13					50	$self->{spans} = $self->_sort_spans( \@spans );
375	13					30	$self->{heatmap} = \%heatmap;
376
377	13					195	return $self;
378							}
379
380							sub _sort_spans {
381							return [
382
383							# sort by unique,
384							# then by proximity
385							# then by heat
386							# then by pos
387
388							sort {
389							$b->{unique} <=> $a->{unique}
390							\|\| $b->{proximate} <=> $a->{proximate}
391							\|\| $b->{heat} <=> $a->{heat}
392	51	50	66			267	\|\| $a->{pos}->[0] <=> $b->{pos}->[0]
			100
393	30			30		49	} @{ $_[1] }
	30					126
394
395							];
396							}
397
398							sub _no_sentences {
399	17			17		31	my ( $self, $tokens, $window ) = @_;
400	17					52	my $lhs_window = int( $window / 2 );
401	17		50			299	my $debug = $self->debug \|\| 0;
402
403	17					154	my $num_tokens = $tokens->len;
404	17					46	my $tokens_arr = $tokens->as_array;
405	17					29	my %heatmap = ();
406	17					50	my $token_list_heat = $tokens->get_heat;
407
408							# this regex is a sanity check for phrases. we replace the \ with a
409							# more promiscuous check because the single space is too naive
410							# for real text (e.g. st. john's)
411	17					30	my $qre = $self->{_qre};
412	17					21	my @phrases = @{ $self->{_query}->phrases };
	17					65
413	17					64	my $n_terms = $self->{_query}->num_terms;
414	17					70	my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
415
416	17	50				43	if ($debug) {
417	0					0	warn "token_list_heat: " . dump($token_list_heat);
418	0					0	warn "n_terms: $n_terms";
419	0					0	warn "phrases: " . dump( \@phrases );
420	0					0	warn "query_has_phrase: $query_has_phrase";
421							}
422
423							# build heatmap
424	17					39	for (@$token_list_heat) {
425	77					139	my $token = $tokens->get_token($_);
426	77					247	$heatmap{ $token->pos } = $token->is_hot;
427							}
428
429							# make clusters
430
431							# $proximity == (1/4 of $window)+1 is somewhat arbitrary,
432							# but since we want to err in having too much context,
433							# we aim high. Worst case scenario is where there are
434							# multiple hot spots in a cluster and each is a full
435							# $proximity length apart, which will grow the
436							# eventual span far beyond $window size. We rely
437							# on max_chars in Snipper to catch that worst case.
438	17					45	my $proximity = int( $lhs_window / 2 ) + 1;
439	17					76	my @positions = sort { $a <=> $b } keys %heatmap;
	151					142
440	17					38	my @clusters = ( [] );
441	17					21	my $i = 0;
442	17					30	for my $pos (@positions) {
443
444							# if we have advanced past the first position
445							# and the previous position is not "close" to this one,
446							# start a new cluster
447	77	100	100			214	if ( $i && ( $pos - $positions[ $i - 1 ] ) > $proximity ) {
448	33					46	push( @clusters, [$pos] );
449							}
450							else {
451	44					33	push( @{ $clusters[-1] }, $pos );
	44					60
452							}
453	77					72	$i++;
454							}
455
456							$debug
457	17	50				36	and warn "proximity: $proximity clusters: " . dump \@clusters;
458
459							# create spans from each cluster, each with a weight.
460							# we do the initial sort so that clusters that overlap
461							# other clusters via get_window() are weeded out via %seen_pos.
462	17					26	my @spans;
463							my %seen_pos;
464							CLUSTER:
465	17					32	for my $cluster (
466							sort {
467							scalar(@$b) <=> scalar(@$a)
468	61	50	66			206	\|\| $heatmap{ $b->[0] } <=> $heatmap{ $a->[0] }
469							\|\| $a->[0] <=> $b->[0]
470							} @clusters
471							)
472							{
473
474							# get full window, ignoring positions we've already seen.
475	50					45	my $heat = 0;
476	50					36	my %span;
477							my @cluster_tokens;
478	50					59	POS: for my $pos (@$cluster) {
479	77					198	my ( $start, $end ) = $tokens->get_window( $pos, $window );
480	77					122	POS_TWO: for my $pos2 ( $start .. $end ) {
481	3357	100				5728	next if $seen_pos{$pos2}++;
482	1513	100				1431	$heat += ( exists $heatmap{$pos2} ? $heatmap{$pos2} : 0 );
483	1513					1851	push( @cluster_tokens, $tokens->get_token($pos2) );
484							}
485							}
486
487							# we may have skipped a $seen_pos from the $slice above
488							# so make sure we still start/end on a match
489	50		66			226	while ( @cluster_tokens && !$cluster_tokens[0]->is_match ) {
490	11					49	shift @cluster_tokens;
491							}
492	50		66			2075	while ( @cluster_tokens && !$cluster_tokens[-1]->is_match ) {
493	6					25	pop @cluster_tokens;
494							}
495
496	50	50				79	next CLUSTER unless @cluster_tokens;
497
498							# sanity: make sure we still have something hot
499	50					43	my $has_hot = 0;
500	50					38	my @cluster_pos;
501							my @strings;
502	50					62	for (@cluster_tokens) {
503	1496					1378	my $pos = $_->pos;
504	1496	100				1711	$has_hot++ if exists $heatmap{$pos};
505	1496					1715	push @strings, $_->str;
506	1496					1366	push @cluster_pos, $pos;
507							}
508	50	100				111	next CLUSTER unless $has_hot;
509
510	39					52	$span{cluster} = $cluster;
511	39					51	$span{heat} = $heat;
512	39					44	$span{pos} = \@cluster_pos;
513	39					48	$span{tokens} = \@cluster_tokens;
514	39					125	$span{str} = join( '', @strings );
515
516							# spans with more unique hot tokens in a single span rank higher
517							# spans with more proximate hot tokens in a single span rank higher
518	39					53	my %uniq = ();
519	39					34	my $i = 0;
520	39					30	my $num_proximate = 1; # one for the single hot token
521	39					46	for (@cluster_pos) {
522	1341	100				1454	if ( exists $heatmap{$_} ) {
523	77					154	$uniq{ lc $strings[$i] } += $heatmap{$_};
524	77	100	100			251	if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
525	23					22	$num_proximate++;
526							}
527							}
528	1341					849	$i++;
529							}
530	39					55	$span{unique} = scalar keys %uniq;
531	39					45	$span{proximate} = $num_proximate;
532
533							# no false phrase matches if !_treat_phrases_as_singles
534							# stemmer check because regex will likely fail when stemmer is on
535	39	100	66			98	if ( $query_has_phrase
536							and !$self->{_treat_phrases_as_singles} )
537							{
538	3	100				11	if ( !$self->{_stemmer} ) {
539
540							#warn "_treat_phrases_as_singles NOT true";
541	1	50				37	if ( $span{str} !~ m/$qre/ ) {
542	1	50				3	$debug
543							and warn
544							"treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
545	1					6	next CLUSTER;
546							}
547							}
548							else {
549
550							# stemmer used, so check unique term count against n_terms
551	2	50	66			13	if ( $n_terms == $query_has_phrase
552							&& $n_terms > $span{unique} )
553							{
554	0	0				0	$debug
555							and warn
556							"treat_phrases_as_singles=FALSE and '$span{str}' "
557							. "expected $n_terms but got $span{unique}\n";
558	0					0	next CLUSTER;
559							}
560
561							}
562							}
563
564							# just for debug
565	38	50				61	if ($debug) {
566	0					0	my $i = 0;
567							$span{str_w_pos} = join(
568							'',
569							map {
570	0					0	$strings[ $i++ ]
571							. ( exists $heatmap{$_} ? $OPEN : '[' )
572							. $_
573	0	0				0	. ( exists $heatmap{$_} ? $CLOSE : ']' )
		0
574							} @cluster_pos
575							);
576							}
577
578	38					137	push @spans, \%span;
579
580							}
581
582	17					1288	$self->{spans} = $self->_sort_spans( \@spans );
583	17					33	$self->{heatmap} = \%heatmap;
584
585	17					161	return $self;
586							}
587
588							=head2 has_spans
589
590							Returns the number of spans found.
591
592							=cut
593
594							sub has_spans {
595	30			30	1	43	return scalar @{ $_[0]->{spans} };
	30					111
596							}
597
598							1;
599
600							__END__