File Coverage

blib/lib/Search/Tools/HeatMap.pm

Criterion	Covered	Total	%
statement	192	219	87.6
branch	64	104	61.5
condition	36	47	76.6
subroutine	10	10	100.0
pod	2	2	100.0
total	304	382	79.5

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::HeatMap;
2	16			16		124	use Moo;
	16					37
	16					107
3	16			16		5598	use Carp;
	16					46
	16					1239
4	16			16		116	use Data::Dump qw( dump );
	16					38
	16					1297
5							extends 'Search::Tools::Object';
6
7	16			16		123	use namespace::autoclean;
	16					215
	16					164
8
9							our $VERSION = '1.007';
10
11							# debugging only
12							my $OPEN = '[';
13							my $CLOSE = ']';
14							eval { require Term::ANSIColor; };
15							if ( !$@ ) {
16							$OPEN .= Term::ANSIColor::color('bold red');
17							$CLOSE = Term::ANSIColor::color('reset') . $CLOSE;
18							}
19
20							my @attrs = qw( window_size
21							tokens
22							spans
23							as_sentences
24							_treat_phrases_as_singles
25							_qre
26							_query
27							_stemmer
28							);
29
30							for my $attr (@attrs) {
31							has $attr => ( is => 'rw' );
32							}
33
34							=head1 NAME
35
36							Search::Tools::HeatMap - locate the best matches in a snippet extract
37
38							=head1 SYNOPSIS
39
40							use Search::Tools::Tokenizer;
41							use Search::Tools::HeatMap;
42
43							my $tokens = $self->tokenizer->tokenize( $my_string, qr/^(interesting)$/ );
44							my $heatmap = Search::Tools::HeatMap->new(
45							tokens => $tokens,
46							window_size => 20, # default
47							as_sentences => 0, # default
48							);
49
50							if ( $heatmap->has_spans ) {
51
52							my $tokens_arr = $tokens->as_array;
53
54							# stringify positions
55							my @snips;
56							for my $span ( @{ $heatmap->spans } ) {
57							push( @snips, $span->{str} );
58							}
59							my $occur_index = $self->occur - 1;
60							if ( $#snips > $occur_index ) {
61							@snips = @snips[ 0 .. $occur_index ];
62							}
63							printf("%s\n", join( ' ... ', @snips ));
64
65							}
66
67							=head1 DESCRIPTION
68
69							Search::Tools::HeatMap implements a simple algorithm for locating
70							the densest clusters of unique, hot terms in a TokenList.
71
72							HeatMap is used internally by Snipper but documented here in case
73							someone wants to abuse and/or improve it.
74
75							=head1 METHODS
76
77							=head2 new( tokens => I )
78
79							Create a new HeatMap. The I object may be either a
80							Search::Tools::TokenList or Search::Tools::TokenListPP object.
81
82							=head2 BUILD
83
84							Builds the HeatMap object. Called internally by new().
85
86							=cut
87
88							sub BUILD {
89	30			30	1	503	my $self = shift;
90	30					146	$self->_build;
91	30					207	return $self;
92							}
93
94							=head2 window_size
95
96							The max width of a span. Defaults to 20 tokens, including the
97							matches.
98
99							Set this in new(). Access it later if you need to, but the spans
100							will have already been created by new().
101
102							=head2 as_sentences
103
104							Try to match clusters at sentence boundaries. Default is false.
105
106							Set this in new().
107
108							=head2 spans
109
110							Returns an array ref of matching clusters. Each span in the array
111							is a hash ref with the following keys:
112
113							=over
114
115							=item cluster
116
117							=item pos
118
119							=item heat
120
121							=item str
122
123							=item str_w_pos
124
125							This item is available only if debug() is true.
126
127							=item unique
128
129							=back
130
131							=cut
132
133							# TODO this is mostly integer math and might be much
134							# faster if rewritten in XS once the algorithm is "final".
135							sub _build {
136	30			30		58	my $self = shift;
137	30	50				666	my $tokens = $self->tokens or croak "tokens required";
138	30		50			204	my $window = $self->window_size \|\| 20;
139	30		100			207	my $as_sentences = $self->as_sentences \|\| 0;
140	30	100				179	return $as_sentences
141							? $self->_as_sentences( $tokens, $window )
142							: $self->_no_sentences( $tokens, $window );
143							}
144
145							# currently _as_sentences() is mostly identical to _no_sentences()
146							# with slightly fewer gymnastics.
147							# Since we already know via sentence_starts where our boundaries are,
148							# we do not have to call $tokens->get_window().
149							# Who knows how we might improve the sentence algorithm in future,
150							# so already having it in its own method seems like a win.
151							sub _as_sentences {
152	13			13		42	my ( $self, $tokens, $window ) = @_;
153	13		50			292	my $debug = $self->debug \|\| 0;
154	13					145	my $sentence_length = $window * 2;
155
156							# build heatmap with sentence starts
157	13					65	my $num_tokens = $tokens->len;
158	13					45	my $tokens_arr = $tokens->as_array;
159	13					108	my %heatmap = ();
160	13					56	my $token_list_heat = $tokens->get_heat;
161	13					48	my $heat_sentence_starts = $tokens->get_sentence_starts;
162
163							# this regex is a sanity check for phrases. we replace the \ with a
164							# more promiscuous check because the single space is too naive
165							# for real text (e.g. st. john's)
166	13					33	my $qre = $self->{_qre};
167	13					25	my @phrases = @{ $self->{_query}->phrases };
	13					65
168	13					83	my $n_terms = $self->{_query}->num_terms;
169	13					108	my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
170
171	13	50				47	if ($debug) {
172	0					0	warn "heat_sentence_starts: " . dump($heat_sentence_starts);
173	0					0	warn "token_list_heat: " . dump($token_list_heat);
174	0					0	warn "n_terms: $n_terms";
175	0					0	warn "phrases: " . dump( \@phrases );
176	0					0	warn "query_has_phrase: $query_has_phrase";
177							}
178
179							# find the "sentence" that each hot token appears in.
180	13					25	my @starts_ends;
181	13					28	my $i = 0;
182	13					27	my %heat_sentence_ends = (); # cache
183	13					37	for (@$token_list_heat) {
184	42					118	my $token = $tokens->get_token($_);
185	42					118	my $token_pos = $token->pos;
186	42					83	my $start = $heat_sentence_starts->[ $i++ ];
187	42					183	$heatmap{$token_pos} = $token->is_hot;
188
189							# a little optimization for when we've got
190							# multiple hot tokens in the same sentence
191	42	100				134	if ( exists $heat_sentence_ends{$start} ) {
192	22	50				53	$debug
193							and warn "found cached end $heat_sentence_ends{$start} "
194							. "for start $start token $token_pos\n";
195
196							push( @starts_ends,
197	22					92	[ $start, $token_pos, $heat_sentence_ends{$start} ] );
198	22					56	next;
199							}
200
201							# find the outermost limit of where this sentence might end
202	20					32	my $max_end;
203
204							# is there a "next" start?
205	20	100	100			159	if ( defined $heat_sentence_starts->[$i]
206							and $heat_sentence_starts->[$i] != $start )
207							{
208
209							# this token is unique in this non-final sentence
210	3					7	$max_end = $heat_sentence_starts->[$i] - 1;
211							}
212							else {
213
214							# this is the final sentence
215	17					43	$max_end = $num_tokens - 1;
216							}
217	20					41	my $end = $start;
218
219							# find the nearest sentence end to the start
220	20					65	while ( $end < $max_end ) {
221	1990					4200	my $tok = $tokens->get_token( $end++ );
222	1990	50				4827	if ( !$tok ) {
223	0	0				0	$debug and warn "No token at end=$end";
224	0					0	last;
225							}
226	1990	100				5997	if ( $tok->is_sentence_end ) {
227	10					21	$end--; # move back one position
228	10	50				30	if ($debug) {
229	0					0	warn "tok $_ is_sentence_end end=$end";
230	0					0	$tok->dump;
231							}
232	10					36	last;
233							}
234							}
235
236							# back up if we've exceeded the 0-based tokens array.
237	20	50				59	$end = $num_tokens if $end > $num_tokens;
238
239	20	50				88	$debug
240							and warn "start=$start max_end=$max_end "
241							. "sentence_length=$sentence_length end=$end "
242							. "token_pos=$token_pos\n";
243
244							# if we didn't yet set the actual hot token,
245							# include everything up to it.
246	20	50				55	if ( $end < $token_pos ) {
247	0	0				0	$debug
248							and warn "resetting end=$token_pos\n";
249
250	0					0	$end = $token_pos;
251							}
252	20					72	push( @starts_ends, [ $start, $token_pos, $end ] );
253
254							# cache
255	20					83	$heat_sentence_ends{$start} = $end;
256							}
257
258	13	50				49	$debug and warn "starts_ends: " . dump( \@starts_ends );
259
260	13					31	my @spans;
261							my %seen_pos;
262							START_END:
263	13					31	for my $start_end (@starts_ends) {
264
265							# get full window, ignoring positions we've already seen.
266	42					69	my $heat = 0;
267	42					75	my %span;
268							my @cluster_tokens;
269
270	42					121	my ( $start, $hot_pos, $end ) = @$start_end;
271	42					106	POS: for my $pos ( $start .. $end ) {
272	5075	100				10563	next POS if $seen_pos{$pos}++;
273	2000	100				3275	$heat += ( exists $heatmap{$pos} ? $heatmap{$pos} : 0 );
274	2000					4318	push( @cluster_tokens, $tokens->get_token($pos) );
275							}
276
277							# if we had already seen_pos all positions.
278	42	100				135	next START_END unless @cluster_tokens;
279
280							# sanity: make sure we still have something hot
281	20					38	my $has_hot = 0;
282	20					39	my @cluster_pos;
283							my @strings;
284	20					53	TOK: for (@cluster_tokens) {
285	2000					3405	my $pos = $_->pos;
286	2000	100				3633	$has_hot++ if exists $heatmap{$pos};
287	2000					4037	push @strings, $_->str;
288	2000					3854	push @cluster_pos, $pos;
289							}
290	20	50				57	next START_END unless $has_hot;
291
292							# the final string is a sentence end,
293							# but we only want the first char in it,
294							# and not any whitespace, stray punctuation or other
295							# non-word noise.
296	20					146	$strings[$#strings] =~ s/^([\.\?\!]).*/$1/;
297
298	20					70	$span{start_end} = $start_end;
299	20					52	$span{heat} = $heat;
300	20					44	$span{pos} = \@cluster_pos;
301	20					58	$span{tokens} = \@cluster_tokens;
302	20					185	$span{str} = join( '', @strings );
303
304							# spans with more unique hot tokens in a single span rank higher
305							# spans with more proximate hot tokens in a single span rank higher
306	20					45	my %uniq = ();
307	20					37	my $i = 0;
308	20					39	my $num_proximate = 1; # one for the single hot token
309	20					41	for (@cluster_pos) {
310	2000	100				3320	if ( exists $heatmap{$_} ) {
311	42					171	$uniq{ lc $strings[$i] } += $heatmap{$_};
312	42	100	100			219	if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
313	10					15	$num_proximate++;
314							}
315							}
316	2000					2684	$i++;
317							}
318	20					60	$span{unique} = scalar keys %uniq;
319	20					43	$span{proximate} = $num_proximate;
320
321							# no false phrase matches if !_treat_phrases_as_singles
322							# stemmer check because regex will likely fail
323							# when stemmer is on
324	20	100	66			93	if ( $query_has_phrase
325							and !$self->{_treat_phrases_as_singles} )
326							{
327	7	100				27	if ( !$self->{_stemmer} ) {
328
329							#warn "_treat_phrases_as_singles NOT true";
330	3	50				128	if ( $span{str} !~ m/$qre/ ) {
331	0	0				0	$debug
332							and warn
333							"treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
334	0					0	next START_END;
335							}
336							}
337							else {
338
339							# if stemmer was on, we cannot rely on the regex,
340							# but we assume that number of uniq terms must match query
341
342	4	50	66			18	if ( $n_terms == $query_has_phrase
343							&& $n_terms > $span{unique} )
344							{
345
346	0	0				0	$debug
347							and warn
348							"treat_phrases_as_singles=FALSE and '$span{str}' "
349							. "expected $n_terms unique terms, got $span{unique}\n";
350	0					0	next START_END;
351							}
352
353							}
354							}
355
356							# just for debug
357	20	50				76	if ($debug) {
358	0					0	my $i = 0;
359							$span{str_w_pos} = join(
360							'',
361							map {
362	0					0	$strings[ $i++ ]
363							. ( exists $heatmap{$_} ? $OPEN : '[' )
364							. $_
365	0	0				0	. ( exists $heatmap{$_} ? $CLOSE : ']' )
		0
366							} @cluster_pos
367							);
368							}
369
370	20					189	push @spans, \%span;
371
372							}
373
374	13					59	$self->{spans} = $self->_sort_spans( \@spans );
375	13					35	$self->{heatmap} = \%heatmap;
376
377	13					265	return $self;
378							}
379
380							sub _sort_spans {
381							return [
382
383							# sort by unique,
384							# then by proximity
385							# then by heat
386							# then by pos
387
388							sort {
389							$b->{unique} <=> $a->{unique}
390							\|\| $b->{proximate} <=> $a->{proximate}
391							\|\| $b->{heat} <=> $a->{heat}
392	51	50	66			362	\|\| $a->{pos}->[0] <=> $b->{pos}->[0]
			100
393	30			30		80	} @{ $_[1] }
	30					170
394
395							];
396							}
397
398							sub _no_sentences {
399	17			17		57	my ( $self, $tokens, $window ) = @_;
400	17					81	my $lhs_window = int( $window / 2 );
401	17		50			468	my $debug = $self->debug \|\| 0;
402
403	17					251	my $num_tokens = $tokens->len;
404	17					73	my $tokens_arr = $tokens->as_array;
405	17					96	my %heatmap = ();
406	17					88	my $token_list_heat = $tokens->get_heat;
407
408							# this regex is a sanity check for phrases. we replace the \ with a
409							# more promiscuous check because the single space is too naive
410							# for real text (e.g. st. john's)
411	17					59	my $qre = $self->{_qre};
412	17					44	my @phrases = @{ $self->{_query}->phrases };
	17					103
413	17					100	my $n_terms = $self->{_query}->num_terms;
414	17					125	my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;
415
416	17	50				93	if ($debug) {
417	0					0	warn "token_list_heat: " . dump($token_list_heat);
418	0					0	warn "n_terms: $n_terms";
419	0					0	warn "phrases: " . dump( \@phrases );
420	0					0	warn "query_has_phrase: $query_has_phrase";
421							}
422
423							# build heatmap
424	17					61	for (@$token_list_heat) {
425	77					203	my $token = $tokens->get_token($_);
426	77					353	$heatmap{ $token->pos } = $token->is_hot;
427							}
428
429							# make clusters
430
431							# $proximity == (1/4 of $window)+1 is somewhat arbitrary,
432							# but since we want to err in having too much context,
433							# we aim high. Worst case scenario is where there are
434							# multiple hot spots in a cluster and each is a full
435							# $proximity length apart, which will grow the
436							# eventual span far beyond $window size. We rely
437							# on max_chars in Snipper to catch that worst case.
438	17					79	my $proximity = int( $lhs_window / 2 ) + 1;
439	17					114	my @positions = sort { $a <=> $b } keys %heatmap;
	138					234
440	17					71	my @clusters = ( [] );
441	17					43	my $i = 0;
442	17					62	for my $pos (@positions) {
443
444							# if we have advanced past the first position
445							# and the previous position is not "close" to this one,
446							# start a new cluster
447	77	100	100			284	if ( $i && ( $pos - $positions[ $i - 1 ] ) > $proximity ) {
448	33					70	push( @clusters, [$pos] );
449							}
450							else {
451	44					73	push( @{ $clusters[-1] }, $pos );
	44					106
452							}
453	77					136	$i++;
454							}
455
456							$debug
457	17	50				68	and warn "proximity: $proximity clusters: " . dump \@clusters;
458
459							# create spans from each cluster, each with a weight.
460							# we do the initial sort so that clusters that overlap
461							# other clusters via get_window() are weeded out via %seen_pos.
462	17					58	my @spans;
463							my %seen_pos;
464							CLUSTER:
465	17					66	for my $cluster (
466							sort {
467							scalar(@$b) <=> scalar(@$a)
468	61	50	66			286	\|\| $heatmap{ $b->[0] } <=> $heatmap{ $a->[0] }
469							\|\| $a->[0] <=> $b->[0]
470							} @clusters
471							)
472							{
473
474							# get full window, ignoring positions we've already seen.
475	50					100	my $heat = 0;
476	50					99	my %span;
477							my @cluster_tokens;
478	50					146	POS: for my $pos (@$cluster) {
479	77					318	my ( $start, $end ) = $tokens->get_window( $pos, $window );
480	77					234	POS_TWO: for my $pos2 ( $start .. $end ) {
481	3357	100				7163	next if $seen_pos{$pos2}++;
482	1513	100				2650	$heat += ( exists $heatmap{$pos2} ? $heatmap{$pos2} : 0 );
483	1513					3508	push( @cluster_tokens, $tokens->get_token($pos2) );
484							}
485							}
486
487							# we may have skipped a $seen_pos from the $slice above
488							# so make sure we still start/end on a match
489	50		66			310	while ( @cluster_tokens && !$cluster_tokens[0]->is_match ) {
490	11					45	shift @cluster_tokens;
491							}
492	50		66			272	while ( @cluster_tokens && !$cluster_tokens[-1]->is_match ) {
493	6					24	pop @cluster_tokens;
494							}
495
496	50	50				133	next CLUSTER unless @cluster_tokens;
497
498							# sanity: make sure we still have something hot
499	50					94	my $has_hot = 0;
500	50					91	my @cluster_pos;
501							my @strings;
502	50					125	for (@cluster_tokens) {
503	1496					2773	my $pos = $_->pos;
504	1496	100				2847	$has_hot++ if exists $heatmap{$pos};
505	1496					3210	push @strings, $_->str;
506	1496					3048	push @cluster_pos, $pos;
507							}
508	50	100				172	next CLUSTER unless $has_hot;
509
510	39					110	$span{cluster} = $cluster;
511	39					103	$span{heat} = $heat;
512	39					102	$span{pos} = \@cluster_pos;
513	39					89	$span{tokens} = \@cluster_tokens;
514	39					239	$span{str} = join( '', @strings );
515
516							# spans with more unique hot tokens in a single span rank higher
517							# spans with more proximate hot tokens in a single span rank higher
518	39					94	my %uniq = ();
519	39					83	my $i = 0;
520	39					74	my $num_proximate = 1; # one for the single hot token
521	39					92	for (@cluster_pos) {
522	1341	100				2383	if ( exists $heatmap{$_} ) {
523	77					290	$uniq{ lc $strings[$i] } += $heatmap{$_};
524	77	100	100			337	if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {
525	23					32	$num_proximate++;
526							}
527							}
528	1341					1911	$i++;
529							}
530	39					117	$span{unique} = scalar keys %uniq;
531	39					98	$span{proximate} = $num_proximate;
532
533							# no false phrase matches if !_treat_phrases_as_singles
534							# stemmer check because regex will likely fail when stemmer is on
535	39	100	66			149	if ( $query_has_phrase
536							and !$self->{_treat_phrases_as_singles} )
537							{
538	3	100				14	if ( !$self->{_stemmer} ) {
539
540							#warn "_treat_phrases_as_singles NOT true";
541	1	50				73	if ( $span{str} !~ m/$qre/ ) {
542	1	50				6	$debug
543							and warn
544							"treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
545	1					14	next CLUSTER;
546							}
547							}
548							else {
549
550							# stemmer used, so check unique term count against n_terms
551	2	50	66			12	if ( $n_terms == $query_has_phrase
552							&& $n_terms > $span{unique} )
553							{
554	0	0				0	$debug
555							and warn
556							"treat_phrases_as_singles=FALSE and '$span{str}' "
557							. "expected $n_terms but got $span{unique}\n";
558	0					0	next CLUSTER;
559							}
560
561							}
562							}
563
564							# just for debug
565	38	50				110	if ($debug) {
566	0					0	my $i = 0;
567							$span{str_w_pos} = join(
568							'',
569							map {
570	0					0	$strings[ $i++ ]
571							. ( exists $heatmap{$_} ? $OPEN : '[' )
572							. $_
573	0	0				0	. ( exists $heatmap{$_} ? $CLOSE : ']' )
		0
574							} @cluster_pos
575							);
576							}
577
578	38					257	push @spans, \%span;
579
580							}
581
582	17					169	$self->{spans} = $self->_sort_spans( \@spans );
583	17					68	$self->{heatmap} = \%heatmap;
584
585	17					278	return $self;
586							}
587
588							=head2 has_spans
589
590							Returns the number of spans found.
591
592							=cut
593
594							sub has_spans {
595	30			30	1	56	return scalar @{ $_[0]->{spans} };
	30					129
596							}
597
598							1;
599
600							__END__