|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Search::Tools::HeatMap;  | 
| 
2
 | 
16
 | 
 
 | 
 
 | 
  
16
  
 | 
 
 | 
124
 | 
 use Moo;  | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
    | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
107
 | 
    | 
| 
3
 | 
16
 | 
 
 | 
 
 | 
  
16
  
 | 
 
 | 
5598
 | 
 use Carp;  | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
    | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1239
 | 
    | 
| 
4
 | 
16
 | 
 
 | 
 
 | 
  
16
  
 | 
 
 | 
116
 | 
 use Data::Dump qw( dump );  | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
    | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1297
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 extends 'Search::Tools::Object';  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
16
 | 
 
 | 
 
 | 
  
16
  
 | 
 
 | 
123
 | 
 use namespace::autoclean;  | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
215
 | 
    | 
| 
 
 | 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
164
 | 
    | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '1.007';  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # debugging only  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $OPEN  = '[';  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $CLOSE = ']';  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 eval { require Term::ANSIColor; };  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 if ( !$@ ) {  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $OPEN .= Term::ANSIColor::color('bold red');  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $CLOSE = Term::ANSIColor::color('reset') . $CLOSE;  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my @attrs = qw( window_size  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     tokens  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     spans  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     as_sentences  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     _treat_phrases_as_singles  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     _qre  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     _query  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     _stemmer  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 for my $attr (@attrs) {  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     has $attr => ( is => 'rw' );  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Search::Tools::HeatMap - locate the best matches in a snippet extract  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  use Search::Tools::Tokenizer;  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  use Search::Tools::HeatMap;  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
        | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $tokens = $self->tokenizer->tokenize( $my_string, qr/^(interesting)$/ );  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $heatmap = Search::Tools::HeatMap->new(  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      tokens         => $tokens,  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      window_size    => 20,  # default  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      as_sentences   => 0,   # default  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  );  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  if ( $heatmap->has_spans ) {  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      my $tokens_arr = $tokens->as_array;  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      # stringify positions  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      my @snips;  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      for my $span ( @{ $heatmap->spans } ) {  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          push( @snips, $span->{str} );  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      }  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      my $occur_index = $self->occur - 1;  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      if ( $#snips > $occur_index ) {  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          @snips = @snips[ 0 .. $occur_index ];  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      }  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      printf("%s\n", join( ' ... ', @snips ));  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
        | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  }  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Search::Tools::HeatMap implements a simple algorithm for locating  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the densest clusters of unique, hot terms in a TokenList.  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 HeatMap is used internally by Snipper but documented here in case  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 someone wants to abuse and/or improve it.  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 METHODS  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 new( tokens => I )  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Create a new HeatMap. The I object may be either a  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Search::Tools::TokenList or Search::Tools::TokenListPP object.  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 BUILD  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Builds the HeatMap object. Called internally by new().  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub BUILD {  | 
| 
89
 | 
30
 | 
 
 | 
 
 | 
  
30
  
 | 
  
1
  
 | 
503
 | 
     my $self = shift;  | 
| 
90
 | 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
146
 | 
     $self->_build;  | 
| 
91
 | 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
207
 | 
     return $self;  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 window_size  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The max width of a span. Defaults to 20 tokens, including the  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 matches.  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Set this in new(). Access it later if you need to, but the spans  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 will have already been created by new().  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 as_sentences  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Try to match clusters at sentence boundaries. Default is false.  | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Set this in new().  | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 spans  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns an array ref of matching clusters. Each span in the array  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 is a hash ref with the following keys:  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over  | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item cluster  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item pos  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item heat  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item str  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item str_w_pos  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This item is available only if debug() is true.  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item unique  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # TODO this is mostly integer math and might be much  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # faster if rewritten in XS once the algorithm is "final".  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _build {  | 
| 
136
 | 
30
 | 
 
 | 
 
 | 
  
30
  
 | 
 
 | 
58
 | 
     my $self         = shift;  | 
| 
137
 | 
30
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
666
 | 
     my $tokens       = $self->tokens or croak "tokens required";  | 
| 
138
 | 
30
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
204
 | 
     my $window       = $self->window_size || 20;  | 
| 
139
 | 
30
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
207
 | 
     my $as_sentences = $self->as_sentences || 0;  | 
| 
140
 | 
30
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
179
 | 
     return $as_sentences  | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         ? $self->_as_sentences( $tokens, $window )  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         : $self->_no_sentences( $tokens, $window );  | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # currently _as_sentences() is mostly identical to _no_sentences()  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # with slightly fewer gymnastics.  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Since we already know via sentence_starts where our boundaries are,  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # we do not have to call $tokens->get_window().  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Who knows how we might improve the sentence algorithm in future,  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # so already having it in its own method seems like a win.  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _as_sentences {  | 
| 
152
 | 
13
 | 
 
 | 
 
 | 
  
13
  
 | 
 
 | 
42
 | 
     my ( $self, $tokens, $window ) = @_;  | 
| 
153
 | 
13
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
292
 | 
     my $debug = $self->debug || 0;  | 
| 
154
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
145
 | 
     my $sentence_length = $window * 2;  | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
156
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # build heatmap with sentence starts  | 
| 
157
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
65
 | 
     my $num_tokens           = $tokens->len;  | 
| 
158
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
     my $tokens_arr           = $tokens->as_array;  | 
| 
159
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
108
 | 
     my %heatmap              = ();  | 
| 
160
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
     my $token_list_heat      = $tokens->get_heat;  | 
| 
161
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
     my $heat_sentence_starts = $tokens->get_sentence_starts;  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # this regex is a sanity check for phrases. we replace the \ with a  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # more promiscuous check because the single space is too naive  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # for real text (e.g. st. john's)  | 
| 
166
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
     my $qre              = $self->{_qre};  | 
| 
167
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
25
 | 
     my @phrases          = @{ $self->{_query}->phrases };  | 
| 
 
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
65
 | 
    | 
| 
168
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
83
 | 
     my $n_terms          = $self->{_query}->num_terms;  | 
| 
169
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
108
 | 
     my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
171
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
47
 | 
     if ($debug) {  | 
| 
172
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "heat_sentence_starts: " . dump($heat_sentence_starts);  | 
| 
173
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "token_list_heat: " . dump($token_list_heat);  | 
| 
174
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "n_terms: $n_terms";  | 
| 
175
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "phrases: " . dump( \@phrases );  | 
| 
176
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "query_has_phrase: $query_has_phrase";  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # find the "sentence" that each hot token appears in.  | 
| 
180
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
25
 | 
     my @starts_ends;  | 
| 
181
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
     my $i                  = 0;  | 
| 
182
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
     my %heat_sentence_ends = ();    # cache  | 
| 
183
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
     for (@$token_list_heat) {  | 
| 
184
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
118
 | 
         my $token     = $tokens->get_token($_);  | 
| 
185
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
118
 | 
         my $token_pos = $token->pos;  | 
| 
186
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
83
 | 
         my $start     = $heat_sentence_starts->[ $i++ ];  | 
| 
187
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
183
 | 
         $heatmap{$token_pos} = $token->is_hot;  | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # a little optimization for when we've got  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # multiple hot tokens in the same sentence  | 
| 
191
 | 
42
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
134
 | 
         if ( exists $heat_sentence_ends{$start} ) {  | 
| 
192
 | 
22
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
53
 | 
             $debug  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 and warn "found cached end $heat_sentence_ends{$start} "  | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 . "for start $start token $token_pos\n";  | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             push( @starts_ends,  | 
| 
197
 | 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
92
 | 
                 [ $start, $token_pos, $heat_sentence_ends{$start} ] );  | 
| 
198
 | 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
             next;  | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # find the outermost limit of where this sentence might end  | 
| 
202
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
32
 | 
         my $max_end;  | 
| 
203
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # is there a "next" start?  | 
| 
205
 | 
20
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
159
 | 
         if ( defined $heat_sentence_starts->[$i]  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             and $heat_sentence_starts->[$i] != $start )  | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # this token is unique in this non-final sentence  | 
| 
210
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
             $max_end = $heat_sentence_starts->[$i] - 1;  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
214
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # this is the final sentence  | 
| 
215
 | 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43
 | 
             $max_end = $num_tokens - 1;  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
217
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
41
 | 
         my $end = $start;  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # find the nearest sentence end to the start  | 
| 
220
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
65
 | 
         while ( $end < $max_end ) {  | 
| 
221
 | 
1990
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4200
 | 
             my $tok = $tokens->get_token( $end++ );  | 
| 
222
 | 
1990
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4827
 | 
             if ( !$tok ) {  | 
| 
223
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $debug and warn "No token at end=$end";  | 
| 
224
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 last;  | 
| 
225
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
226
 | 
1990
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
5997
 | 
             if ( $tok->is_sentence_end ) {  | 
| 
227
 | 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
                 $end--;    # move back one position  | 
| 
228
 | 
10
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
30
 | 
                 if ($debug) {  | 
| 
229
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     warn "tok $_ is_sentence_end end=$end";  | 
| 
230
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     $tok->dump;  | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
232
 | 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36
 | 
                 last;  | 
| 
233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # back up if we've exceeded the 0-based tokens array.  | 
| 
237
 | 
20
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
59
 | 
         $end = $num_tokens if $end > $num_tokens;  | 
| 
238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
239
 | 
20
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
88
 | 
         $debug  | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             and warn "start=$start max_end=$max_end "  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . "sentence_length=$sentence_length end=$end "  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . "token_pos=$token_pos\n";  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # if we didn't yet set the actual hot token,  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # include everything up to it.  | 
| 
246
 | 
20
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
55
 | 
         if ( $end < $token_pos ) {  | 
| 
247
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $debug  | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 and warn "resetting end=$token_pos\n";  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
250
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $end = $token_pos;  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
252
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
72
 | 
         push( @starts_ends, [ $start, $token_pos, $end ] );  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # cache  | 
| 
255
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
83
 | 
         $heat_sentence_ends{$start} = $end;  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
258
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
49
 | 
     $debug and warn "starts_ends: " . dump( \@starts_ends );  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
260
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
     my @spans;  | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my %seen_pos;  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 START_END:  | 
| 
263
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
     for my $start_end (@starts_ends) {  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # get full window, ignoring positions we've already seen.  | 
| 
266
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
69
 | 
         my $heat = 0;  | 
| 
267
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
75
 | 
         my %span;  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my @cluster_tokens;  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
270
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
121
 | 
         my ( $start, $hot_pos, $end ) = @$start_end;  | 
| 
271
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
106
 | 
     POS: for my $pos ( $start .. $end ) {  | 
| 
272
 | 
5075
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
10563
 | 
             next POS if $seen_pos{$pos}++;  | 
| 
273
 | 
2000
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3275
 | 
             $heat += ( exists $heatmap{$pos} ? $heatmap{$pos} : 0 );  | 
| 
274
 | 
2000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4318
 | 
             push( @cluster_tokens, $tokens->get_token($pos) );  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # if we had already seen_pos all positions.  | 
| 
278
 | 
42
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
135
 | 
         next START_END unless @cluster_tokens;  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # sanity: make sure we still have something hot  | 
| 
281
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
         my $has_hot = 0;  | 
| 
282
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
39
 | 
         my @cluster_pos;  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my @strings;  | 
| 
284
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
     TOK: for (@cluster_tokens) {  | 
| 
285
 | 
2000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3405
 | 
             my $pos = $_->pos;  | 
| 
286
 | 
2000
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3633
 | 
             $has_hot++ if exists $heatmap{$pos};  | 
| 
287
 | 
2000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4037
 | 
             push @strings,     $_->str;  | 
| 
288
 | 
2000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3854
 | 
             push @cluster_pos, $pos;  | 
| 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
290
 | 
20
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
57
 | 
         next START_END unless $has_hot;  | 
| 
291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # the final string is a sentence end,  | 
| 
293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # but we only want the first char in it,  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # and not any whitespace, stray punctuation or other  | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # non-word noise.  | 
| 
296
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
146
 | 
         $strings[$#strings] =~ s/^([\.\?\!]).*/$1/;  | 
| 
297
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
298
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
         $span{start_end} = $start_end;  | 
| 
299
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52
 | 
         $span{heat}      = $heat;  | 
| 
300
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
44
 | 
         $span{pos}       = \@cluster_pos;  | 
| 
301
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
58
 | 
         $span{tokens}    = \@cluster_tokens;  | 
| 
302
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
185
 | 
         $span{str}       = join( '', @strings );  | 
| 
303
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # spans with more *unique* hot tokens in a single span rank higher  | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # spans with more *proximate* hot tokens in a single span rank higher  | 
| 
306
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
         my %uniq          = ();  | 
| 
307
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
         my $i             = 0;  | 
| 
308
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
39
 | 
         my $num_proximate = 1;    # one for the single hot token  | 
| 
309
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
41
 | 
         for (@cluster_pos) {  | 
| 
310
 | 
2000
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3320
 | 
             if ( exists $heatmap{$_} ) {  | 
| 
311
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
171
 | 
                 $uniq{ lc $strings[$i] } += $heatmap{$_};  | 
| 
312
 | 
42
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
219
 | 
                 if ( $i && exists $heatmap{ $cluster_pos[ $i - 2 ] } ) {  | 
| 
313
 | 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
                     $num_proximate++;  | 
| 
314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
315
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
316
 | 
2000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2684
 | 
             $i++;  | 
| 
317
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
318
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
60
 | 
         $span{unique}    = scalar keys %uniq;  | 
| 
319
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43
 | 
         $span{proximate} = $num_proximate;  | 
| 
320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # no false phrase matches if !_treat_phrases_as_singles  | 
| 
322
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # stemmer check because regex will likely fail  | 
| 
323
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # when stemmer is on  | 
| 
324
 | 
20
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
93
 | 
         if ( $query_has_phrase  | 
| 
325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             and !$self->{_treat_phrases_as_singles} )  | 
| 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
327
 | 
7
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
27
 | 
             if ( !$self->{_stemmer} ) {  | 
| 
328
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
329
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 #warn "_treat_phrases_as_singles NOT true";  | 
| 
330
 | 
3
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
128
 | 
                 if ( $span{str} !~ m/$qre/ ) {  | 
| 
331
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     $debug  | 
| 
332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         and warn  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";  | 
| 
334
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     next START_END;  | 
| 
335
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
336
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             else {  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # if stemmer was on, we cannot rely on the regex,  | 
| 
340
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # but we assume that number of uniq terms must match query  | 
| 
341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
342
 | 
4
 | 
  
 50
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
18
 | 
                 if (   $n_terms == $query_has_phrase  | 
| 
343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     && $n_terms > $span{unique} )  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 {  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
346
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     $debug  | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         and warn  | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         "treat_phrases_as_singles=FALSE and '$span{str}' "  | 
| 
349
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         . "expected $n_terms unique terms, got $span{unique}\n";  | 
| 
350
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                     next START_END;  | 
| 
351
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
352
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
356
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # just for debug  | 
| 
357
 | 
20
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
76
 | 
         if ($debug) {  | 
| 
358
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             my $i = 0;  | 
| 
359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $span{str_w_pos} = join(  | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 '',  | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 map {  | 
| 
362
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                           $strings[ $i++ ]  | 
| 
363
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         . ( exists $heatmap{$_} ? $OPEN : '[' )  | 
| 
364
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         . $_  | 
| 
365
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                         . ( exists $heatmap{$_} ? $CLOSE : ']' )  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
366
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 } @cluster_pos  | 
| 
367
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             );  | 
| 
368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
370
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
189
 | 
         push @spans, \%span;  | 
| 
371
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
374
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
     $self->{spans}   = $self->_sort_spans( \@spans );  | 
| 
375
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
35
 | 
     $self->{heatmap} = \%heatmap;  | 
| 
376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
377
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
265
 | 
     return $self;  | 
| 
378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
379
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
# Rank spans from best to worst and hand them back as a new array ref.
#
# Invoked as $self->_sort_spans( \@spans ); the invocant itself is not used.
# Every span is a hash ref carrying numeric 'unique', 'proximate' and 'heat'
# values plus a 'pos' array ref. The array passed in is left untouched.
sub _sort_spans {
    my ( undef, $spans ) = @_;

    # precedence of the sort keys:
    #   1. more unique hot tokens first
    #   2. then more proximate hot tokens
    #   3. then greater heat
    #   4. then the earliest starting position
    my @ranked = sort {
        my $order = $b->{unique} <=> $a->{unique};
        $order = $b->{proximate} <=> $a->{proximate} unless $order;
        $order = $b->{heat} <=> $a->{heat}           unless $order;
        $order = $a->{pos}->[0] <=> $b->{pos}->[0]   unless $order;
        $order;
    } @$spans;

    return \@ranked;
}
| 
397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
# Build the heatmap and the ranked list of candidate spans without relying
# on sentence boundaries.
#
# Arguments:
#   $self   - the HeatMap object. Reads $self->{_qre}, $self->{_query},
#             $self->{_treat_phrases_as_singles} and $self->{_stemmer}.
#   $tokens - token list object; get_heat(), get_token() and get_window()
#             are called on it.
#   $window - window size handed to get_window(); the clustering proximity
#             is derived from it as well.
#
# Side effects: sets $self->{spans} (sorted via _sort_spans) and
# $self->{heatmap} (token position => is_hot value).
#
# Returns $self.
sub _no_sentences {
    my ( $self, $tokens, $window ) = @_;
    my $lhs_window = int( $window / 2 );
    my $debug      = $self->debug || 0;

    my %heatmap         = ();
    my $token_list_heat = $tokens->get_heat;

    # this regex is a sanity check for phrases. we replace each escaped
    # space (\ ) with a more promiscuous .+ because the single space is
    # too naive for real text (e.g. st. john's)
    my $qre     = $self->{_qre};
    my @phrases = @{ $self->{_query}->phrases };
    my $n_terms = $self->{_query}->num_terms;

    # s///g yields the number of substitutions made (false when none),
    # so this doubles as a flag and as a count.
    my $query_has_phrase = $qre =~ s/(\\ )+/.+/g;

    if ($debug) {
        warn "token_list_heat: " . dump($token_list_heat);
        warn "n_terms: $n_terms";
        warn "phrases: " . dump( \@phrases );
        warn "query_has_phrase: $query_has_phrase";
    }

    # build heatmap
    for (@$token_list_heat) {
        my $token = $tokens->get_token($_);
        $heatmap{ $token->pos } = $token->is_hot;
    }

    # make clusters

    # $proximity == (1/4 of $window)+1 is somewhat arbitrary,
    # but since we want to err in having too much context,
    # we aim high. Worst case scenario is where there are
    # multiple hot spots in a cluster and each is a full
    # $proximity length apart, which will grow the
    # eventual span far beyond $window size. We rely
    # on max_chars in Snipper to catch that worst case.
    my $proximity = int( $lhs_window / 2 ) + 1;
    my @positions = sort { $a <=> $b } keys %heatmap;
    my @clusters  = ( [] );
    my $prev_pos;
    for my $pos (@positions) {

        # if we have advanced past the first position
        # and the previous position is not "close" to this one,
        # start a new cluster
        if ( defined $prev_pos && ( $pos - $prev_pos ) > $proximity ) {
            push( @clusters, [$pos] );
        }
        else {
            push( @{ $clusters[-1] }, $pos );
        }
        $prev_pos = $pos;
    }

    $debug
        and warn "proximity: $proximity   clusters: " . dump \@clusters;

    # create spans from each cluster, each with a weight.
    # we do the initial sort so that clusters that overlap
    # other clusters via get_window() are weeded out via %seen_pos.
    my @spans;
    my %seen_pos;
CLUSTER:
    for my $cluster (
        sort {
                   scalar(@$b) <=> scalar(@$a)
                || $heatmap{ $b->[0] } <=> $heatmap{ $a->[0] }
                || $a->[0] <=> $b->[0]
        } @clusters
        )
    {

        # get full window, ignoring positions we've already seen.
        my $heat = 0;
        my %span;
        my @cluster_tokens;
    POS: for my $pos (@$cluster) {
            my ( $start, $end ) = $tokens->get_window( $pos, $window );
        POS_TWO: for my $pos2 ( $start .. $end ) {
                next if $seen_pos{$pos2}++;
                $heat += ( exists $heatmap{$pos2} ? $heatmap{$pos2} : 0 );
                push( @cluster_tokens, $tokens->get_token($pos2) );
            }
        }

        # positions already claimed via %seen_pos were skipped above,
        # so make sure we still start/end on a match
        while ( @cluster_tokens && !$cluster_tokens[0]->is_match ) {
            shift @cluster_tokens;
        }
        while ( @cluster_tokens && !$cluster_tokens[-1]->is_match ) {
            pop @cluster_tokens;
        }

        next CLUSTER unless @cluster_tokens;

        # sanity: make sure we still have something hot
        my $has_hot = 0;
        my @cluster_pos;
        my @strings;
        for (@cluster_tokens) {
            my $pos = $_->pos;
            $has_hot++ if exists $heatmap{$pos};
            push @strings,     $_->str;
            push @cluster_pos, $pos;
        }
        next CLUSTER unless $has_hot;

        $span{cluster} = $cluster;
        $span{heat}    = $heat;
        $span{pos}     = \@cluster_pos;
        $span{tokens}  = \@cluster_tokens;
        $span{str}     = join( '', @strings );

        # spans with more *unique* hot tokens in a single span rank higher
        # spans with more *proximate* hot tokens in a single span rank higher
        my %uniq          = ();
        my $num_proximate = 1;    # one for the single hot token
        for my $idx ( 0 .. $#cluster_pos ) {
            my $pos = $cluster_pos[$idx];
            next unless exists $heatmap{$pos};

            $uniq{ lc $strings[$idx] } += $heatmap{$pos};

            # a hot token counts as proximate when the token two slots
            # back is hot too (presumably because word tokens alternate
            # with separator tokens -- TODO confirm). Require $idx >= 2:
            # a negative subscript would wrap around and compare against
            # the *end* of the span instead.
            if ( $idx >= 2 && exists $heatmap{ $cluster_pos[ $idx - 2 ] } ) {
                $num_proximate++;
            }
        }
        $span{unique}    = scalar keys %uniq;
        $span{proximate} = $num_proximate;

        # no false phrase matches if !_treat_phrases_as_singles
        # stemmer check because regex will likely fail when stemmer is on
        if ( $query_has_phrase
            and !$self->{_treat_phrases_as_singles} )
        {
            if ( !$self->{_stemmer} ) {

                #warn "_treat_phrases_as_singles NOT true";
                if ( $span{str} !~ m/$qre/ ) {
                    $debug
                        and warn
                        "treat_phrases_as_singles=FALSE and '$span{str}' failed to match $qre\n";
                    next CLUSTER;
                }
            }
            else {

                # stemmer used, so check unique term count against n_terms
                # NOTE(review): $query_has_phrase holds the substitution
                # count from the s///g above, not a term count -- confirm
                # this comparison is the intended heuristic.
                if (   $n_terms == $query_has_phrase
                    && $n_terms > $span{unique} )
                {
                    $debug
                        and warn
                        "treat_phrases_as_singles=FALSE and '$span{str}' "
                        . "expected $n_terms but got $span{unique}\n";
                    next CLUSTER;
                }

            }
        }

        # just for debug
        if ($debug) {
            my $i = 0;
            $span{str_w_pos} = join(
                '',
                map {
                          $strings[ $i++ ]
                        . ( exists $heatmap{$_} ? $OPEN : '[' )
                        . $_
                        . ( exists $heatmap{$_} ? $CLOSE : ']' )
                } @cluster_pos
            );
        }

        push @spans, \%span;

    }

    $self->{spans}   = $self->_sort_spans( \@spans );
    $self->{heatmap} = \%heatmap;

    return $self;
}
| 
587
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
588
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 has_spans  | 
| 
589
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
590
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns the number of spans found.  | 
| 
591
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
592
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
593
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
sub has_spans {
    my ($self) = @_;

    # array in scalar assignment == element count
    my $span_count = @{ $self->{spans} };
    return $span_count;
}
| 
597
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
598
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
599
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
600
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |