File Coverage

blib/lib/KinoSearch1/Highlight/Highlighter.pm
Criterion Covered Total %
statement 162 177 91.5
branch 39 56 69.6
condition 4 6 66.6
subroutine 12 12 100.0
pod 1 2 50.0
total 218 253 86.1


line stmt bran cond sub pod time code
1             package KinoSearch1::Highlight::Highlighter;
2 18     18   103 use strict;
  18         44  
  18         667  
3 18     18   99 use warnings;
  18         45  
  18         453  
4 18     18   127 use KinoSearch1::Util::ToolSet;
  18         36  
  18         2828  
5 18     18   104 use base qw( KinoSearch1::Util::Class );
  18         68  
  18         1578  
6 18     18   11460 use locale;
  18         2752  
  18         126  
7              
8             BEGIN {
9 18     18   1841 __PACKAGE__->init_instance_vars(
10             # constructor params / members
11             excerpt_field => undef,
12             analyzer => undef,
13             formatter => undef,
14             encoder => undef,
15             terms => undef,
16             excerpt_length => 200,
17             pre_tag => undef, # back compat
18             post_tag => undef, # back compat
19             token_re => qr/\b\w+(?:'\w+)?\b/,
20              
21             # members
22             limit => undef,
23             );
24 18         181 __PACKAGE__->ready_get_set(qw( terms ));
25             }
26              
27 18     18   10961 use KinoSearch1::Highlight::SimpleHTMLFormatter;
  18         51  
  18         463  
28 18     18   10697 use KinoSearch1::Highlight::SimpleHTMLEncoder;
  18         52  
  18         52856  
29              
30             sub init_instance {
31 1     1 1 199 my $self = shift;
32 1 50       177 croak("Missing required arg 'excerpt_field'")
33             unless defined $self->{excerpt_field};
34 1         3 $self->{terms} = [];
35              
36             # assume HTML
37 1 50       174 if ( !defined $self->{encoder} ) {
38 1         11 $self->{encoder} = KinoSearch1::Highlight::SimpleHTMLEncoder->new;
39             }
40 1 50       7 if ( !defined $self->{formatter} ) {
41 1         3 my ( $pre_tag, $post_tag ) = @{$self}{qw( pre_tag post_tag )};
  1         3  
42 1 50       5 $pre_tag = '' unless defined $pre_tag;
43 1 50       4 $post_tag = '' unless defined $post_tag;
44 1         17 $self->{formatter} = KinoSearch1::Highlight::SimpleHTMLFormatter->new(
45             pre_tag => $pre_tag,
46             post_tag => $post_tag,
47             );
48             }
49              
50             # scoring window is 1.66 * excerpt_length, with the loc in the middle
51 1         5 $self->{limit} = int( $self->{excerpt_length} / 3 );
52             }
53              
54             sub generate_excerpt {
55 4     4 0 8 my ( $self, $doc ) = @_;
56 4         11 my $excerpt_length = $self->{excerpt_length};
57 4         8 my $limit = $self->{limit};
58 4         8 my $token_re = $self->{token_re};
59              
60             # retrieve the text from the chosen field
61 4         17 my $field = $doc->get_field( $self->{excerpt_field} );
62 4         13 my $text = $field->get_value;
63 4         18 my $text_length = bytes::length $text;
64 4 50       21 return '' unless $text_length;
65              
66             # determine the rough boundaries of the excerpt
67 4         15 my $posits = $self->_starts_and_ends($field);
68 4         14 my $best_location = $self->_calc_best_location($posits);
69 4         10 my $top = $best_location - $limit;
70              
71             # expand the excerpt if the best location is near the end
72 4 100       10 $top
73             = $text_length - $excerpt_length < $top
74             ? $text_length - $excerpt_length
75             : $top;
76              
77             # if the best starting point is the very beginning, cool...
78 4 100       10 if ( $top <= 0 ) {
79 2         4 $top = 0;
80             }
81             # ... otherwise ...
82             else {
83             # lop off $top bytes
84 2         9 $text = bytes::substr( $text, $top );
85              
86             # try to start the excerpt at a sentence boundary
87 2 50       74 if ($text =~ s/
88             \A
89             (
90             \C{0,$limit}?
91             \.\s+
92             )
93             //xsm
94             )
95             {
96 0         0 $top += bytes::length($1);
97             }
98             # no sentence boundary, so we'll need an ellipsis
99             else {
100             # skip past possible partial tokens, prepend an ellipsis
101 2 50       54 if ($text =~ s/
102             \A
103             (
104             \C{0,$limit}? # don't go outside the window
105             $token_re # match possible partial token
106             .*? # ... and any junk following that token
107             )
108             (?=$token_re) # just before the start of a full token...
109             /... /xsm # ... insert an ellipsis
110             )
111             {
112 2         7 $top += bytes::length($1);
113 2         11 $top -= 4 # three dots and a space
114             }
115             }
116             }
117              
118             # remove possible partial tokens from the end of the excerpt
119 4         16 $text = bytes::substr( $text, 0, $excerpt_length + 1 );
120 4 100       28 if ( bytes::length($text) > $excerpt_length ) {
121 2         10 my $extra_char = chop $text;
122             # if the extra char wasn't part of a token, we aren't splitting one
123 2 50       16 if ( $extra_char =~ $token_re ) {
124 2         93 $text =~ s/$token_re$//; # if this is unsuccessful, that's fine
125             }
126             }
127              
128             # if the excerpt doesn't end with a full stop, end with an an ellipsis
129 4 100       40 if ( $text !~ /\.\s*\Z/xsm ) {
130 3         40 $text =~ s/\W+\Z//xsm;
131 3         10 while ( bytes::length($text) + 4 > $excerpt_length ) {
132 4         18 my $extra_char = chop $text;
133 4 50       29 if ( $extra_char =~ $token_re ) {
134 4         151 $text =~ s/\W+$token_re\Z//xsm; # if unsuccessful, that's fine
135             }
136 4         73 $text =~ s/\W+\Z//xsm;
137             }
138 3         17 $text .= ' ...';
139             }
140              
141             # remap locations now that we know the starting and ending bytes
142 4         11 $text_length = bytes::length($text);
143 4         17 my @relative_starts = map { $_->[0] - $top } @$posits;
  5         13  
144 4         8 my @relative_ends = map { $_->[1] - $top } @$posits;
  5         10  
145              
146             # get rid of pairs with at least one member outside the text
147 4   66     25 while ( @relative_starts and $relative_starts[0] < 0 ) {
148 0         0 shift @relative_starts;
149 0         0 shift @relative_ends;
150             }
151 4   66     22 while ( @relative_ends and $relative_ends[-1] > $text_length ) {
152 0         0 pop @relative_starts;
153 0         0 pop @relative_ends;
154             }
155              
156             # insert highlight tags
157 4         9 my $formatter = $self->{formatter};
158 4         10 my $encoder = $self->{encoder};
159 4         6 my $output_text = '';
160 4         8 my ( $start, $end, $last_start, $last_end ) = ( undef, undef, 0, 0 );
161 4         11 while (@relative_starts) {
162 5         7 $end = shift @relative_ends;
163 5         6 $start = shift @relative_starts;
164 5         14 $output_text .= $encoder->encode(
165             bytes::substr( $text, $last_end, $start - $last_end ) );
166 5         18 $output_text
167             .= $formatter->highlight(
168             $encoder->encode( bytes::substr( $text, $start, $end - $start ) )
169             );
170 5         13 $last_end = $end;
171             }
172 4         12 $output_text .= $encoder->encode( bytes::substr( $text, $last_end ) );
173              
174 4         25 return $output_text;
175             }
176              
177             =for comment
178             Find all points in the text where a relevant term begins and ends. For terms
179             that are part of a phrase, only include points that are part of the phrase.
180              
181             =cut
182              
183             sub _starts_and_ends {
184 4     4   5 my ( $self, $field ) = @_;
185 4         6 my @posits;
186             my %done;
187              
188 4         6 TERM: for my $term ( @{ $self->{terms} } ) {
  4         12  
189 16 100       39 if ( a_isa_b( $term, 'KinoSearch1::Index::Term' ) ) {
190 10         33 my $term_text = $term->get_text;
191              
192 10 100       33 next TERM if $done{$term_text};
193 5         10 $done{$term_text} = 1;
194              
195             # add all starts and ends
196 5         17 my $term_vector = $field->term_vector($term_text);
197 5 100       15 next TERM unless defined $term_vector;
198 4         12 my $starts = $term_vector->get_start_offsets;
199 4         10 my $ends = $term_vector->get_end_offsets;
200 4         10 while (@$starts) {
201 4         22 push @posits, [ shift @$starts, shift @$ends, 1 ];
202             }
203             }
204             # intersect positions for phrase terms
205             else {
206             # if not a Term, it's an array of Terms representing a phrase
207 6         10 my @term_texts = map { $_->get_text } @$term;
  18         40  
208              
209 6         17 my $phrase_text = join( ' ', @term_texts );
210 6 100       19 next TERM if $done{$phrase_text};
211 3         7 $done{$phrase_text} = 1;
212              
213 3         42 my $posit_vec = KinoSearch1::Util::BitVector->new;
214 3         6 my @term_vectors = map { $field->term_vector($_) } @term_texts;
  9         21  
215              
216             # make sure all terms are present
217 3 100       16 next TERM unless scalar @term_vectors == scalar @term_texts;
218              
219 2         4 my $i = 0;
220 2         4 for my $tv (@term_vectors) {
221             # one term missing, ergo no phrase
222 6 50       14 next TERM unless defined $tv;
223 6 100       13 if ( $i == 0 ) {
224 2         5 $posit_vec->set( @{ $tv->get_positions } );
  2         7  
225             }
226             else {
227             # filter positions using logical "and"
228 4         27 my $other_posit_vec = KinoSearch1::Util::BitVector->new;
229 4         15 $other_posit_vec->set(
230 4         9 grep { $_ >= 0 }
231 4         10 map { $_ - $i } @{ $tv->get_positions }
  4         11  
232             );
233 4         33 $posit_vec->logical_and($other_posit_vec);
234             }
235 6         11 $i++;
236             }
237              
238             # add only those starts/ends that belong to a valid position
239 2         8 my $tv_start_positions = $term_vectors[0]->get_positions;
240 2         9 my $tv_starts = $term_vectors[0]->get_start_offsets;
241 2         6 my $tv_end_positions = $term_vectors[-1]->get_positions;
242 2         6 my $tv_ends = $term_vectors[-1]->get_end_offsets;
243 2         5 $i = 0;
244 2         3 my $j = 0;
245 2         3 my $last_token_index = $#term_vectors;
246 2         4 for my $valid_position ( @{ $posit_vec->to_arrayref } ) {
  2         11  
247              
248 2         9 while ( $i <= $#$tv_start_positions ) {
249 2 50       7 last if ( $tv_start_positions->[$i] >= $valid_position );
250 0         0 $i++;
251             }
252 2         3 $valid_position += $last_token_index;
253 2         5 while ( $j <= $#$tv_end_positions ) {
254 2 50       6 last if ( $tv_end_positions->[$j] >= $valid_position );
255 0         0 $j++;
256             }
257 2         7 push @posits,
258             [ $tv_starts->[$i], $tv_ends->[$j], scalar @$term ];
259 2         4 $i++;
260 2         29 $j++;
261             }
262             }
263             }
264              
265             # sort, uniquify and return
266 4 50       13 @posits = sort { $a->[0] <=> $b->[0] || $b->[1] <=> $a->[1] } @posits;
  3         10  
267 4         6 my @unique;
268 4         7 my $last = ~0;
269 4         8 for (@posits) {
270 6 100       14 push @unique, $_ if $_->[0] != $last;
271 6         10 $last = $_->[0];
272             }
273 4         12 return \@unique;
274             }
275              
276             =for comment
277             Select the byte address representing the greatest keyword density. Because
278             the algorithm counts bytes rather than characters, it will degrade if the
279             number of bytes per character is larger than 1.
280              
281             =cut
282              
283             sub _calc_best_location {
284 4     4   6 my ( $self, $posits ) = @_;
285 4         6 my $window = $self->{limit} * 2;
286              
287             # if there aren't any keywords, take the excerpt from the top of the text
288 4 100       10 return 0 unless @$posits;
289              
290 3         3 my %locations = map { ( $_->[0] => 0 ) } @$posits;
  5         19  
291              
292             # if another keyword is in close proximity, add to the loc's score
293 3         9 for my $loc_index ( 0 .. $#$posits ) {
294             # only score positions that are in range
295 5         8 my $location = $posits->[$loc_index][0];
296 5         7 my $other_loc_index = $loc_index - 1;
297 5         16 while ( $other_loc_index > 0 ) {
298 0         0 my $diff = $location - $posits->[$other_loc_index][0];
299 0 0       0 last if $diff > $window;
300 0         0 my $num_tokens_at_pos = $posits->[$other_loc_index][2];
301 0         0 $locations{$location}
302             += ( 1 / ( 1 + log($diff) ) ) * $num_tokens_at_pos;
303 0         0 --$other_loc_index;
304             }
305 5         7 $other_loc_index = $loc_index + 1;
306 5         20 while ( $other_loc_index <= $#$posits ) {
307 2         4 my $diff = $posits->[$other_loc_index] - $location;
308 2 50       9 last if $diff > $window;
309 0         0 my $num_tokens_at_pos = $posits->[$other_loc_index][2];
310 0         0 $locations{$location}
311             += ( 1 / ( 1 + log($diff) ) ) * $num_tokens_at_pos;
312 0         0 ++$other_loc_index;
313             }
314             }
315              
316             # return the highest scoring position
317 3         13 return ( sort { $locations{$b} <=> $locations{$a} } keys %locations )[0];
  2         10  
318             }
319              
320             1;
321              
322             __END__