File Coverage

blib/lib/KSx/Highlight/Summarizer.pm
Criterion Covered Total %
statement 26 153 16.9
branch 2 62 3.2
condition 0 33 0.0
subroutine 7 11 63.6
pod 3 3 100.0
total 38 262 14.5


line stmt bran cond sub pod time code
1             package KSx::Highlight::Summarizer;
2              
3             $VERSION = '0.06';
4              
5             @ISA = KinoSearch::Highlight::Highlighter;
6 1     1   1500 use KinoSearch::Highlight::Highlighter;
  1         3  
  1         57  
7              
8 1     1   6 use strict;
  1         2  
  1         44  
9              
10 1     1   24 use List::Util qw 'min';
  1         2  
  1         169  
11 1     1   1073 use Number::Range;
  1         26897  
  1         76  
12              
13 1     1   1509 use Hash::Util::FieldHash::Compat 'fieldhashes';
  1         25608  
  1         10  
14             fieldhashes \my( %ellipsis, %summ_len, %page_h, %encoder );
15              
16             sub _range_endpoints {
17 0     0   0 my $range = shift;
18 0         0 my @range = $range->range;
19 0         0 my $previous = shift @range;
20 0         0 my $subrange = [($previous) x 2];
21 0         0 my @arrays;
22 0         0 foreach my $current (@range) {
23 0 0       0 if ($current == ($previous + 1)) {
24 0         0 $subrange->[1] = $current;
25             }
26             else {
27 0         0 push @arrays, $subrange;
28 0         0 $subrange = [($current) x 2];
29             }
30 0         0 $previous = $current;
31             }
32 0         0 return @arrays, $subrange; # Make sure the last subrange isn’t left out!
33             }
34              
35             sub new {
36 1     1 1 31123 my($pack, %args) = @_;
37 1 50       10 my $ellipsis = exists $args{ellipsis} ? delete $args{ellipsis}
38             : ' ... ';
39 1 50       4 my $summ_len = exists $args{summary_length}
40             ? delete $args{summary_length} : 0;
41 1         4 my $page_h = delete $args{page_handler};
42 1         3 my $encoder = delete $args{encoder};
43              
44             # accept args that the superclass only allows one to set through
45             # accessor methods:
46 1         3 my $pre_tag = delete $args{pre_tag};
47 1         2 my $post_tag = delete $args{post_tag};
48              
49 1         201 my $self = SUPER::new $pack %args;
50              
51 0           $ellipsis{$self} = $ellipsis;
52 0           $summ_len{$self} = $summ_len;
53 0           $page_h{$self} = $page_h;
54 0           $encoder{$self} = $encoder;
55              
56 0 0         defined $pre_tag and $self->set_pre_tag($pre_tag);
57 0 0         defined $post_tag and $self->set_post_tag($post_tag);
58              
59 0           return $self;
60             }
61              
62             sub create_excerpt {
63 0     0 1   my ($self, $hitdoc) = @_;
64            
65 0           my $field = $self->get_field;
66 0           my $x_len = $self->get_excerpt_length;
67 0           my $limit = int($x_len /3 );
68              
69             # retrieve the text from the chosen field
70 0           my $text = $hitdoc->{$field};
71 0 0         return unless defined $text;
72 0           my $text_length = length $text;
73 0 0         return '' unless $text_length;
74              
75             # get offsets and weights of words that match
76 0           my $searcher = $self->get_searchable;
77 0           my $posits = $self->get_compiler->highlight_spans(
78             searchable => $searcher,
79             field => $field,
80             doc_vec => $searcher->fetch_doc_vec(
81             $hitdoc->get_doc_id
82             ),
83             );
84 0           my @locs = map [$_->get_offset,$_->get_weight], @{
85 0           KinoSearch::Highlight::HeatMap->new(
86             spans => $posits,
87             window => $limit*2
88             )->get_spans
89             };
90 0           @locs = map $$_[0], sort { $$b[1] <=> $$a[1] } @locs;
  0            
91            
92 0 0         @locs or @locs = 0;
93              
94             #warn "@locs" if $summ_len{$self};
95             # determine the rough boundaries of the excerpts
96 0           my $range = new Number::Range;
97 0           my $summ_len = $summ_len{$self};
98 0           for(@locs) {
99 1     1   1316 no warnings; # suppress Number::Range’s nasty warnings
  1         2  
  1         1701  
100 0           my $start = $_-$limit;
101 0 0         $start = 0 if $start < 0;
102 0           $range->addrange($start . '..' . min($start+$x_len, $text_length));
103 0 0 0       last if !$summ_len || $range->size >= $summ_len;
104             }
105 0           my @excerpt_bounds = _range_endpoints($range);
106             #use DDS; warn Dump \@excerpt_bounds if $summ_len;
107              
108             # close small gaps between ranges
109 0           for(my $c = 1; $c < @excerpt_bounds;++$c) {
110 0 0         $excerpt_bounds[$c][0] - $excerpt_bounds[$c-1][1] <= 10 and
111             $excerpt_bounds[$c-1][1] = $excerpt_bounds[$c][1],
112             splice(@excerpt_bounds, $c, 1),
113             --$c;
114             }
115              
116             # extract the offsets from the highlight spans
117 0           my(@starts, @ends);
118 0           for(@$posits) {
119 0           push(@starts, my $start = $_->get_offset);
120 0           push(@ends, $start + $_->get_length);
121             }
122              
123             # make the summary
124 0           my $summary = '';
125 0           my $ellipsis = $ellipsis{$self};
126 0           my $token_re = qr/\b\w+(?:'\w+)?\b/;
127 0           my $prev_ellipsis; # whether the previous excerpt ended with an ellip.
128 0           my $prev_page = 0; # last page number of previous excerpt
129 0           my $page_h = $page_h{$self};
130 0           for(@excerpt_bounds) {
131             # make the excerpt
132 0           my ($start,$end) = @$_;
133              
134             # determine the page number that $start falls within
135 0           my $page_no;
136 0 0         $page_h and $page_no =
137             substr($text, 0,$start) =~ y/\014// + 1;
138              
139 0           my $x; # short for x-cerpt
140             my $need_ellipsis;
141              
142             #warn "<<".substr($text,$start,$limit).">>";
143             # look for a page break within $limit chars from $start (except we
144             # shouldn’t do it if $start is 0 because there’s a good chance
145             # we’ll go past the very word for whose sake this excerpt exists)
146             # ~~~ What about a case in which a page break plus maybe a few
147             # spaces occur just *before* $start. That shouldn’t get an
148             # ellipsis (as in the elsif block below), should it?
149 0 0 0       if($page_h && $start &&
    0 0        
150             substr($text,$start,$limit) =~ /^(.*)\014/s) {
151 0           $start += length($1) + 1;
152 0           $page_no += 1 + $1 =~ y/\014//;
153 0           $x = substr $text, $start;
154             }
155             elsif( $start ) { # if this is not the beginning of the doc
156 0           my $sb = $self->find_sentences(
157             text => $text, offset => $start, length => $limit
158             );
159 0 0         if(@$sb) {
160 0           $start = $$sb[0];
161             }
162 0           else { ++ $need_ellipsis }
163 0           $x = substr $text, $start;
164 0 0         if($need_ellipsis) {
165             # skip past possible partial tokens, but don’t insert an
166             # ellipsis yet, because it might need to come after a
167             # page marker
168 0 0         if ($x =~ s/
169             \A
170             (
171             .{1,$limit}? # don't go outside the window
172             )
173             (?=$token_re) # just b4 the start of a full token
174             //xsm
175             )
176             {
177 0           $start += length($1);
178             }
179             }
180             }
181 0           else { $x = substr $text, $start }
182              
183             # trim unwanted text from the end of the excerpt
184 0           $x = substr $x, 0, $end-$start+1; # +1 ’cos we need that extra
185             # char later
186 0           my $end_with_ellipsis = 0;
187              
188             # if we’ve trimmed the end of the text
189 0 0         if ( $end < $text_length) {{ # doubled so ‘last’ will work
190             # check to see whether there are page breaks after the high-
191             # lighted word, and stop at the first one if so
192 0 0 0       if ($page_h and substr($x, $limit*-2) =~ s/(\014[^\014]*)//) {
  0            
193 0           $end -= length $1; last;
  0            
194             }
195              
196             # remove possible partial tokens from the end of the excerpt
197 0           my $extra_char = chop $x; # the char we left dangling earlier
198             # if the extra char wasn't part of a token, then we’re not
199             # splitting one
200 0 0         if ( $extra_char =~ $token_re ) {
201 0           $x =~ s/$token_re$//; # if this fails, that's fine
202             }
203              
204             # if the excerpt doesn't end with a full stop, end with
205             # an ellipsis
206 0 0         if ( $x !~ /\.\s*\Z/xsm ) {
207 0           $x =~ s/\W+\Z//xsm;
208 0           $x .= $ellipsis;
209 0           ++$end_with_ellipsis;
210             }
211             }}
212             #warn $x if $page_h;
213              
214             # get the offsets that are within range for the excerpt, and make
215             # them relative to $start
216 0           my @relative_starts = map $_-$start, @starts;
217 0           my @relative_ends = map $_-$start, @ends;
218 0           my $this_x_len = $end - $start;
219 0   0       while ( @relative_starts and $relative_starts[0] < 0 ) {
220 0           shift @relative_starts;
221 0           shift @relative_ends;
222             }
223 0   0       while ( @relative_ends and $relative_ends[-1] > $this_x_len ) {
224 0           pop @relative_starts;
225 0           pop @relative_ends;
226             }
227              
228             # insert highlight tags and page break markers
229             # sstart and send stand for span start and end
230 0           my ( $sstart, $send, $last_sstart, $last_send ) =
231             ( undef, undef, 0, 0 );
232 0 0         if($page_h) { # Some of this code *is* repeated redundantly, but it
233             # should theoretically run faster since the
234             # if($page_h) check doesn’t have to be made every
235             # time through the loop.
236 0 0 0       $prev_page != $page_no
      0        
237             ? (
238             $summary .= &$page_h($hitdoc, $page_no),
239             $need_ellipsis && ($summary .= $ellipsis)
240             ) : $need_ellipsis && !$prev_ellipsis &&
241             ($summary .= $ellipsis)
242             ;
243 0           while (@relative_starts) {
244 0           $send = shift @relative_ends;
245 0           $sstart = shift @relative_starts;
246 0 0 0       $summary .= _encode_with_pb( $self,
247             substr( $x, $last_send, $sstart - $last_send ),
248             $page_h, \$page_no, $hitdoc
249             ) unless !$last_send && !$sstart;
250 0           $summary .= $self->highlight(
251             _encode_with_pb( $self,
252             substr( $x, $sstart, $send - $sstart ),
253             $page_h, \$page_no, $hitdoc
254             )
255             );
256 0           $last_send = $send;
257             }
258 0 0         $summary .= _encode_with_pb( $self,
259             substr( $x, $last_send ),
260             $page_h, \$page_no, $hitdoc
261             ) unless $last_send == length $x;
262 0           $prev_page = $page_no;
263             }
264             else {
265 0 0 0       $need_ellipsis and !$prev_ellipsis and $summary .= $ellipsis;
266 0           while (@relative_starts) {
267 0           $send = shift @relative_ends;
268 0           $sstart = shift @relative_starts;
269 0 0 0       $summary .= $self->encode(
270             substr( $x, $last_send, $sstart - $last_send ) )
271             unless !$last_send && !$sstart;
272 0           $summary .= $self->highlight(
273             $self->encode(
274             substr( $x, $sstart, $send - $sstart )
275             )
276             );
277 0           $last_send = $send;
278             }
279 0 0         $summary .= $self->encode( substr( $x, $last_send ) )
280             unless $last_send == length $x;
281             }
282              
283 0           $prev_ellipsis = $end_with_ellipsis;
284              
285             }
286              
287 0           return $summary;
288             }
289              
290             # This is not called as a method above, because it’s a private routine that
291             # should not be overridden (it is not guaranteed to exist in future ver-
292             # sions), and it’s faster to call it as a function.
293             sub _encode_with_pb { # w/page breaks
294 0     0     my ($self, $text, $page_h, $page_no_ref, $hitdoc) = @_;
295 0           my @to_encode = split /\014/, $text, -1; # -1 to allow trailing
296 0           my $ret = ''; # null fields
297 0 0         $ret .= $self->encode(shift @to_encode) if length $to_encode[0];
298 0           for(@to_encode) {
299 0           $ret .= &$page_h($hitdoc, ++$$page_no_ref);
300 0 0         $ret .= $self->encode($_) if length;
301             }
302 0           $ret;
303             }
304              
305             sub encode {
306 0     0 1   my @__ = @_; # workaround for perl5.8.8 bug
307             &{
308 0 0         $encoder{$__[0]} or return shift(@__)->SUPER::encode(@__)
  0            
309             }($__[1])
310             }
311              
312             1;
313              
314             __END__