File Coverage

blib/lib/Search/Tools/Snipper.pm
Criterion Covered Total %
statement 241 353 68.2
branch 73 160 45.6
condition 24 47 51.0
subroutine 24 25 96.0
pod 2 2 100.0
total 364 587 62.0


line stmt bran cond sub pod time code
1             package Search::Tools::Snipper;
2 16     16   173447 use Moo;
  16         61224  
  16         118  
3             extends 'Search::Tools::Object';
4             with 'Search::Tools::ArgNormalizer';
5 16     16   12838 use Carp;
  16         46  
  16         1184  
6 16     16   1145 use Data::Dump qw( dump );
  16         13463  
  16         816  
7 16     16   3682 use Search::Tools::XML;
  16         42  
  16         566  
8 16     16   112 use Search::Tools::UTF8;
  16         37  
  16         1747  
9 16     16   5727 use Search::Tools::Tokenizer;
  16         57  
  16         674  
10 16     16   9347 use Search::Tools::HeatMap;
  16         52  
  16         697  
11              
12 16     16   127 use namespace::autoclean;
  16         36  
  16         124  
13              
14             our $VERSION = '1.007';
15              
16             # extra space here so pmvers works against $VERSION
17             our $ellip = ' ... ';
18             our $DefaultSnipper = 'offset';
19              
20             #
21             # TODO allow for returning an array ref of
22             # extracts instead of joining them all with $ellip
23             #
24              
25             my @attrs = qw(
26             as_sentences
27             collapse_whitespace
28             context
29             count
30             escape
31             force
32             ignore_length
33             max_chars
34             occur
35             query
36             show
37             snipper
38             strip_markup
39             treat_phrases_as_singles
40             type
41             type_used
42             use_pp
43             word_len
44             );
45              
46             my %Defaults = (
47             type => $DefaultSnipper,
48             occur => 5,
49             max_chars => 300,
50             context => 8,
51             word_len => 4, # TODO still used?
52             show => 1,
53             collapse_whitespace => 1,
54             escape => 0,
55             force => 0,
56             as_sentences => 0,
57             ignore_length => 0,
58             treat_phrases_as_singles => 1,
59             strip_markup => 0,
60             );
61              
62             for my $attr (@attrs) {
63             my $def = $Defaults{$attr} || undef;
64             if ( defined $def ) {
65             has( $attr => ( is => 'rw', default => sub {$def} ) );
66             }
67             else {
68             has( $attr => ( is => 'rw' ) );
69             }
70             }
71              
72             sub BUILD {
73 30     30 1 200 my $self = shift;
74              
75             #dump $self;
76              
77 30         840 $self->{_tokenizer} = Search::Tools::Tokenizer->new(
78             re => $self->query->qp->term_re,
79             debug => $self->debug,
80             );
81              
82 30         206 my $wc = $self->query->qp->word_characters;
83              
84             # regexp for splitting into terms in _re()
85 30         977 $self->{_wc_regexp} = qr/[^$wc]+/io;
86              
87             $self->{_qre}
88 30         247 = $self->query->terms_as_regex( $self->treat_phrases_as_singles );
89              
90 30         182 $self->count(0);
91              
92 30         1919 return $self;
93             }
94              
95             # I tried Text::Context but that was too slow.
96             # Here are several different models.
97             # I have found that _loop() is faster for single-word queries,
98             # while _re() seems to be the best compromise between speed and accuracy.
99             # New in version 0.24 is _token() which is mostly XS and should be best.
100              
101             sub _pick_snipper {
102 31     31   199 my ( $self, $text ) = @_;
103 31   66     200 my $snipper_name = $self->type || $DefaultSnipper;
104 31 100       234 if ( $self->query->qp->stemmer ) {
105 5         12 $snipper_name = 'token';
106             }
107 31         104 my $method_name = '_' . $snipper_name;
108 31         133 $self->type_used($snipper_name);
109 31     31   184 my $func = sub { shift->$method_name(@_) };
  31         170  
110 31         140 return $func;
111             }
112              
113             # 2 passes, excluding ' ' in the first one,
114             # is 60% faster than a single pass including ' '.
115             # likely because there are far fewer matches
116             # in either of the 2 than the 1.
117             sub _normalize_whitespace {
118 62     62   3079 $_[0] =~ s,[\n\r\t\xa0]+,\ ,go;
119 62         6761 $_[0] =~ s,\ +, ,go; # \ \ + was 16x slower on bigfile!!
120             }
121              
122             sub snip {
123 33     33 1 154 my $self = shift;
124 33         92 my $text = shift;
125 33 50       147 if ( !defined $text ) {
126 0         0 croak "text required to snip";
127             }
128              
129             # normalize encoding, esp for regular expressions.
130 33         159 $text = to_utf8($text);
131              
132             # don't snip if we're less than the threshold
133 33 100 100     814 if ( length($text) < $self->max_chars && !$self->ignore_length ) {
134 2 50       11 if ( $self->show ) {
135 2 50       8 if ( $self->strip_markup ) {
136 0         0 return Search::Tools::XML->no_html($text);
137             }
138 2         11 return $text;
139             }
140 0         0 return '';
141             }
142              
143 31 100       152 if ( $self->strip_markup ) {
144 1         12 $text = Search::Tools::XML->no_html($text);
145             }
146              
147 31 50       159 if ( $self->collapse_whitespace ) {
148 31         114 _normalize_whitespace($text);
149             }
150              
151             # we calculate the snipper each time since caller
152             # may set type() or snipper() between calls to snip().
153 31   33     311 my $func = $self->snipper || $self->_pick_snipper($text);
154              
155 31         104 my $s = $func->( $self, $text );
156              
157 31 50       874 $self->debug and warn "snipped: '$s'\n";
158              
159             # sanity check
160 31 100 100     642 if ( length($s) > ( $self->max_chars * 4 ) && !$self->ignore_length ) {
    100 66        
161 1         5 $s = $self->_dumb($s);
162 1 50       22 $self->debug and warn "too long. dumb snip: '$s'\n";
163             }
164             elsif ( !length($s) && !$self->ignore_length ) {
165 1         7 $s = $self->_dumb($text);
166 1 50       35 $self->debug and warn "too short. dumb snip: '$s'\n";
167             }
168              
169             # escape entities before collapsing whitespace.
170 31         156 $s = $self->_escape($s);
171              
172 31 50       133 if ( $self->collapse_whitespace ) {
173 31         96 _normalize_whitespace($s);
174             }
175              
176 31         437 return $s;
177              
178             }
179              
180             sub _token {
181 30     30   72 my $self = shift;
182 30         76 my $qre = $self->{_qre};
183 30 50       899 $self->debug and warn "\$qre: $qre";
184              
185 30 100       335 my $method = ( $self->{use_pp} ) ? 'tokenize_pp' : 'tokenize';
186              
187             # must split phrases into OR'd regex or else no heat is generated.
188 30         78 my $qre_ORd = $qre;
189 30         214 $qre_ORd =~ s/(\\ )+/\|/g;
190 30         1422 my $heat_seeker = qr/^$qre_ORd$/;
191              
192             # if stemmer is on, we must stem each token to look for a match
193 30 100       296 if ( $self->query->qp->stemmer ) {
194 5         18 my $stemmer = $self->query->qp->stemmer;
195 5         16 my $qp = $self->query->qp;
196 5         8 my $re = $heat_seeker;
197             $heat_seeker = sub {
198 486     486   959 my ($token) = @_;
199 486         1094 my $st = $stemmer->( $qp, $token->str );
200 486         6738 return $st =~ m/$re/;
201 5         30 };
202             }
203 30         31512 my $tokens = $self->{_tokenizer}->$method( $_[0], $heat_seeker );
204              
205             #$self->debug and $tokens->dump;
206              
207 30 50       125 return $self->_dumb( $_[0] ) unless scalar @{ $tokens->get_heat };
  30         246  
208              
209             my $heatmap = Search::Tools::HeatMap->new(
210             tokens => $tokens,
211             window_size => $self->{context},
212             as_sentences => $self->{as_sentences},
213             debug => $self->debug,
214             _query => $self->query,
215             _qre => $qre,
216             _treat_phrases_as_singles => $self->{treat_phrases_as_singles},
217 30         957 _stemmer => $self->query->qp->stemmer,
218             );
219              
220             # reduce noise in debug
221 30         101 delete $heatmap->{_query};
222              
223 30 50       943 $self->debug and warn "heatmap: " . dump $heatmap;
224              
225 30         346 my $tokens_arr = $tokens->as_array;
226              
227             #warn "snips: " . dump $heatmap->spans;
228 30 100       143 if ( $heatmap->has_spans ) {
229              
230             # stringify positions
231 29         57 my @snips;
232 29         60 for my $span ( @{ $heatmap->spans } ) {
  29         141  
233              
234 58 50       1054 $self->debug and warn '>>>' . $span->{str_w_pos} . '<<<';
235 58         449 push( @snips, $span->{str} );
236             }
237 29         121 my $occur_index = $self->occur - 1;
238 29 100       111 if ( $#snips > $occur_index ) {
239 5         26 @snips = @snips[ 0 .. $occur_index ];
240             }
241 29         164 my $snip = join( $ellip, @snips );
242 29         1195 my $snips_start_with_query = $_[0] =~ m/^\Q$snip\E/;
243 29         1163 my $snips_end_with_query = $_[0] =~ m/\Q$snip\E$/;
244 29 100       169 if ( $self->{as_sentences} ) {
245 13         29 $snips_start_with_query = 1;
246 13         153 $snips_end_with_query = $snip =~ m/[\.\?\!]\s*$/;
247             }
248              
249             # if we are pulling out something less than the entire
250             # text, insert ellipses...
251 29 100       123 if ( $_[0] ne $snip ) {
252 25 50       680 $self->debug and warn "extract is smaller than snip";
253 25 100       318 my $extract = join( '',
    100          
254             ( $snips_start_with_query ? '' : $ellip ),
255             $snip, ( $snips_end_with_query ? '' : $ellip ) );
256 25         13180 return $extract;
257             }
258             else {
259 4         488 return $snip;
260             }
261             }
262             else {
263              
264             #warn "no spans. using dumb snip";
265 1         8 return $self->_dumb( $_[0] );
266             }
267              
268             }
269              
270             sub _get_offsets {
271 25     25   74 my $self = shift;
272 25         4588 return $self->{_tokenizer}->get_offsets( @_, $self->{_qre} );
273             }
274              
275             sub _offset {
276 25     25   59 my $self = shift;
277 25         135 my $txt = shift;
278 25         124 my $offsets = $self->_get_offsets($txt);
279 25         139 my $snips = $self->_get_offset_snips( $txt, $offsets );
280 25         243 return $self->_token( join( '', @$snips ) );
281             }
282              
283             sub _get_offset_snips {
284 25     25   64 my $self = shift;
285 25         60 my $txt = shift;
286 25         68 my $offsets = shift;
287              
288             # grab $size chars on either side of each offset
289             # and tokenize each.
290             # $size should be nice and wide to minimize the substr() calls.
291 25         101 my $size = $self->max_chars * 10;
292              
293             #warn "window size $size";
294              
295 25         58 my @buf;
296 25         553 my $len = length($txt);
297 25 100       111 if ( $size > $len ) {
298              
299             #warn "window bigger than document";
300 20         100 return [$txt];
301             }
302              
303 5         13 my ( $seen_start, $seen_end );
304 5         19 my $last_ending = 0;
305 5         18 for my $pos (@$offsets) {
306              
307 34         49 my $tmp;
308              
309 34         69 my $start = $pos - int( $size / 2 );
310 34         55 my $end = $pos + int( $size / 2 );
311              
312             # avoid overlaps
313 34 100 100     104 if ( $last_ending && $start < $last_ending ) {
314 26         44 $start = $last_ending + 1;
315 26         35 $end = $start + $size;
316             }
317              
318             #warn "$start .. $pos .. $end";
319              
320 34 100 66     113 if ( $pos > $end or $pos < $start ) {
321 23         36 next;
322             }
323              
324 11         28 $last_ending = $end;
325              
326             #warn "$start .. $end";
327              
328             # if $pos is close to the front of $txt
329 11 100       44 if ( $start <= 0 ) {
    100          
330 1 50       5 next if $seen_start++;
331              
332             #warn "start";
333 1         5 $tmp = substr( $txt, 0, $size );
334             }
335              
336             # if $pos is somewhere near the end
337             elsif ( $end > $len ) {
338 2 50       16 next if $seen_end++;
339              
340             #warn "end";
341 2         23 $tmp = substr( $txt, ( $len - $size ) );
342             }
343              
344             # default is somewhere in the ripe middle.
345             else {
346              
347             #warn "middle";
348 8         182 $tmp = substr( $txt, $start, $size );
349             }
350              
351 11         39 push @buf, $tmp;
352             }
353              
354 5         26 return \@buf;
355             }
356              
357             sub _loop {
358 0     0   0 my $self = shift;
359 0         0 my $txt = shift;
360 0         0 my $regexp = $self->{_qre};
361              
362             #carp "loop snip: $txt";
363              
364 0 0       0 $self->debug and carp "loop snip regexp: $regexp";
365              
366 0   0     0 my $debug = $self->debug || 0;
367              
368             # no matches
369 0 0       0 return $self->_dumb($txt) unless $txt =~ m/$regexp/;
370              
371             #carp "loop snip: $txt";
372              
373 0         0 my $context = $self->context - 1;
374 0   0     0 my $occur = $self->occur || 1;
375 0         0 my @snips;
376              
377 0         0 my $notwc = $self->{_wc_regexp};
378              
379 0         0 my @words = split( /($notwc)/, $txt );
380 0         0 my $count = -1;
381 0         0 my $start_again = $count;
382 0         0 my $total = 0;
383 0         0 my $first_match = 0;
384              
385 0         0 WORD: for my $w (@words) {
386              
387 0 0       0 if ( $debug > 1 ) {
388 0 0       0 warn ">>\n" if $count % 2;
389 0         0 warn "word: '$w'\n";
390             }
391              
392 0         0 $count++;
393 0 0       0 next WORD if $count < $start_again;
394              
395             # the next WORD lets us skip past the last frag we excerpted
396              
397 0         0 my $last = $count - 1;
398 0         0 my $next = $count + 1;
399              
400             #warn '-' x 30 . "\n";
401 0 0       0 if ( $w =~ m/^$regexp$/ ) {
402              
403 0 0       0 if ( $debug > 1 ) {
404 0         0 warn "w: '$w' match: '$1'\n";
405             }
406              
407 0         0 $first_match = $count;
408              
409 0         0 my $before = $last - $context;
410 0 0       0 $before = 0 if $before < 0;
411 0         0 my $after = $next + $context;
412 0 0       0 $after = $#words if $after > $#words;
413              
414 0 0       0 if ( $debug > 1 ) {
415 0         0 warn "$before .. $last, $count, $next .. $after\n";
416             }
417              
418 0         0 my @before = @words[ $before .. $last ];
419 0         0 my @after = @words[ $next .. $after ];
420              
421 0         0 my $this_snip_matches = grep {m/^$regexp$/i} ( @before, @after );
  0         0  
422 0 0       0 if ($this_snip_matches) {
423 0         0 $after += $this_snip_matches;
424 0         0 @after = @words[ $next .. $after ];
425             }
426 0         0 $total += $this_snip_matches;
427 0         0 $total++; # for current $w
428              
429 0         0 my $t = join( '', @before, $w, @after );
430              
431 0 0       0 $t .= $ellip unless $count == $#words;
432              
433 0 0       0 if ( $debug > 1 ) {
434 0         0 warn "t: $t\n";
435 0         0 warn "this_snip_matches: $this_snip_matches\n";
436 0         0 warn "total: $total\n";
437             }
438              
439 0         0 push( @snips, [ $t, $this_snip_matches + 1 ] ); # +1 for $w
440 0         0 $start_again = $after;
441             }
442              
443             }
444              
445             # sort by match density.
446             # consistent with HeatMap and lets us find
447             # the *best* match, including phrases.
448 0         0 @snips = map { $_->[0] } sort { $b->[1] <=> $a->[1] } @snips;
  0         0  
  0         0  
449              
450 0 0       0 if ( $debug > 1 ) {
451 0         0 carp "snips: " . scalar @snips;
452 0         0 carp "words: $count\n";
453 0         0 carp "grandtotal: $total\n";
454 0         0 carp "occur: $occur\n";
455 0         0 carp '-' x 50 . "\n";
456              
457             }
458              
459 0         0 $self->count( scalar(@snips) + $self->count );
460 0         0 my $last_snip = $occur - 1;
461 0 0       0 if ( $last_snip > $#snips ) {
462 0         0 $last_snip = $#snips;
463             }
464              
465             #warn dump \@snips;
466 0         0 my $snippet = join( '', @snips[ 0 .. $last_snip ] );
467 0 0       0 $self->debug and warn "before no_start_partial: '$snippet'\n";
468              
469             #_no_start_partial($snippet);
470 0 0       0 $snippet = $ellip . $snippet if $first_match;
471              
472 0         0 return $snippet;
473             }
474              
475             sub _re {
476              
477             # get first N matches for each q, then take one of each till we have $occur
478              
479 1     1   6 my $self = shift;
480 1         3 my $text = shift;
481 1         3 my @q = @{ $self->query->terms };
  1         10  
482 1         7 my $occur = $self->occur;
483 1         10 my $Nchar = $self->context * $self->word_len;
484 1         3 my $total = 0;
485 1         4 my $notwc = $self->{_wc_regexp};
486              
487             # get minimum number of snips necessary to meet $occur
488 1         7 my $snip_per_q = int( $occur / scalar(@q) );
489 1   50     6 $snip_per_q ||= 1;
490              
491 1         4 my ( %snips, @snips, %ranges, $snip_starts_with_query );
492 1         4 $snip_starts_with_query = 0;
493              
494 1         5 Q: for my $q (@q) {
495 1         9 $snips{$q} = { t => [], offset => [] };
496              
497 1 50       43 $self->debug and warn "$q : $snip_starts_with_query";
498              
499             # try simple regexp first, then more complex if we don't match
500             next Q
501             if $self->_re_match( \$text, $self->query->regex_for($q)->plain,
502 1 50       23 \$total, $snips{$q}, \%ranges, $Nchar, $snip_per_q,
503             \$snip_starts_with_query );
504              
505 0 0       0 $self->debug and warn "failed match on plain regexp";
506              
507 0         0 pos $text = 0; # do we really need to reset this?
508              
509 0 0       0 unless (
510             $self->_re_match(
511             \$text, $self->query->regex_for($q)->html,
512             \$total, $snips{$q},
513             \%ranges, $Nchar,
514             $snip_per_q, \$snip_starts_with_query
515             )
516             )
517             {
518 0 0       0 $self->debug and warn "failed match on html regexp";
519             }
520              
521             }
522              
523 1 50       6 return $self->_dumb($text) unless $total;
524              
525             # get all snips into one array in order they appeared in $text
526             # should be a max of $snip_per_q in any one $q snip array
527             # so we should have at least $occur in total,
528             # which we'll splice() if need be.
529              
530 1         3 my %offsets;
531 1         8 for my $q ( keys %snips ) {
532 1         3 my @s = @{ $snips{$q}->{t} };
  1         9  
533 1         4 my @o = @{ $snips{$q}->{offset} };
  1         7  
534              
535 1         3 my $i = 0;
536 1         5 for (@s) {
537 1         8 $offsets{$_} = $o[$i];
538             }
539             }
540 1         10 @snips = sort { $offsets{$a} <=> $offsets{$b} } keys %offsets;
  0         0  
541              
542             # max = $occur
543 1         6 @snips = splice @snips, 0, $occur;
544              
545 1 50       41 $self->debug and warn dump( \@snips );
546              
547 1         19 my $snip = join( $ellip, @snips );
548 1 50       10 _no_start_partial($snip) unless $snip_starts_with_query;
549 1 50       51 $snip = $ellip . $snip unless $text =~ m/^\Q$snips[0]/i;
550 1 50       69 $snip .= $ellip unless $text =~ m/\Q$snips[-1]$/i;
551              
552 1         15 $self->count( scalar(@snips) + $self->count );
553              
554 1         47 return $snip;
555              
556             }
557              
558             sub _re_match {
559              
560             # the .{0,$Nchar} regexp slows things WAY down. so just match,
561             # then use pos() to get chars before and after.
562              
563             # if escape = 0 and if prefix or suffix contains a < or >,
564             # try to include entire tagset.
565              
566 1     1   4 my ( $self, $text, $re, $total, $snips, $ranges, $Nchar, $max_snips,
567             $snip_starts_with_query )
568             = @_;
569              
570 1         5 my $t_len = length $$text;
571              
572 1         3 my $cnt = 0;
573              
574 1 50       24 if ( $self->debug ) {
575 0         0 warn "re_match regexp: >$re<\n";
576 0         0 warn "max_snips: $max_snips\n";
577             }
578              
579 1         95 RE: while ( $$text =~ m/$re/g ) {
580              
581 1         6 my $pos = pos $$text;
582 1         3 my $before_match = $1;
583 1         3 my $match = $2;
584 1         4 my $after_match = $3;
585 1         1 $cnt++;
586 1         2 my $len = length $match;
587 1         4 my $blen = length $before_match;
588 1 50       19 if ( $self->debug ) {
589 0         0 warn "re: '$re'\n";
590 0         0 warn "\$1 = '$before_match' = ", ord($before_match), "\n";
591 0         0 warn "\$2 = '$match'\n";
592 0         0 warn "\$3 = '$after_match' = ", ord($after_match), "\n";
593 0         0 warn "pos = $pos\n";
594 0         0 warn "len = $len\n";
595 0         0 warn "blen= $blen\n";
596             }
597              
598 1 0 33     22 if ( $self->debug && exists $ranges->{$pos} ) {
599 0         0 warn "already found $pos\n";
600             }
601              
602 1 50       9 next RE if exists $ranges->{$pos};
603              
604 1   50     4 my $start_match = $pos - $len - ( $blen || 1 );
605 1 50       3 $start_match = 0 if $start_match < 0;
606              
607 1 50       3 $$snip_starts_with_query = 1 if $start_match == 0;
608              
609             # sanity
610 1 50       24 $self->debug
611             and warn "match should be [$start_match $len]: '",
612             substr( $$text, $start_match, $len ), "'\n";
613              
614 1 50       17 my $prefix_start
615             = $start_match < $Nchar
616             ? 0
617             : $start_match - $Nchar;
618              
619 1         4 my $prefix_len = $start_match - $prefix_start;
620              
621             #$prefix_len++; $prefix_len++;
622              
623 1         5 my $suffix_start = $pos - length($after_match);
624 1         3 my $suffix_len = $Nchar;
625 1         4 my $end = $suffix_start + $suffix_len;
626              
627             # if $end extends beyond, that's ok, substr compensates
628              
629 1         190 $ranges->{$_}++ for ( $prefix_start .. $end );
630 1         9 my $prefix = substr( $$text, $prefix_start, $prefix_len );
631 1         5 my $suffix = substr( $$text, $suffix_start, $suffix_len );
632              
633 1 50       41 if ( $self->debug ) {
634 0         0 warn "prefix_start = $prefix_start\n";
635 0         0 warn "prefix_len = $prefix_len\n";
636 0         0 warn "start_match = $start_match\n";
637 0         0 warn "len = $len\n";
638 0         0 warn "pos = $pos\n";
639 0         0 warn "char = $Nchar\n";
640 0         0 warn "suffix_start = $suffix_start\n";
641 0         0 warn "suffix_len = $suffix_len\n";
642 0         0 warn "end = $end\n";
643 0         0 warn "prefix: '$prefix'\n";
644 0         0 warn "match: '$match'\n";
645 0         0 warn "suffix: '$suffix'\n";
646             }
647              
648             # try and get whole words if we split one up
649             # _no_*_partial does this more rudely
650              
651             # might be faster to do m/(\S)*$prefix/i
652             # but we couldn't guarantee position accuracy
653             # e.g. if $prefix matched more than once in $$text,
654             # we might pull the wrong \S*
655              
656 1 50 33     20 unless ( $prefix =~ m/^\s/
657             or substr( $$text, $prefix_start - 1, 1 ) =~ m/(\s)/ )
658             {
659 0   0     0 while ( --$prefix_start >= 0
660             and substr( $$text, $prefix_start, 1 ) =~ m/(\S)/ )
661             {
662 0         0 my $onemorechar = $1;
663              
664             #warn "adding $onemorechar to prefix\n";
665 0         0 $prefix = $onemorechar . $prefix;
666              
667             #last if $prefix_start <= 0 or $onemorechar !~ /\S/;
668             }
669             }
670              
671             # do same for suffix
672              
673             # We get error here under -w
674             # about substr outside of string -- is $end undefined sometimes??
675              
676 1 50 33     18 unless ( $suffix =~ m/\s$/ or substr( $$text, $end, 1 ) =~ m/(\s)/ ) {
677 1   66     15 while ( $end <= $t_len
678             and substr( $$text, $end++, 1 ) =~ m/(\S)/ )
679             {
680              
681 3         14 my $onemore = $1;
682              
683             #warn "adding $onemore to suffix\n";
684             #warn "before '$suffix'\n";
685 3         30 $suffix .= $onemore;
686              
687             #warn "after '$suffix'\n";
688             }
689             }
690              
691             # will likely fail to include one half of tagset if other is complete
692 1 50       11 unless ( $self->escape ) {
693 1         4 my $sanity = 0;
694 1         7 my @l = ( $prefix =~ /(<)/g );
695 1         5 my @r = ( $prefix =~ /(>)/g );
696 1         7 while ( scalar @l != scalar @r ) {
697              
698 0         0 @l = ( $prefix =~ /(<)/g );
699 0         0 @r = ( $prefix =~ /(>)/g );
700             last
701             if scalar @l
702 0 0       0 == scalar @r; # don't take any more than we need to
703              
704 0         0 my $onemorechar = substr( $$text, $prefix_start--, 1 );
705              
706             #warn "tagfix: adding $onemorechar to prefix\n";
707 0         0 $prefix = $onemorechar . $prefix;
708 0 0       0 last if $prefix_start <= 0;
709 0 0       0 last if $sanity++ > 100;
710              
711             }
712              
713 1         5 $sanity = 0;
714 1   33     8 while ( $suffix =~ /<(\w+)/ && $suffix !~ /<\/$1>/ ) {
715              
716 0         0 my $onemorechar = substr( $$text, $end, 1 );
717              
718             #warn "tagfix: adding $onemorechar to suffix\n";
719 0         0 $suffix .= $onemorechar;
720 0 0       0 last if ++$end > $t_len;
721 0 0       0 last if $sanity++ > 100;
722              
723             }
724             }
725              
726             # warn "prefix: '$prefix'\n";
727             # warn "match: '$match'\n";
728             # warn "suffix: '$suffix'\n";
729              
730 1         10 my $context = join( '', $prefix, $match, $suffix );
731              
732             #warn "context is '$context'\n";
733              
734 1         6 push( @{ $snips->{t} }, $context );
  1         7  
735 1         3 push( @{ $snips->{offset} }, $prefix_start );
  1         6  
736              
737 1         4 $$total++;
738              
739             # warn '-' x 40, "\n";
740              
741 1 50       9 last if $cnt >= $max_snips;
742             }
743              
744 1         8 return $cnt;
745             }
746              
747             sub _dumb {
748              
749             # just grap the first X chars and return
750              
751 3     3   11 my $self = shift;
752 3 100       295 return '' unless $self->show;
753              
754 1         4 my $txt = shift;
755 1         3 my $max = $self->max_chars;
756 1         5 $self->type_used('dumb');
757              
758 1         4 my $show = substr( $txt, 0, $max );
759 1         4 _no_end_partial($show);
760 1         3 $show .= $ellip;
761              
762 1         5 $self->count( 1 + $self->count );
763              
764 1         4 return $show;
765              
766             }
767              
768             sub _no_start_partial {
769 1     1   8 $_[0] =~ s/^\S+\s+//gs;
770             }
771              
772             sub _no_end_partial {
773 1     1   5 $_[0] =~ s/\s+\S+$//gs;
774             }
775              
776             sub _escape {
777 31 50   31   151 if ( $_[0]->escape ) {
778 0         0 return Search::Tools::XML->escape( $_[1] );
779             }
780             else {
781 31         124 return $_[1];
782             }
783             }
784              
785             1;
786             __END__