File Coverage

blib/lib/Search/Tools/Snipper.pm

Criterion	Covered	Total	%
statement	241	353	68.2
branch	73	160	45.6
condition	24	47	51.0
subroutine	24	25	96.0
pod	2	2	100.0
total	364	587	62.0

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::Snipper;
2	16			16		173447	use Moo;
	16					61224
	16					118
3							extends 'Search::Tools::Object';
4							with 'Search::Tools::ArgNormalizer';
5	16			16		12838	use Carp;
	16					46
	16					1184
6	16			16		1145	use Data::Dump qw( dump );
	16					13463
	16					816
7	16			16		3682	use Search::Tools::XML;
	16					42
	16					566
8	16			16		112	use Search::Tools::UTF8;
	16					37
	16					1747
9	16			16		5727	use Search::Tools::Tokenizer;
	16					57
	16					674
10	16			16		9347	use Search::Tools::HeatMap;
	16					52
	16					697
11
12	16			16		127	use namespace::autoclean;
	16					36
	16					124
13
14							our $VERSION = '1.007';
15
16							# extra space here so pmvers works against $VERSION
17							our $ellip = ' ... ';
18							our $DefaultSnipper = 'offset';
19
20							#
21							# TODO allow for returning an array ref of
22							# extracts instead of joining them all with $ellip
23							#
24
25							my @attrs = qw(
26							as_sentences
27							collapse_whitespace
28							context
29							count
30							escape
31							force
32							ignore_length
33							max_chars
34							occur
35							query
36							show
37							snipper
38							strip_markup
39							treat_phrases_as_singles
40							type
41							type_used
42							use_pp
43							word_len
44							);
45
46							my %Defaults = (
47							type => $DefaultSnipper,
48							occur => 5,
49							max_chars => 300,
50							context => 8,
51							word_len => 4, # TODO still used?
52							show => 1,
53							collapse_whitespace => 1,
54							escape => 0,
55							force => 0,
56							as_sentences => 0,
57							ignore_length => 0,
58							treat_phrases_as_singles => 1,
59							strip_markup => 0,
60							);
61
62							for my $attr (@attrs) {
63							my $def = $Defaults{$attr} || undef;
64							if ( defined $def ) {
65							has( $attr => ( is => 'rw', default => sub {$def} ) );
66							}
67							else {
68							has( $attr => ( is => 'rw' ) );
69							}
70							}
71
72							sub BUILD {
73	30			30	1	200	my $self = shift;
74
75							#dump $self;
76
77	30					840	$self->{_tokenizer} = Search::Tools::Tokenizer->new(
78							re => $self->query->qp->term_re,
79							debug => $self->debug,
80							);
81
82	30					206	my $wc = $self->query->qp->word_characters;
83
84							# regexp for splitting into terms in _re()
85	30					977	$self->{_wc_regexp} = qr/[^$wc]+/io;
86
87							$self->{_qre}
88	30					247	= $self->query->terms_as_regex( $self->treat_phrases_as_singles );
89
90	30					182	$self->count(0);
91
92	30					1919	return $self;
93							}
94
95							# I tried Text::Context but that was too slow.
96							# Here are several different models.
97							# I have found that _loop() is faster for single-word queries,
98							# while _re() seems to be the best compromise between speed and accuracy.
99							# New in version 0.24 is _token() which is mostly XS and should be best.
100
101							sub _pick_snipper {
102	31			31		199	my ( $self, $text ) = @_;
103	31		66			200	my $snipper_name = $self->type || $DefaultSnipper;
104	31	100				234	if ( $self->query->qp->stemmer ) {
105	5					12	$snipper_name = 'token';
106							}
107	31					104	my $method_name = '_' . $snipper_name;
108	31					133	$self->type_used($snipper_name);
109	31			31		184	my $func = sub { shift->$method_name(@_) };
	31					170
110	31					140	return $func;
111							}
112
113							# 2 passes, excluding ' ' in the first one,
114							# is 60% faster than a single pass including ' '.
115							# likely because there are far fewer matches
116							# in either of the 2 than the 1.
117							sub _normalize_whitespace {
118	62			62		3079	$_[0] =~ s,[\n\r\t\xa0]+,\ ,go;
119	62					6761	$_[0] =~ s,\ +, ,go; # \ \ + was 16x slower on bigfile!!
120							}
121
122							sub snip {
123	33			33	1	154	my $self = shift;
124	33					92	my $text = shift;
125	33	50				147	if ( !defined $text ) {
126	0					0	croak "text required to snip";
127							}
128
129							# normalize encoding, esp for regular expressions.
130	33					159	$text = to_utf8($text);
131
132							# don't snip if we're less than the threshold
133	33	100	100			814	if ( length($text) < $self->max_chars && !$self->ignore_length ) {
134	2	50				11	if ( $self->show ) {
135	2	50				8	if ( $self->strip_markup ) {
136	0					0	return Search::Tools::XML->no_html($text);
137							}
138	2					11	return $text;
139							}
140	0					0	return '';
141							}
142
143	31	100				152	if ( $self->strip_markup ) {
144	1					12	$text = Search::Tools::XML->no_html($text);
145							}
146
147	31	50				159	if ( $self->collapse_whitespace ) {
148	31					114	_normalize_whitespace($text);
149							}
150
151							# we calculate the snipper each time since caller
152							# may set type() or snipper() between calls to snip().
153	31		33			311	my $func = $self->snipper || $self->_pick_snipper($text);
154
155	31					104	my $s = $func->( $self, $text );
156
157	31	50				874	$self->debug and warn "snipped: '$s'\n";
158
159							# sanity check
160	31	100	100			642	if ( length($s) > ( $self->max_chars * 4 ) && !$self->ignore_length ) {
		100	66
161	1					5	$s = $self->_dumb($s);
162	1	50				22	$self->debug and warn "too long. dumb snip: '$s'\n";
163							}
164							elsif ( !length($s) && !$self->ignore_length ) {
165	1					7	$s = $self->_dumb($text);
166	1	50				35	$self->debug and warn "too short. dumb snip: '$s'\n";
167							}
168
169							# escape entities before collapsing whitespace.
170	31					156	$s = $self->_escape($s);
171
172	31	50				133	if ( $self->collapse_whitespace ) {
173	31					96	_normalize_whitespace($s);
174							}
175
176	31					437	return $s;
177
178							}
179
180							sub _token {
181	30			30		72	my $self = shift;
182	30					76	my $qre = $self->{_qre};
183	30	50				899	$self->debug and warn "\$qre: $qre";
184
185	30	100				335	my $method = ( $self->{use_pp} ) ? 'tokenize_pp' : 'tokenize';
186
187							# must split phrases into OR'd regex or else no heat is generated.
188	30					78	my $qre_ORd = $qre;
189	30					214	$qre_ORd =~ s/(\\ )+/\|/g;
190	30					1422	my $heat_seeker = qr/^$qre_ORd$/;
191
192							# if stemmer is on, we must stem each token to look for a match
193	30	100				296	if ( $self->query->qp->stemmer ) {
194	5					18	my $stemmer = $self->query->qp->stemmer;
195	5					16	my $qp = $self->query->qp;
196	5					8	my $re = $heat_seeker;
197							$heat_seeker = sub {
198	486			486		959	my ($token) = @_;
199	486					1094	my $st = $stemmer->( $qp, $token->str );
200	486					6738	return $st =~ m/$re/;
201	5					30	};
202							}
203	30					31512	my $tokens = $self->{_tokenizer}->$method( $_[0], $heat_seeker );
204
205							#$self->debug and $tokens->dump;
206
207	30	50				125	return $self->_dumb( $_[0] ) unless scalar @{ $tokens->get_heat };
	30					246
208
209							my $heatmap = Search::Tools::HeatMap->new(
210							tokens => $tokens,
211							window_size => $self->{context},
212							as_sentences => $self->{as_sentences},
213							debug => $self->debug,
214							_query => $self->query,
215							_qre => $qre,
216							_treat_phrases_as_singles => $self->{treat_phrases_as_singles},
217	30					957	_stemmer => $self->query->qp->stemmer,
218							);
219
220							# reduce noise in debug
221	30					101	delete $heatmap->{_query};
222
223	30	50				943	$self->debug and warn "heatmap: " . dump $heatmap;
224
225	30					346	my $tokens_arr = $tokens->as_array;
226
227							#warn "snips: " . dump $heatmap->spans;
228	30	100				143	if ( $heatmap->has_spans ) {
229
230							# stringify positions
231	29					57	my @snips;
232	29					60	for my $span ( @{ $heatmap->spans } ) {
	29					141
233
234	58	50				1054	$self->debug and warn '>>>' . $span->{str_w_pos} . '<<<';
235	58					449	push( @snips, $span->{str} );
236							}
237	29					121	my $occur_index = $self->occur - 1;
238	29	100				111	if ( $#snips > $occur_index ) {
239	5					26	@snips = @snips[ 0 .. $occur_index ];
240							}
241	29					164	my $snip = join( $ellip, @snips );
242	29					1195	my $snips_start_with_query = $_[0] =~ m/^\Q$snip\E/;
243	29					1163	my $snips_end_with_query = $_[0] =~ m/\Q$snip\E$/;
244	29	100				169	if ( $self->{as_sentences} ) {
245	13					29	$snips_start_with_query = 1;
246	13					153	$snips_end_with_query = $snip =~ m/[\.\?\!]\s*$/;
247							}
248
249							# if we are pulling out something less than the entire
250							# text, insert ellipses...
251	29	100				123	if ( $_[0] ne $snip ) {
252	25	50				680	$self->debug and warn "extract is smaller than snip";
253	25	100				318	my $extract = join( '',
		100
254							( $snips_start_with_query ? '' : $ellip ),
255							$snip, ( $snips_end_with_query ? '' : $ellip ) );
256	25					13180	return $extract;
257							}
258							else {
259	4					488	return $snip;
260							}
261							}
262							else {
263
264							#warn "no spans. using dumb snip";
265	1					8	return $self->_dumb( $_[0] );
266							}
267
268							}
269
270							sub _get_offsets {
271	25			25		74	my $self = shift;
272	25					4588	return $self->{_tokenizer}->get_offsets( @_, $self->{_qre} );
273							}
274
275							sub _offset {
276	25			25		59	my $self = shift;
277	25					135	my $txt = shift;
278	25					124	my $offsets = $self->_get_offsets($txt);
279	25					139	my $snips = $self->_get_offset_snips( $txt, $offsets );
280	25					243	return $self->_token( join( '', @$snips ) );
281							}
282
283							sub _get_offset_snips {
284	25			25		64	my $self = shift;
285	25					60	my $txt = shift;
286	25					68	my $offsets = shift;
287
288							# grab $size chars on either side of each offset
289							# and tokenize each.
290							# $size should be nice and wide to minimize the substr() calls.
291	25					101	my $size = $self->max_chars * 10;
292
293							#warn "window size $size";
294
295	25					58	my @buf;
296	25					553	my $len = length($txt);
297	25	100				111	if ( $size > $len ) {
298
299							#warn "window bigger than document";
300	20					100	return [$txt];
301							}
302
303	5					13	my ( $seen_start, $seen_end );
304	5					19	my $last_ending = 0;
305	5					18	for my $pos (@$offsets) {
306
307	34					49	my $tmp;
308
309	34					69	my $start = $pos - int( $size / 2 );
310	34					55	my $end = $pos + int( $size / 2 );
311
312							# avoid overlaps
313	34	100	100			104	if ( $last_ending && $start < $last_ending ) {
314	26					44	$start = $last_ending + 1;
315	26					35	$end = $start + $size;
316							}
317
318							#warn "$start .. $pos .. $end";
319
320	34	100	66			113	if ( $pos > $end or $pos < $start ) {
321	23					36	next;
322							}
323
324	11					28	$last_ending = $end;
325
326							#warn "$start .. $end";
327
328							# if $pos is close to the front of $txt
329	11	100				44	if ( $start <= 0 ) {
		100
330	1	50				5	next if $seen_start++;
331
332							#warn "start";
333	1					5	$tmp = substr( $txt, 0, $size );
334							}
335
336							# if $pos is somewhere near the end
337							elsif ( $end > $len ) {
338	2	50				16	next if $seen_end++;
339
340							#warn "end";
341	2					23	$tmp = substr( $txt, ( $len - $size ) );
342							}
343
344							# default is somewhere in the ripe middle.
345							else {
346
347							#warn "middle";
348	8					182	$tmp = substr( $txt, $start, $size );
349							}
350
351	11					39	push @buf, $tmp;
352							}
353
354	5					26	return \@buf;
355							}
356
357							sub _loop {
358	0			0		0	my $self = shift;
359	0					0	my $txt = shift;
360	0					0	my $regexp = $self->{_qre};
361
362							#carp "loop snip: $txt";
363
364	0	0				0	$self->debug and carp "loop snip regexp: $regexp";
365
366	0		0			0	my $debug = $self->debug || 0;
367
368							# no matches
369	0	0				0	return $self->_dumb($txt) unless $txt =~ m/$regexp/;
370
371							#carp "loop snip: $txt";
372
373	0					0	my $context = $self->context - 1;
374	0		0			0	my $occur = $self->occur || 1;
375	0					0	my @snips;
376
377	0					0	my $notwc = $self->{_wc_regexp};
378
379	0					0	my @words = split( /($notwc)/, $txt );
380	0					0	my $count = -1;
381	0					0	my $start_again = $count;
382	0					0	my $total = 0;
383	0					0	my $first_match = 0;
384
385	0					0	WORD: for my $w (@words) {
386
387	0	0				0	if ( $debug > 1 ) {
388	0	0				0	warn ">>\n" if $count % 2;
389	0					0	warn "word: '$w'\n";
390							}
391
392	0					0	$count++;
393	0	0				0	next WORD if $count < $start_again;
394
395							# the next WORD lets us skip past the last frag we excerpted
396
397	0					0	my $last = $count - 1;
398	0					0	my $next = $count + 1;
399
400							#warn '-' x 30 . "\n";
401	0	0				0	if ( $w =~ m/^$regexp$/ ) {
402
403	0	0				0	if ( $debug > 1 ) {
404	0					0	warn "w: '$w' match: '$1'\n";
405							}
406
407	0					0	$first_match = $count;
408
409	0					0	my $before = $last - $context;
410	0	0				0	$before = 0 if $before < 0;
411	0					0	my $after = $next + $context;
412	0	0				0	$after = $#words if $after > $#words;
413
414	0	0				0	if ( $debug > 1 ) {
415	0					0	warn "$before .. $last, $count, $next .. $after\n";
416							}
417
418	0					0	my @before = @words[ $before .. $last ];
419	0					0	my @after = @words[ $next .. $after ];
420
421	0					0	my $this_snip_matches = grep {m/^$regexp$/i} ( @before, @after );
	0					0
422	0	0				0	if ($this_snip_matches) {
423	0					0	$after += $this_snip_matches;
424	0					0	@after = @words[ $next .. $after ];
425							}
426	0					0	$total += $this_snip_matches;
427	0					0	$total++; # for current $w
428
429	0					0	my $t = join( '', @before, $w, @after );
430
431	0	0				0	$t .= $ellip unless $count == $#words;
432
433	0	0				0	if ( $debug > 1 ) {
434	0					0	warn "t: $t\n";
435	0					0	warn "this_snip_matches: $this_snip_matches\n";
436	0					0	warn "total: $total\n";
437							}
438
439	0					0	push( @snips, [ $t, $this_snip_matches + 1 ] ); # +1 for $w
440	0					0	$start_again = $after;
441							}
442
443							}
444
445							# sort by match density.
446							# consistent with HeatMap and lets us find
447							# the best match, including phrases.
448	0					0	@snips = map { $_->[0] } sort { $b->[1] <=> $a->[1] } @snips;
	0					0
	0					0
449
450	0	0				0	if ( $debug > 1 ) {
451	0					0	carp "snips: " . scalar @snips;
452	0					0	carp "words: $count\n";
453	0					0	carp "grandtotal: $total\n";
454	0					0	carp "occur: $occur\n";
455	0					0	carp '-' x 50 . "\n";
456
457							}
458
459	0					0	$self->count( scalar(@snips) + $self->count );
460	0					0	my $last_snip = $occur - 1;
461	0	0				0	if ( $last_snip > $#snips ) {
462	0					0	$last_snip = $#snips;
463							}
464
465							#warn dump \@snips;
466	0					0	my $snippet = join( '', @snips[ 0 .. $last_snip ] );
467	0	0				0	$self->debug and warn "before no_start_partial: '$snippet'\n";
468
469							#_no_start_partial($snippet);
470	0	0				0	$snippet = $ellip . $snippet if $first_match;
471
472	0					0	return $snippet;
473							}
474
475							sub _re {
476
477							# get first N matches for each q, then take one of each till we have $occur
478
479	1			1		6	my $self = shift;
480	1					3	my $text = shift;
481	1					3	my @q = @{ $self->query->terms };
	1					10
482	1					7	my $occur = $self->occur;
483	1					10	my $Nchar = $self->context * $self->word_len;
484	1					3	my $total = 0;
485	1					4	my $notwc = $self->{_wc_regexp};
486
487							# get minimum number of snips necessary to meet $occur
488	1					7	my $snip_per_q = int( $occur / scalar(@q) );
489	1		50			6	$snip_per_q ||= 1;
490
491	1					4	my ( %snips, @snips, %ranges, $snip_starts_with_query );
492	1					4	$snip_starts_with_query = 0;
493
494	1					5	Q: for my $q (@q) {
495	1					9	$snips{$q} = { t => [], offset => [] };
496
497	1	50				43	$self->debug and warn "$q : $snip_starts_with_query";
498
499							# try simple regexp first, then more complex if we don't match
500							next Q
501							if $self->_re_match( \$text, $self->query->regex_for($q)->plain,
502	1	50				23	\$total, $snips{$q}, \%ranges, $Nchar, $snip_per_q,
503							\$snip_starts_with_query );
504
505	0	0				0	$self->debug and warn "failed match on plain regexp";
506
507	0					0	pos $text = 0; # do we really need to reset this?
508
509	0	0				0	unless (
510							$self->_re_match(
511							\$text, $self->query->regex_for($q)->html,
512							\$total, $snips{$q},
513							\%ranges, $Nchar,
514							$snip_per_q, \$snip_starts_with_query
515							)
516							)
517							{
518	0	0				0	$self->debug and warn "failed match on html regexp";
519							}
520
521							}
522
523	1	50				6	return $self->_dumb($text) unless $total;
524
525							# get all snips into one array in order they appeared in $text
526							# should be a max of $snip_per_q in any one $q snip array
527							# so we should have at least $occur in total,
528							# which we'll splice() if need be.
529
530	1					3	my %offsets;
531	1					8	for my $q ( keys %snips ) {
532	1					3	my @s = @{ $snips{$q}->{t} };
	1					9
533	1					4	my @o = @{ $snips{$q}->{offset} };
	1					7
534
535	1					3	my $i = 0;
536	1					5	for (@s) {
537	1					8	$offsets{$_} = $o[$i];
538							}
539							}
540	1					10	@snips = sort { $offsets{$a} <=> $offsets{$b} } keys %offsets;
	0					0
541
542							# max = $occur
543	1					6	@snips = splice @snips, 0, $occur;
544
545	1	50				41	$self->debug and warn dump( \@snips );
546
547	1					19	my $snip = join( $ellip, @snips );
548	1	50				10	_no_start_partial($snip) unless $snip_starts_with_query;
549	1	50				51	$snip = $ellip . $snip unless $text =~ m/^\Q$snips[0]/i;
550	1	50				69	$snip .= $ellip unless $text =~ m/\Q$snips[-1]$/i;
551
552	1					15	$self->count( scalar(@snips) + $self->count );
553
554	1					47	return $snip;
555
556							}
557
558							sub _re_match {
559
560							# the .{0,$Nchar} regexp slows things WAY down. so just match,
561							# then use pos() to get chars before and after.
562
563							# if escape = 0 and if prefix or suffix contains a < or >,
564							# try to include entire tagset.
565
566	1			1		4	my ( $self, $text, $re, $total, $snips, $ranges, $Nchar, $max_snips,
567							$snip_starts_with_query )
568							= @_;
569
570	1					5	my $t_len = length $$text;
571
572	1					3	my $cnt = 0;
573
574	1	50				24	if ( $self->debug ) {
575	0					0	warn "re_match regexp: >$re<\n";
576	0					0	warn "max_snips: $max_snips\n";
577							}
578
579	1					95	RE: while ( $$text =~ m/$re/g ) {
580
581	1					6	my $pos = pos $$text;
582	1					3	my $before_match = $1;
583	1					3	my $match = $2;
584	1					4	my $after_match = $3;
585	1					1	$cnt++;
586	1					2	my $len = length $match;
587	1					4	my $blen = length $before_match;
588	1	50				19	if ( $self->debug ) {
589	0					0	warn "re: '$re'\n";
590	0					0	warn "\$1 = '$before_match' = ", ord($before_match), "\n";
591	0					0	warn "\$2 = '$match'\n";
592	0					0	warn "\$3 = '$after_match' = ", ord($after_match), "\n";
593	0					0	warn "pos = $pos\n";
594	0					0	warn "len = $len\n";
595	0					0	warn "blen= $blen\n";
596							}
597
598	1	0	33			22	if ( $self->debug && exists $ranges->{$pos} ) {
599	0					0	warn "already found $pos\n";
600							}
601
602	1	50				9	next RE if exists $ranges->{$pos};
603
604	1		50			4	my $start_match = $pos - $len - ( $blen || 1 );
605	1	50				3	$start_match = 0 if $start_match < 0;
606
607	1	50				3	$$snip_starts_with_query = 1 if $start_match == 0;
608
609							# sanity
610	1	50				24	$self->debug
611							and warn "match should be [$start_match $len]: '",
612							substr( $$text, $start_match, $len ), "'\n";
613
614	1	50				17	my $prefix_start
615							= $start_match < $Nchar
616							? 0
617							: $start_match - $Nchar;
618
619	1					4	my $prefix_len = $start_match - $prefix_start;
620
621							#$prefix_len++; $prefix_len++;
622
623	1					5	my $suffix_start = $pos - length($after_match);
624	1					3	my $suffix_len = $Nchar;
625	1					4	my $end = $suffix_start + $suffix_len;
626
627							# if $end extends beyond, that's ok, substr compensates
628
629	1					190	$ranges->{$_}++ for ( $prefix_start .. $end );
630	1					9	my $prefix = substr( $$text, $prefix_start, $prefix_len );
631	1					5	my $suffix = substr( $$text, $suffix_start, $suffix_len );
632
633	1	50				41	if ( $self->debug ) {
634	0					0	warn "prefix_start = $prefix_start\n";
635	0					0	warn "prefix_len = $prefix_len\n";
636	0					0	warn "start_match = $start_match\n";
637	0					0	warn "len = $len\n";
638	0					0	warn "pos = $pos\n";
639	0					0	warn "char = $Nchar\n";
640	0					0	warn "suffix_start = $suffix_start\n";
641	0					0	warn "suffix_len = $suffix_len\n";
642	0					0	warn "end = $end\n";
643	0					0	warn "prefix: '$prefix'\n";
644	0					0	warn "match: '$match'\n";
645	0					0	warn "suffix: '$suffix'\n";
646							}
647
648							# try and get whole words if we split one up
649							# _no_*_partial does this more rudely
650
651							# might be faster to do m/(\S)*$prefix/i
652							# but we couldn't guarantee position accuracy
653							# e.g. if $prefix matched more than once in $$text,
654							# we might pull the wrong \S*
655
656	1	50	33			20	unless ( $prefix =~ m/^\s/
657							or substr( $$text, $prefix_start - 1, 1 ) =~ m/(\s)/ )
658							{
659	0		0			0	while ( --$prefix_start >= 0
660							and substr( $$text, $prefix_start, 1 ) =~ m/(\S)/ )
661							{
662	0					0	my $onemorechar = $1;
663
664							#warn "adding $onemorechar to prefix\n";
665	0					0	$prefix = $onemorechar . $prefix;
666
667							#last if $prefix_start <= 0 or $onemorechar !~ /\S/;
668							}
669							}
670
671							# do same for suffix
672
673							# We get error here under -w
674							# about substr outside of string -- is $end undefined sometimes??
675
676	1	50	33			18	unless ( $suffix =~ m/\s$/ or substr( $$text, $end, 1 ) =~ m/(\s)/ ) {
677	1		66			15	while ( $end <= $t_len
678							and substr( $$text, $end++, 1 ) =~ m/(\S)/ )
679							{
680
681	3					14	my $onemore = $1;
682
683							#warn "adding $onemore to suffix\n";
684							#warn "before '$suffix'\n";
685	3					30	$suffix .= $onemore;
686
687							#warn "after '$suffix'\n";
688							}
689							}
690
691							# will likely fail to include one half of tagset if other is complete
692	1	50				11	unless ( $self->escape ) {
693	1					4	my $sanity = 0;
694	1					7	my @l = ( $prefix =~ /(<)/g );
695	1					5	my @r = ( $prefix =~ /(>)/g );
696	1					7	while ( scalar @l != scalar @r ) {
697
698	0					0	@l = ( $prefix =~ /(<)/g );
699	0					0	@r = ( $prefix =~ /(>)/g );
700							last
701							if scalar @l
702	0	0				0	== scalar @r; # don't take any more than we need to
703
704	0					0	my $onemorechar = substr( $$text, $prefix_start--, 1 );
705
706							#warn "tagfix: adding $onemorechar to prefix\n";
707	0					0	$prefix = $onemorechar . $prefix;
708	0	0				0	last if $prefix_start <= 0;
709	0	0				0	last if $sanity++ > 100;
710
711							}
712
713	1					5	$sanity = 0;
714	1		33			8	while ( $suffix =~ /<(\w+)/ && $suffix !~ /<\/$1>/ ) {
715
716	0					0	my $onemorechar = substr( $$text, $end, 1 );
717
718							#warn "tagfix: adding $onemorechar to suffix\n";
719	0					0	$suffix .= $onemorechar;
720	0	0				0	last if ++$end > $t_len;
721	0	0				0	last if $sanity++ > 100;
722
723							}
724							}
725
726							# warn "prefix: '$prefix'\n";
727							# warn "match: '$match'\n";
728							# warn "suffix: '$suffix'\n";
729
730	1					10	my $context = join( '', $prefix, $match, $suffix );
731
732							#warn "context is '$context'\n";
733
734	1					6	push( @{ $snips->{t} }, $context );
	1					7
735	1					3	push( @{ $snips->{offset} }, $prefix_start );
	1					6
736
737	1					4	$$total++;
738
739							# warn '-' x 40, "\n";
740
741	1	50				9	last if $cnt >= $max_snips;
742							}
743
744	1					8	return $cnt;
745							}
746
747							sub _dumb {
748
749							# just grap the first X chars and return
750
751	3			3		11	my $self = shift;
752	3	100				295	return '' unless $self->show;
753
754	1					4	my $txt = shift;
755	1					3	my $max = $self->max_chars;
756	1					5	$self->type_used('dumb');
757
758	1					4	my $show = substr( $txt, 0, $max );
759	1					4	_no_end_partial($show);
760	1					3	$show .= $ellip;
761
762	1					5	$self->count( 1 + $self->count );
763
764	1					4	return $show;
765
766							}
767
768							sub _no_start_partial {
769	1			1		8	$_[0] =~ s/^\S+\s+//gs;
770							}
771
772							sub _no_end_partial {
773	1			1		5	$_[0] =~ s/\s+\S+$//gs;
774							}
775
776							sub _escape {
777	31	50		31		151	if ( $_[0]->escape ) {
778	0					0	return Search::Tools::XML->escape( $_[1] );
779							}
780							else {
781	31					124	return $_[1];
782							}
783							}
784
785							1;
786							__END__