File Coverage

blib/lib/Search/Tools/HiLiter.pm

Criterion	Covered	Total	%
statement	252	281	89.6
branch	61	98	62.2
condition	21	42	50.0
subroutine	21	26	80.7
pod	10	10	100.0
total	365	457	79.8

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::HiLiter;
2	16			16		17055	use Moo;
	16					60072
	16					120
3							extends 'Search::Tools::Object';
4							with 'Search::Tools::ArgNormalizer';
5	16			16		12420	use Carp;
	16					41
	16					1233
6	16			16		2529	use Search::Tools::Tokenizer;
	16					52
	16					566
7	16			16		2759	use Search::Tools::XML;
	16					55
	16					613
8	16			16		124	use Search::Tools::UTF8;
	16					43
	16					1848
9	16			16		120	use Data::Dump qw( dump );
	16					53
	16					1001
10
11	16			16		124	use namespace::autoclean;
	16					42
	16					143
12
13							our $VERSION = '1.007';
14
15							my $XML = Search::Tools::XML->new;
16
17							my @attrs = qw(
18							query
19							tag
20							class
21							style
22							text_color
23							colors
24							tty
25							ttycolors
26							no_html
27							);
28
29							for my $attr (@attrs) {
30							has $attr => ( is => 'rw' );
31							}
32
33							sub BUILD {
34	23			23	1	489	my $self = shift;
35
36	23	50				529	if ( $self->debug ) {
37	0					0	carp "debug level set at " . $self->debug;
38							}
39
40	23					766	$self->{_tokenizer} = Search::Tools::Tokenizer->new(
41							re => $self->query->qp->term_re,
42							debug => $self->debug,
43							);
44
45	23		100			178	$self->{tag} \|\|= 'span';
46	23		50			194	$self->{colors} \|\|= [ '#ffff99', '#99ffff', '#ffccff', '#ccccff' ];
47	23		50			159	$self->{ttycolors} \|\|= [ 'bold blue', 'bold red', 'bold green' ];
48
49	23	100				112	if ( $self->tty ) {
50	7					13	eval { require Term::ANSIColor };
	7					2159
51	7	50				30840	$self->tty(0) if $@;
52							}
53
54	23					108	$self->_build_tags;
55							}
56
57							sub terms {
58	0			0	1	0	return shift->{query}->terms;
59							}
60
61							sub keywords {
62	0			0	1	0	return @{ shift->terms };
	0					0
63							}
64
65							sub _phrases {
66	0			0		0	my $self = shift;
67	0					0	my $q = $self->{query};
68	0					0	return grep { $self->_regex_for($_)->is_phrase } @{ $q->terms };
	0					0
	0					0
69							}
70
71							sub _singles {
72	0			0		0	my $self = shift;
73	0					0	my $q = $self->{query};
74	0					0	return grep { !$self->_regex_for($_)->is_phrase } @{ $q->terms };
	0					0
	0					0
75							}
76
77							sub _kworder {
78	48			48		108	my $self = shift;
79	48					118	my $q = $self->{query};
80	48					166	my $qstr = $q->str;
81	48	100				210	if ( exists $self->{_kworder_cache}->{$qstr} ) {
82	25					64	return @{ $self->{_kworder_cache}->{$qstr} };
	25					155
83							}
84
85							# do phrases first so that duplicates privilege phrases
86	23					60	my ( @phrases, @singles );
87
88	23					49	for ( @{ $q->terms } ) {
	23					117
89	60	100				224	if ( $self->_regex_for($_)->is_phrase ) {
90	15					57	push @phrases, $_;
91							}
92							else {
93	45					179	push @singles, $_;
94							}
95							}
96
97	23					153	$self->{_kworder_cache}->{$qstr} = [ @phrases, @singles ];
98
99	23					131	return ( @phrases, @singles );
100							}
101
102							sub _build_tags {
103	23			23		52	my $self = shift;
104
105	23					60	my $t = {};
106	23					45	my @colors = @{ $self->colors };
	23					113
107	23					56	my @ttycolors = @{ $self->ttycolors };
	23					98
108	23					90	my $tag = $self->tag;
109
110	23					51	my $n = 0;
111	23					51	my $m = 0;
112
113	23					91	for my $q ( $self->_kworder ) {
114
115							# if tty flag is on, use ansicolor instead of html
116							# if debug flag is on, use both html and ansicolor
117
118	60					123	my ( %tags, $opener );
119	60					168	$tags{open} = '';
120	60					152	$tags{close} = '';
121	60	100				365	if ( $self->class ) {
		50
		100
122	10					41	$opener = qq/<$tag class='/ . $self->class . qq/'>/;
123							}
124							elsif ( $self->style ) {
125	0					0	$opener = qq/<$tag style='/ . $self->style . qq/'>/;
126							}
127							elsif ( $self->text_color ) {
128	6					23	$opener
129							= qq/<$tag style='color:/
130							. $self->text_color
131							. qq/;background:/
132							. $colors[$n] . qq/'>/;
133							}
134							else {
135	44					190	$opener = qq/<$tag style='background:/ . $colors[$n] . qq/'>/;
136							}
137
138	60	100				166	if ( $self->tty ) {
139	15	50	33			336	$tags{open} .= $opener if $self->debug && !$self->no_html;
140	15					173	$tags{open} .= Term::ANSIColor::color( $ttycolors[$m] );
141	15					417	$tags{close} .= Term::ANSIColor::color('reset');
142	15	50	33			559	$tags{close} .= "" if $self->debug && !$self->no_html;
143							}
144							else {
145	45					123	$tags{open} .= $opener;
146	45					116	$tags{close} .= "";
147							}
148
149	60					262	$t->{$q} = \%tags;
150
151	60	100				225	$n = 0 if ++$n > $#colors;
152	60	100				209	$m = 0 if ++$m > $#ttycolors;
153							}
154
155	23					2021	$self->{_tags} = $t;
156							}
157
158							sub open_tag {
159	69			69	1	127	my $self = shift;
160	69	50				196	my $q = shift or croak "need query to get open_tag";
161	69		50			318	return $self->{_tags}->{$q}->{open} \|\| '';
162							}
163
164							sub close_tag {
165	69			69	1	193	my $self = shift;
166	69	50				180	my $q = shift or croak "need query to get close_tag";
167	69		50			262	return $self->{_tags}->{$q}->{close} \|\| '';
168							}
169
170							sub light {
171	25			25	1	106	my $self = shift;
172	25	50				109	my $text = shift or return '';
173
174							# force upgrade. this is so regex will match ok.
175	25					157	$text = to_utf8($text);
176
177	25	100	66			161	if ( $XML->looks_like_html($text) && !$self->no_html ) {
178
179							#warn "running ->html";
180	11	50				87	if ( $self->query->qp->stemmer ) {
181	0					0	return $self->html_stemmer($text);
182							}
183	11					44	return $self->html($text);
184							}
185							else {
186
187							#warn "running ->plain";
188	14	100				135	if ( $self->query->qp->stemmer ) {
189	1					4	return $self->plain_stemmer($text);
190							}
191	13					128	return $self->plain($text);
192							}
193							}
194
195							*hilite = \&light;
196
197							sub _get_real_html {
198	29			29		46	my $self = shift;
199	29					61	my $text = shift;
200	29					47	my $re = shift;
201	29					49	my $m = {};
202	29	50				714	my $debug = $self->debug > 1 ? 1 : 0;
203
204							# $1 should be st_bound, $2 should be query, $3 should be end_bound
205							# N.B. The XS version of this algorithm is only a hair faster,
206							# since the $re is the bottleneck.
207	29					46940	while ( $$text =~ m/$re/g ) {
208
209	43					241	my $pos = pos($$text);
210
211	43	50				105	if ($debug) {
212	0					0	carp "$2 matches $re";
213	0					0	carp "\$1='$1'\n\$2='$2'\n\$3='$3'\npos=$pos";
214							}
215
216	43					191	$m->{$2}++;
217
218							# move back and consider $3 again as possible $1 for next match
219	43	50				124	if ( length($3) ) {
220	43					39006	pos($$text) = $pos - 1;
221							}
222
223							}
224
225	29					128	return $m;
226
227							}
228
229							sub _regex_for {
230	124			124		213	my $self = shift;
231	124	50				317	my $term = shift or croak "term required";
232	124	100				407	if ( exists $self->{_regex_for}->{$term} ) {
233	64					220	return $self->{_regex_for}->{$term};
234							}
235	60					304	$self->{_regex_for}->{$term} = $self->query->regex_for($term);
236	60					310	return $self->{_regex_for}->{$term};
237							}
238
239							# based on HTML::HiLiter hilite()
240							sub html {
241	11			11	1	23	my $self = shift;
242	11	50				38	my $text = shift or croak "need text to light()";
243
244							###################################################################
245							# 1. create hash of query -> [ array of real HTML to hilite ]
246							# using the prebuilt regexp
247							# 2. hilite the real HTML
248							###################################################################
249
250							## 1
251
252	11					35	my $q2real = {};
253
254							# this is going to be query => [ real_html ]
255
256							# if the query text matched in the text, then we need to
257							# use our prebuilt regexp
258	11					37	my @kworder = $self->_kworder;
259
260							# don't consider anything we've marked
261							# with a 'nohiliter' attribute
262	11					29	my $text_copy = $text;
263	11					212	$text_copy =~ s/\002.*?\003//sgi;
264
265	11					31	Q: for my $query (@kworder) {
266	29					95	my $re = $self->_regex_for($query)->html;
267	29					86	my $real = $self->_get_real_html( \$text_copy, $re );
268
269	29					158	R: for my $r ( keys %$real ) {
270	35					117	push( @{ $q2real->{$query} }, $r ) while $real->{$r}--;
	43					246
271							}
272							}
273
274							## 2
275
276	11					40	HILITE: for my $q (@kworder) {
277
278	29					81	my %uniq_reals = ();
279	29					46	$uniq_reals{$_}++ for @{ $q2real->{$q} };
	29					182
280
281	29					102	REAL: for my $real ( keys %uniq_reals ) {
282
283	35					149	$self->_add_hilite_tags( \$text, $q, $real );
284
285							}
286
287							}
288
289	11					206	return $text;
290							}
291
292							sub _add_hilite_tags {
293	35			35		72	my $self = shift;
294	35					62	my $text = shift; # reference
295	35					64	my $query = shift;
296	35					84	my $html = shift;
297
298							# $text is reference to original text
299							# $html is the real html that matched our regexp
300
301							# we still check boundaries just to be safe
302	35					219	my $st_bound = $self->query->qp->start_bound;
303	35					108	my $end_bound = $self->query->qp->end_bound;
304
305	35					98	my $o = $self->open_tag($query);
306	35					104	my $c = $self->close_tag($query);
307
308	35					90	my $safe = quotemeta($html);
309
310							# pre-fix nested tags in match
311	35					60	my $pre_fixed = $html;
312	35					131	my $tag_re = $self->query->qp->tag_re;
313	35					482	my $pre_added = $pre_fixed =~ s(${tag_re}+)$c$1$og;
314	35					131	my $len_added = length( $c . $o ) * $pre_added;
315
316							# should be same as length( $to_hilite) - length( $prefixed );
317	35					108	my $len_diff = ( length($html) - length($pre_fixed) );
318	35	100				98	$len_diff *= -1
319							if $len_diff < 0; # pre_added might be -1 if no subs were made
320	35	50				85	if ( $len_diff != $len_added ) {
321	0					0	carp "length math failed!"
322							. "len_diff = $len_diff\nlen_added = $len_added";
323							}
324
325	35					30873	while ( $$text =~ m/($st_bound)($safe)($end_bound)/g ) {
326	43					489	my $s = $1;
327	43					111	my $m = $2;
328	43					83	my $e = $3;
329	43	50				1269	if ( $self->debug > 1 ) {
330	0					0	carp "matched:\n'$s'\n'$m'\n'$e'\n"
331							. "\$1 is "
332							. ord($s)
333							. "\$3 is "
334							. ord($e);
335							}
336
337							# use substr to do what s// would normally do if pos() wasn't an issue
338							# -- is this a big speed hit?
339	43					456	my $len = length( $s . $m . $e );
340	43					206	my $pos = pos($$text);
341	43					154	my $newstring = $s . $o . $pre_fixed . $c . $e;
342	43					415	substr( $$text, $pos - $len, $len, $newstring );
343
344	43					2402	pos($$text) = $pos + length( $o . $c ) + $len_added - 1;
345
346							# adjust for new text added
347							# $pre_fixed is the hard bit, since we must take $len_added into account
348							# move back 1 to reconsider $3 as next $1
349
350							# warn "pos was $pos\nnow ", pos( $html ), "\n";
351							# warn "new: '$html'\n";
352							# warn "new text: '$newstring'\n";
353							# warn "first chars of new pos are '", substr( $html, pos($html), 10 ), "'\n";
354
355							}
356
357	35					165	$self->_clean_up_hilites( $text, $query, $o, $c, $safe );
358
359							}
360
361							# no algorithm is perfect. fix it as best we can.
362							sub _clean_up_hilites {
363
364	35			35		83	my $self = shift;
365	35					108	my ( $text, $query, $o, $c, $safe ) = @_;
366
367							# empty hilites are useless
368	35		100			2250	my $empty = ( $$text =~ s,\Q$o$c\E,,sgi ) \|\| 0;
369
370							#$self->debug and carp "looking for split entities: (&[\\w#])\Q$o\E(?:\Q$c\E)(${safe})\Q$c\E([\\w#];)";
371
372							# to be safe: in some cases we might match against entities or within tag content.
373	35		50			2684	my $ent_split = (
374							$$text
375							=~ s/(&[\w#])\Q$o\E(?:\Q$c\E)?(${safe})\Q$c\E([\w#];)/$1$2$3/igs # is i and s necessary?
376							) \|\| 0;
377
378							#$self->debug and carp "found $ent_split split entities";
379
380	35					121	my $tag_split = 0;
381	35					2200	while (
382							$$text
383							=~ m/(<[^<>])\Q$o\E($safe)\Q$c\E([^>]>)/gxsi # are these xsi flags necessary?
384							)
385							{
386
387	4					18	my $first = $1;
388	4					10	my $second = $2;
389	4					10	my $third = $3;
390	4	50				103	carp "appears to split tag: $first - $second - $third"
391							if $self->debug > 1;
392
393							# TODO this would be one place to highlight text where attributes match
394
395	4					562	$tag_split += (
396							$$text =~ s/(<[^<>])\Q$o\E($safe)\Q$c\E([^>]>)/$1$2$3/gxsi );
397
398							}
399
400							}
401
402							sub html_stemmer {
403	0			0	1	0	my $self = shift;
404	0					0	my $text = shift;
405	0					0	return $self->plain_stemmer($text);
406							}
407
408							sub plain_stemmer {
409	1			1	1	2	my $self = shift;
410	1	50				3	my $text = shift or croak "need text";
411	1					37	my $debug = $self->debug;
412
413	1					9	my @kworder = $self->_kworder;
414
415							# if stemmer is on, we must stem each token to look for a match
416	1					6	my $qre = $self->query->terms_as_regex(1);
417	1					4	$qre =~ s/(\\ )+/\\|/g; # TODO OR phrases together if (0) above?
418
419	1					142	my $re = qr/^$qre$/;
420	1					11	my $stemmer = $self->query->qp->stemmer;
421	1					4	my $qp = $self->query->qp;
422	1					3	my $wildcard = $qp->wildcard;
423							my $heat_seeker = sub {
424	34			34		66	my ($token) = @_;
425	34					86	my $st = $stemmer->( $qp, $token->str );
426	34					490	return $st =~ m/$re/;
427	1					6	};
428
429	1					23	my $tokens = $self->{_tokenizer}->tokenize( $text, $heat_seeker );
430
431							# create a new string
432	1					2	my $buf;
433
434							# iterate over tokens, looking for any hot ones,
435							# and create a new string
436	1					20	TOK: while ( my $tok = $tokens->next ) {
437	69					123	my $str = $tok->str;
438	69	100				139	if ( $tok->is_hot ) {
439
440							# find the matching query term
441
442	3					7	my $stemmed = $stemmer->( $qp, $str );
443	3					24	my $found_match = 0;
444	3					7	Q: for my $query (@kworder) {
445	4					11	my $regex = $self->_regex_for($query);
446	4					5	my @regex_to_try;
447
448							# if it is a phrase, try each term in the phrase
449	4	100				16	if ( $regex->is_phrase ) {
450	3					5	@regex_to_try = @{ $regex->phrase_terms };
	3					10
451							}
452							else {
453	1					2	@regex_to_try = ($regex);
454							}
455	4					7	REGEX: for my $r (@regex_to_try) {
456	6					14	my $term_re = $r->term_re;
457	6	50				12	$debug
458							and warn
459							"testing '$stemmed' against '$query' with '$term_re'";
460	6	100				33	if ( $stemmed =~ m/$term_re/ ) {
461	3					9	my $open = $self->open_tag($query);
462	3					7	my $close = $self->close_tag($query);
463	3	50				16	$debug and warn "$str is hot with match '$query'";
464	3					11	$str = $open . $str . $close;
465	3					5	$found_match = 1;
466	3					7	last Q;
467							}
468
469							}
470							}
471
472	3	50				8	if ( !$found_match ) {
473
474							# common case is phrases?
475	0	0				0	$debug and warn "failed to find match for '$stemmed'";
476
477							}
478							}
479	69					218	$buf .= $str;
480							}
481	1					86	return $buf;
482							}
483
484							# based on HTML::HiLiter plaintext()
485							sub plain {
486	13			13	1	32	my $self = shift;
487	13	50				87	my $text = shift or croak "need text to light()";
488	13					431	my $debug = $self->debug;
489	13					139	my $query_obj = $self->{query};
490	13					68	my @kworder = $self->_kworder;
491
492	13					35	my $i = 0;
493	13					37	my @markers;
494	13					38	Q: for my $query (@kworder) {
495	31					97	my $regex = $self->_regex_for($query);
496	31					114	my $re = $regex->plain;
497	31					95	my $term_re = $regex->term_re;
498	31					99	my $open = $self->open_tag($query);
499	31					117	my $close = $self->close_tag($query);
500
501							# use open/close markers rather than actual html tags
502							# because we do not want to get double matches on text
503							# like 'span' or 'style'
504	31					108	my $o = chr($i) . "\002";
505	31					253	my $c = chr($i) . "\003";
506	31					92	my $length_we_add = length( $o . $c ) - 1;
507	31					94	push @markers, [ $open, $close ];
508
509							# cache this
510	31		66			199	my $query_re = $self->{_compiled_query_regex}->{"$query"}
511							\|\| quotemeta($query);
512	31	100				168	if ( !$self->{_compiled_query_regex}->{"$query"} ) {
513	30					525	$self->{_compiled_query_regex}->{"$query"} = qr/$query_re/;
514							}
515
516	31	50				128	$debug > 1
517							and carp
518							"plain hiliter looking for: $re against '$query' in '$text'";
519
520							# because s/// fails to find duplicate instances like 'foo foo'
521							# we use a while loop and increment pos()
522
523							# this can suck into an infinite loop because increm pos()-- results
524							# in repeated match on nonwordchar: > (since we just added a tag)
525
526	31	50				87	if ($debug) {
527	0	0	0			0	if ( $text =~ m/\b$query_re\b/i && $text !~ m/$re/i ) {
528	0					0	my ($snip) = ( $text =~ m/(.....$query_re.....)/gi );
529	0					0	croak "bad regex for '$query' [$snip]: $re";
530							}
531							}
532
533	31					58	my $found_matches = 0;
534	31					1394	while ( $text =~ m/$re/g ) {
535
536	31		100			152	my $s = $1 \|\| '';
537	31		33			111	my $m = $2 \|\| $query;
538	31		50			106	my $e = $3 \|\| '';
539
540	31					62	$found_matches++;
541
542	31	50				78	$debug > 1 and carp "matched $s $m $e against $re";
543
544							# use substr to do what s/// would normally do
545							# if pos() wasn't an issue -- is this a big speed diff?
546	31					91	my $len = length( $s . $m . $e );
547	31					197	my $pos = pos($text);
548	31	50				96	$debug > 1 and carp "pos==$pos len==$len";
549	31					114	my $newstring = $s . $o . $m . $c . $e;
550	31					320	substr( $text, $pos - $len, $len, $newstring );
551
552	31	50				146	last if $pos == length $text;
553
554							# need to account for all the new chars we just added
555	31					90	pos($text) = $pos + $length_we_add;
556	31	50				1493	$debug > 1
557							and carp "length_we_add==$length_we_add pos==" . pos($text);
558
559							}
560
561	31	50				96	$debug and warn "found $found_matches matches";
562
563							# sanity check similar to Snipper->_re_snip()
564	31	0	33			90	if ( $debug and !$found_matches and $text =~ m/$query_re/ ) {
			33
565	0	0				0	$debug and warn "ERROR: regex failure for '$query'";
566	0					0	$text = $self->html($text);
567							}
568
569							# increment the marker
570	31					80	$i++;
571
572							}
573
574							# now our markers replaced with actual tags
575	13					31	$i = 0;
576	13					38	for my $set (@markers) {
577	31					75	my $ichr = quotemeta( chr($i) );
578	31					442	$text =~ s/$ichr\002/$set->[0]/g;
579	31					390	$text =~ s/$ichr\003/$set->[1]/g;
580	31					94	$i++;
581							}
582
583							#warn "plain done";
584
585	13					142	return $text;
586
587							}
588
589							1;
590							__END__