File Coverage

blib/lib/Text/Context.pm

Criterion	Covered	Total	%
statement	95	95	100.0
branch	16	16	100.0
condition	1	2	50.0
subroutine	15	15	100.0
pod	10	10	100.0
total	137	138	99.2

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Context;
2
3	4			4		248803	use strict;
	4					9
	4					186
4	4			4		26	use warnings;
	4					8
	4					143
5
6	4			4		32080	use UNIVERSAL::require;
	4					22507
	4					51
7
8							our $VERSION = "3.7";
9
10							=head1 NAME
11
12							Text::Context - Handle highlighting search result context snippets
13
14							=head1 SYNOPSIS
15
16							use Text::Context;
17
18							my $snippet = Text::Context->new($text, @keywords);
19
20							$snippet->keywords("foo", "bar"); # In case you change your mind
21
22							print $snippet->as_html;
23							print $snippet->as_text;
24
25							=head1 DESCRIPTION
26
27							Given a piece of text and some search terms, produces an object
28							which locates the search terms in the message, extracts a reasonable-length
29							string containing all the search terms, and optionally dumps the string out
30							as HTML text with the search terms highlighted in bold.
31
32							=head2 new
33
34							Creates a new snippet object for holding and formatting context for
35							search terms.
36
37							=cut
38
39							sub new {
40	10			10	1	6732	my ($class, $text, @keywords) = @_;
41	10					70	my $self = bless { text => $text, keywords => [] }, $class;
42	10					48	$self->keywords(@keywords);
43	10					34	return $self;
44							}
45
46							=head2 keywords
47
48							Accessor method to get/set keywords. As the context search is done
49							case-insensitively, the keywords will be lower-cased.
50
51							=cut
52
53							sub keywords {
54	15			15	1	6683	my ($self, @keywords) = @_;
55	15	100				67	$self->{keywords} = [ map { s/\s+/ /g; lc $_ } @keywords ] if @keywords;
	23					82
	23					101
56	15					36	return @{ $self->{keywords} };
	15					76
57							}
58
59							=begin maintenance
60
61							=head2 prepare_text
62
63							Turns the text into a set of Paragraph objects, collapsing multiple
64							spaces in the text and feeding the paragraphs, in order, onto the
65							C member.
66
67							=head2 para_class
68
69							The Paragraph class to use. This defaults to 'Text::Context::Para'
70
71							=end maintenance
72
73							=cut
74
75	166			166	1	5641	sub para_class { "Text::Context::Para" }
76
77							sub prepare_text {
78	9			9	1	18	my $self = shift;
79	9					94	my @paras = split /\n\n/, $self->{text};
80	9					46	for (0 .. $#paras) {
81	83					153	my $x = $paras[$_];
82	83					1407	$x =~ s/\s+/ /g;
83	83					217	$self->para_class->require;
84	83					2034	push @{ $self->{text_a} }, $self->para_class->new($x, $_);
	83					307
85							}
86							}
87
88							=begin maintenance
89
90							=head2 permute_keywords
91
92							This is very clever. To determine which keywords "apply" to a given
93							paragraph, we first produce a set of all possible keyword sets. For
94							instance, given "a", "b" and "c", we want to produce
95
96							a b c
97							a b
98							a c
99							a
100							b c
101							b
102							c
103
104							We do this by counting in binary, and then mapping the counts onto
105							keywords.
106
107							=end maintenance
108
109							=cut
110
111							sub permute_keywords {
112	98			98	1	137	my $self = shift;
113	98					103	my @permutation;
114	98					112	for my $bitstring (1 .. (2**@{ $self->{keywords} }) - 1) {
	98					296
115	292					771	my @thisperm;
116	292					294	for my $bitmask (0 .. @{ $self->{keywords} } - 1) {
	292					859
117	618	100				5247	push @thisperm, $self->{keywords}[$bitmask]
118							if $bitstring & 2**$bitmask;
119							}
120	292					1245	push @permutation, \@thisperm;
121							}
122	98					285	return reverse @permutation;
123							}
124
125							=begin maintenance
126
127							=head2 score_para / get_appropriate_paras
128
129							Now we want to find a "score" for this paragraph, finding the best set
130							of keywords which "apply" to it. We favour keyword sets which have a
131							large number of matches (obviously a paragraph is better if it matches
132							"a" and "c" than if it just matches "a") and with multi-word keywords.
133							(A paragraph which matches "fresh cheese sandwiches" en bloc is worth
134							picking out, even if it has no other matches.)
135
136							=end maintenance
137
138							=cut
139
140							sub score_para {
141	98			98	1	169	my ($self, $para) = @_;
142	98					534	my $content = $para->{content};
143	98					116	my %matches;
144
145							# Do all the matching of keywords in advance of the boring
146							# permutation bit
147	98					1467	for my $word (@{ $self->{keywords} }) {
	98					196
148	188					204	my $word_score = 0;
149	188	100				4359	$word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i;
150	188					559	$matches{$word} = $word_score;
151							}
152
153							#XXX : Possible optimization: Give up if there are no matches
154
155	98					261	for my $wordset ($self->permute_keywords) {
156	292					4985	my $this_score = 0;
157	292					951	$this_score += $matches{$_} for @$wordset;
158	292	100				784	$para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset;
159							}
160	98					198	$para->{final_score} = $#{ $para->{scoretable} };
	98					482
161							}
162
163							sub _set_intersection {
164	19			19		23	my %union;
165							my %isect;
166	19	100				64	for (@_) { $union{$_}++ && ($isect{$_} = $_) }
	71					234
167	19					94	return values %isect;
168							}
169
170							sub _set_difference {
171	18			18		27	my ($a, $b) = @_;
172	18					20	my %seen;
173	18					46	@seen{@$b} = ();
174	18					31	return grep { !exists $seen{$_} } @$a;
	34					113
175							}
176
177							sub get_appropriate_paras {
178	16			16	1	26	my $self = shift;
179	16					21	my @app_paras;
180	16					27	my @keywords = @{ $self->{keywords} };
	16					52
181	408					935	my @paras =
182	16					29	sort { $b->{final_score} <=> $a->{final_score} } @{ $self->{text_a} };
	16					91
183	16					34	for my $para (@paras) {
184	19					79	my @words = _set_intersection($para->best_keywords, @keywords);
185	19	100				60	if (@words) {
186	18					51	@keywords = _set_difference(\@keywords, \@words);
187	18					38	$para->{marked_words} = \@words;
188	18					36	push @app_paras, $para;
189	18	100				72	last if !@keywords;
190							}
191							}
192	16					46	$self->{app_paras} = [ sort { $a->{order} <=> $b->{order} } @app_paras ];
	3					15
193	16					26	return @{ $self->{app_paras} };
	16					64
194							}
195
196							=head2 paras
197
198							@paras = $self->paras($maxlen)
199
200							Return shortened paragraphs to fit together into a snippet of at most
201							C<$maxlen> characters.
202
203							=cut
204
205							sub paras {
206	8			8	1	26	my $self = shift;
207	8		50			67	my $max_len = shift \|\| 80;
208	8					34	$self->prepare_text;
209	8					233	$self->score_para($_) for @{ $self->{text_a} };
	8					57
210	8					39	my @paras = $self->get_appropriate_paras;
211	8	100				35	return unless @paras;
212
213							# XXX: Algorithm may get better here by considering number of marked
214							# up words as weight
215	7					20	return map { $_->slim($max_len / @paras) } $self->get_appropriate_paras;
	8					45
216							}
217
218							=head2 as_text
219
220							Calculates a "representative" string which contains
221							the given search terms. If there's lots and lots of context between the
222							terms, it's replaced with an ellipsis.
223
224							=cut
225
226							sub as_text {
227	4			4	1	48	return join " ... ", map { $_->as_text } $_[0]->paras;
	5					262
228							}
229
230							=head2 as_html([ start => "", end => "" ])
231
232							Markup the snippet as a HTML string using the specified delimiters or
233							with a default set of delimiters (Cspan class="quoted"E>).
234
235							=cut
236
237							sub as_html {
238	3			3	1	23	my $self = shift;
239	3					11	my %args = @_;
240
241	3					9	my ($start, $end) = @args{qw(start end)};
242	3					17	return join " ... ", map { $_->marked_up($start, $end) } $self->paras;
	3					20
243							}
244
245							=head1 AUTHOR
246
247							Original author: Simon Cozens
248
249							Current maintainer: Tony Bowden
250
251							=head1 BUGS and QUERIES
252
253							Please direct all correspondence regarding this module to:
254							bug-Text-Context@rt.cpan.org
255
256							=head1 COPYRIGHT AND LICENSE
257
258							Copyright (C) 2002-2005 Kasei
259
260							This program is free software; you can redistribute it and/or modify
261							it under the terms of the GNU General Public License; either version
262							2 of the License, or (at your option) any later version.
263
264							This program is distributed in the hope that it will be useful,
265							but WITHOUT ANY WARRANTY; without even the implied warranty of
266							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
267
268							=cut
269
270							1;