File Coverage

blib/lib/Text/Context.pm
Criterion Covered Total %
statement 95 95 100.0
branch 16 16 100.0
condition 1 2 50.0
subroutine 15 15 100.0
pod 10 10 100.0
total 137 138 99.2


line stmt bran cond sub pod time code
1             package Text::Context;
2              
3 4     4   248803 use strict;
  4         9  
  4         186  
4 4     4   26 use warnings;
  4         8  
  4         143  
5              
6 4     4   32080 use UNIVERSAL::require;
  4         22507  
  4         51  
7              
8             our $VERSION = "3.7";
9              
10             =head1 NAME
11              
12             Text::Context - Handle highlighting search result context snippets
13              
14             =head1 SYNOPSIS
15              
16             use Text::Context;
17              
18             my $snippet = Text::Context->new($text, @keywords);
19              
20             $snippet->keywords("foo", "bar"); # In case you change your mind
21              
22             print $snippet->as_html;
23             print $snippet->as_text;
24              
25             =head1 DESCRIPTION
26              
27             Given a piece of text and some search terms, produces an object
28             which locates the search terms in the message, extracts a reasonable-length
29             string containing all the search terms, and optionally dumps the string out
30             as HTML text with the search terms highlighted in bold.
31              
32             =head2 new
33              
34             Creates a new snippet object for holding and formatting context for
35             search terms.
36              
37             =cut
38              
39             sub new {
40 10     10 1 6732 my ($class, $text, @keywords) = @_;
41 10         70 my $self = bless { text => $text, keywords => [] }, $class;
42 10         48 $self->keywords(@keywords);
43 10         34 return $self;
44             }
45              
46             =head2 keywords
47              
48             Accessor method to get/set keywords. As the context search is done
49             case-insensitively, the keywords will be lower-cased.
50              
51             =cut
52              
53             sub keywords {
54 15     15 1 6683 my ($self, @keywords) = @_;
55 15 100       67 $self->{keywords} = [ map { s/\s+/ /g; lc $_ } @keywords ] if @keywords;
  23         82  
  23         101  
56 15         36 return @{ $self->{keywords} };
  15         76  
57             }
58              
59             =begin maintenance
60              
61             =head2 prepare_text
62              
63             Turns the text into a set of Paragraph objects, collapsing multiple
64             spaces in the text and feeding the paragraphs, in order, onto the
65             C member.
66              
67             =head2 para_class
68              
69             The Paragraph class to use. This defaults to 'Text::Context::Para'
70              
71             =end maintenance
72              
73             =cut
74              
75 166     166 1 5641 sub para_class { "Text::Context::Para" }
76              
77             sub prepare_text {
78 9     9 1 18 my $self = shift;
79 9         94 my @paras = split /\n\n/, $self->{text};
80 9         46 for (0 .. $#paras) {
81 83         153 my $x = $paras[$_];
82 83         1407 $x =~ s/\s+/ /g;
83 83         217 $self->para_class->require;
84 83         2034 push @{ $self->{text_a} }, $self->para_class->new($x, $_);
  83         307  
85             }
86             }
87              
88             =begin maintenance
89              
90             =head2 permute_keywords
91              
92             This is very clever. To determine which keywords "apply" to a given
93             paragraph, we first produce a set of all possible keyword sets. For
94             instance, given "a", "b" and "c", we want to produce
95              
96             a b c
97             a b
98             a c
99             a
100             b c
101             b
102             c
103              
104             We do this by counting in binary, and then mapping the counts onto
105             keywords.
106              
107             =end maintenance
108              
109             =cut
110              
111             sub permute_keywords {
112 98     98 1 137 my $self = shift;
113 98         103 my @permutation;
114 98         112 for my $bitstring (1 .. (2**@{ $self->{keywords} }) - 1) {
  98         296  
115 292         771 my @thisperm;
116 292         294 for my $bitmask (0 .. @{ $self->{keywords} } - 1) {
  292         859  
117 618 100       5247 push @thisperm, $self->{keywords}[$bitmask]
118             if $bitstring & 2**$bitmask;
119             }
120 292         1245 push @permutation, \@thisperm;
121             }
122 98         285 return reverse @permutation;
123             }
124              
125             =begin maintenance
126              
127             =head2 score_para / get_appropriate_paras
128              
129             Now we want to find a "score" for this paragraph, finding the best set
130             of keywords which "apply" to it. We favour keyword sets which have a
131             large number of matches (obviously a paragraph is better if it matches
132             "a" and "c" than if it just matches "a") and with multi-word keywords.
133             (A paragraph which matches "fresh cheese sandwiches" en bloc is worth
134             picking out, even if it has no other matches.)
135              
136             =end maintenance
137              
138             =cut
139              
140             sub score_para {
141 98     98 1 169 my ($self, $para) = @_;
142 98         534 my $content = $para->{content};
143 98         116 my %matches;
144              
145             # Do all the matching of keywords in advance of the boring
146             # permutation bit
147 98         1467 for my $word (@{ $self->{keywords} }) {
  98         196  
148 188         204 my $word_score = 0;
149 188 100       4359 $word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i;
150 188         559 $matches{$word} = $word_score;
151             }
152              
153             #XXX : Possible optimization: Give up if there are no matches
154              
155 98         261 for my $wordset ($self->permute_keywords) {
156 292         4985 my $this_score = 0;
157 292         951 $this_score += $matches{$_} for @$wordset;
158 292 100       784 $para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset;
159             }
160 98         198 $para->{final_score} = $#{ $para->{scoretable} };
  98         482  
161             }
162              
163             sub _set_intersection {
164 19     19   23 my %union;
165             my %isect;
166 19 100       64 for (@_) { $union{$_}++ && ($isect{$_} = $_) }
  71         234  
167 19         94 return values %isect;
168             }
169              
170             sub _set_difference {
171 18     18   27 my ($a, $b) = @_;
172 18         20 my %seen;
173 18         46 @seen{@$b} = ();
174 18         31 return grep { !exists $seen{$_} } @$a;
  34         113  
175             }
176              
177             sub get_appropriate_paras {
178 16     16 1 26 my $self = shift;
179 16         21 my @app_paras;
180 16         27 my @keywords = @{ $self->{keywords} };
  16         52  
181 408         935 my @paras =
182 16         29 sort { $b->{final_score} <=> $a->{final_score} } @{ $self->{text_a} };
  16         91  
183 16         34 for my $para (@paras) {
184 19         79 my @words = _set_intersection($para->best_keywords, @keywords);
185 19 100       60 if (@words) {
186 18         51 @keywords = _set_difference(\@keywords, \@words);
187 18         38 $para->{marked_words} = \@words;
188 18         36 push @app_paras, $para;
189 18 100       72 last if !@keywords;
190             }
191             }
192 16         46 $self->{app_paras} = [ sort { $a->{order} <=> $b->{order} } @app_paras ];
  3         15  
193 16         26 return @{ $self->{app_paras} };
  16         64  
194             }
195              
196             =head2 paras
197              
198             @paras = $self->paras($maxlen)
199              
200             Return shortened paragraphs to fit together into a snippet of at most
201             C<$maxlen> characters.
202              
203             =cut
204              
205             sub paras {
206 8     8 1 26 my $self = shift;
207 8   50     67 my $max_len = shift || 80;
208 8         34 $self->prepare_text;
209 8         233 $self->score_para($_) for @{ $self->{text_a} };
  8         57  
210 8         39 my @paras = $self->get_appropriate_paras;
211 8 100       35 return unless @paras;
212              
213             # XXX: Algorithm may get better here by considering number of marked
214             # up words as weight
215 7         20 return map { $_->slim($max_len / @paras) } $self->get_appropriate_paras;
  8         45  
216             }
217              
218             =head2 as_text
219              
220             Calculates a "representative" string which contains
221             the given search terms. If there's lots and lots of context between the
222             terms, it's replaced with an ellipsis.
223              
224             =cut
225              
226             sub as_text {
227 4     4 1 48 return join " ... ", map { $_->as_text } $_[0]->paras;
  5         262  
228             }
229              
230             =head2 as_html([ start => "", end => "" ])
231              
232             Markup the snippet as a HTML string using the specified delimiters or
233             with a default set of delimiters (Cspan class="quoted"E>).
234              
235             =cut
236              
237             sub as_html {
238 3     3 1 23 my $self = shift;
239 3         11 my %args = @_;
240              
241 3         9 my ($start, $end) = @args{qw(start end)};
242 3         17 return join " ... ", map { $_->marked_up($start, $end) } $self->paras;
  3         20  
243             }
244              
245             =head1 AUTHOR
246              
247             Original author: Simon Cozens
248              
249             Current maintainer: Tony Bowden
250              
251             =head1 BUGS and QUERIES
252              
253             Please direct all correspondence regarding this module to:
254             bug-Text-Context@rt.cpan.org
255              
256             =head1 COPYRIGHT AND LICENSE
257              
258             Copyright (C) 2002-2005 Kasei
259              
260             This program is free software; you can redistribute it and/or modify
261             it under the terms of the GNU General Public License; either version
262             2 of the License, or (at your option) any later version.
263              
264             This program is distributed in the hope that it will be useful,
265             but WITHOUT ANY WARRANTY; without even the implied warranty of
266             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
267              
268             =cut
269              
270             1;