File Coverage

blib/lib/Text/Context/Porter.pm

Criterion	Covered	Total	%
statement	77	98	78.5
branch	10	18	55.5
condition	3	6	50.0
subroutine	14	14	100.0
pod	4	4	100.0
total	108	140	77.1

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Context::Porter;
2	1			1		723	use base 'Text::Context';
	1					3
	1					714
3	1			1		3430	use strict;
	1					2
	1					31
4	1			1		15	use warnings;
	1					2
	1					23
5	1			1		747	use Lingua::Stem::En;
	1					2313
	1					416
6
7							our $VERSION = "1.1";
8
9							=head1 NAME
10
11							Text::Context::Porter - Text::Context with inflection awareness
12
13							=head1 SYNOPSIS
14
15							use Text::Context::Porter;
16
17							my $snippet = Text::Context::Porter->new($text, @keywords);
18
19							$snippet->keywords("foo", "bar"); # In case you change your mind
20
21							print $snippet->as_html;
22							print $snippet->as_text;
23
24							=head1 DESCRIPTION
25
26							Given a piece of text and some search terms, produces an object
27							which locates the search terms in the message, extracts a reasonable-length
28							string containing all the search terms, and optionally dumps the string out
29							as HTML text with the search terms highlighted in bold.
30
31							However, unlike the ordinary C, this subclass is able to
32							highlight terms in the document which are inflected variants of the search
33							terms. For instance, searching for "testing" should highlight "test",
34							"tested" and so on.
35
36							=cut
37
38							sub keywords {
39	1			1	1	366	my ($self, @keywords) = @_;
40	1	50				4	if (@keywords) {
41	2					3	$self->{keywords} = Lingua::Stem::En::stem({ -words =>
42	1					4	[ map {s/\s+/ /g; lc $_} @keywords ]
	2					10
43							});
44							}
45	1					199	return @{$self->{keywords}};
	1					4
46							}
47
48	12			12	1	142	sub para_class {"Text::Context::Para::Porter"}
49
50							sub paras {
51	2			2	1	646	my $self = shift;
52	2		50			10	my $max_len = shift \|\| 150;
53	2					10	$self->prepare_text;
54	2					26	$self->score_para($_) for @{$self->{text_a}};
	2					10
55	2					13	my @paras = $self->get_appropriate_paras;
56	2					133	return map { $_->slim($max_len / @paras) } @paras;
	2					8
57
58							}
59
60							sub score_para {
61	9			9	1	13	my ($self, $para) = @_;
62	9					19	my $content = $para->{content};
63	9					12	$content = join " ", @{Lingua::Stem::En::stem({ -words => [ split /\W+/, $para->{content} ] })};
	9					90
64	9					1692	my %matches;
65							# Do all the matching of keywords in advance of the boring
66							# permutation bit
67	9					11	for my $word (@{$self->{keywords}}) {
	9					24
68	18					20	my $word_score = 0;
69	18	100				212	$word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i;
70	18					49	$matches{$word} = $word_score;
71							}
72							#XXX : Possible optimization: Give up if there are no matches
73
74	9					33	for my $wordset ($self->permute_keywords) {
75	27					314	my $this_score = 0;
76	27					70	$this_score += $matches{$_} for @$wordset;
77	27	100				79	$para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset;
78							}
79	9					17	$para->{final_score} = $#{$para->{scoretable}};
	9					39
80							}
81
82							package Text::Context::Para::Porter;
83	1			1		5	use constant DEFAULT_START_TAG => '';
	1					2
	1					65
84	1			1		4	use constant DEFAULT_END_TAG => "";
	1					2
	1					36
85	1			1		4	use base 'Text::Context::Para';
	1					2
	1					811
86	1			1		9356	use HTML::Entities;
	1					2
	1					692
87
88							sub marked_up {
89	1			1		6	my $self = shift;
90	1		50			8	my $start_tag = shift \|\| DEFAULT_START_TAG;
91	1		50			7	my $end_tag = shift \|\| DEFAULT_END_TAG;
92	1					5	my $content = $self->as_text;
93	1					5	my %words = map {$_ => 1} @{$self->{marked_words}};
	2					28
	1					3
94	1					4	my $output;
95	1					11	for my $word (split /(\s+)/, $content) {
96	17	100				57	if ($word =~ /\S/) {
97	9					9	my ($stemmed) = @{Lingua::Stem::En::stem({ -words => [ $word ]})};
	9					37
98	9	100				491	if ($words{$stemmed}) {
99	2					7	$word = $start_tag . encode_entities($word) . $end_tag;
100							} else {
101	7					21	$word = encode_entities($word);
102							}
103							}
104	17					109	$output .= $word;
105							}
106	1					12	return $output;
107							}
108
109							sub slim {
110	2			2		4	my ($self, $max_weight) = @_;
111	2					5	$self->{content} =~ s/^\s+//;
112	2					9	$self->{content} =~ s/\s+$//;
113	2	50				14	return $self if length $self->{content} <= $max_weight;
114	0						my %words = map {$_ => 1 } @{$self->{marked_words}};
	0
	0
115	0						my $old_length = -1;
116	0						my $this_length = length $self->{content};
117	0						do {{
118	0	0					if ($old_length == $this_length) { return $self; } # Give up
	0
	0
119	0						$old_length = $this_length;
120
121	0						$self->{content} =~ /^\W*(\w+)/;
122	0						my $stemmed = Lingua::Stem::En::stem({ -words => [$1]});
123	0						$stemmed = $stemmed->[0];
124	0	0					if (!exists $words{$stemmed}) {
125	0						$self->{content} =~ s/^\W(\w+)\W/.../
126							};
127
128	0						$self->{content} =~ /(\w+)\W*$/;
129	0						$stemmed = Lingua::Stem::En::stem({ -words => [$1]});
130	0	0					if (!exists $words{$stemmed}) {
131	0						$self->{content} =~ s/(\w+)\W*$/.../ ;
132							};
133	0						$this_length = length $self->{content};
134							}} until ($this_length <= $max_weight);
135
136	0						return $self;
137							}
138							=head1 COPYRIGHT
139
140							Copyright (C) 2004,2006 Simon Cozens
141
142							You may use and redistribute this module under the same terms as Perl
143							itself.
144
145							=cut
146
147							1;