File Coverage

blib/lib/Text/Context/Porter.pm
Criterion Covered Total %
statement 77 98 78.5
branch 10 18 55.5
condition 3 6 50.0
subroutine 14 14 100.0
pod 4 4 100.0
total 108 140 77.1


line stmt bran cond sub pod time code
1             package Text::Context::Porter;
2 1     1   723 use base 'Text::Context';
  1         3  
  1         714  
3 1     1   3430 use strict;
  1         2  
  1         31  
4 1     1   15 use warnings;
  1         2  
  1         23  
5 1     1   747 use Lingua::Stem::En;
  1         2313  
  1         416  
6              
7             our $VERSION = "1.1";
8              
9             =head1 NAME
10              
11             Text::Context::Porter - Text::Context with inflection awareness
12              
13             =head1 SYNOPSIS
14              
15             use Text::Context::Porter;
16              
17             my $snippet = Text::Context::Porter->new($text, @keywords);
18              
19             $snippet->keywords("foo", "bar"); # In case you change your mind
20              
21             print $snippet->as_html;
22             print $snippet->as_text;
23              
24             =head1 DESCRIPTION
25              
26             Given a piece of text and some search terms, produces an object
27             which locates the search terms in the message, extracts a reasonable-length
28             string containing all the search terms, and optionally dumps the string out
29             as HTML text with the search terms highlighted in bold.
30              
31             However, unlike the ordinary C, this subclass is able to
32             highlight terms in the document which are inflected variants of the search
33             terms. For instance, searching for "testing" should highlight "test",
34             "tested" and so on.
35              
36             =cut
37              
38             sub keywords {
39 1     1 1 366 my ($self, @keywords) = @_;
40 1 50       4 if (@keywords) {
41 2         3 $self->{keywords} = Lingua::Stem::En::stem({ -words =>
42 1         4 [ map {s/\s+/ /g; lc $_} @keywords ]
  2         10  
43             });
44             }
45 1         199 return @{$self->{keywords}};
  1         4  
46             }
47              
48 12     12 1 142 sub para_class {"Text::Context::Para::Porter"}
49              
50             sub paras {
51 2     2 1 646 my $self = shift;
52 2   50     10 my $max_len = shift || 150;
53 2         10 $self->prepare_text;
54 2         26 $self->score_para($_) for @{$self->{text_a}};
  2         10  
55 2         13 my @paras = $self->get_appropriate_paras;
56 2         133 return map { $_->slim($max_len / @paras) } @paras;
  2         8  
57              
58             }
59              
60             sub score_para {
61 9     9 1 13 my ($self, $para) = @_;
62 9         19 my $content = $para->{content};
63 9         12 $content = join " ", @{Lingua::Stem::En::stem({ -words => [ split /\W+/, $para->{content} ] })};
  9         90  
64 9         1692 my %matches;
65             # Do all the matching of keywords in advance of the boring
66             # permutation bit
67 9         11 for my $word (@{$self->{keywords}}) {
  9         24  
68 18         20 my $word_score = 0;
69 18 100       212 $word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i;
70 18         49 $matches{$word} = $word_score;
71             }
72             #XXX : Possible optimization: Give up if there are no matches
73            
74 9         33 for my $wordset ($self->permute_keywords) {
75 27         314 my $this_score = 0;
76 27         70 $this_score += $matches{$_} for @$wordset;
77 27 100       79 $para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset;
78             }
79 9         17 $para->{final_score} = $#{$para->{scoretable}};
  9         39  
80             }
81              
82             package Text::Context::Para::Porter;
83 1     1   5 use constant DEFAULT_START_TAG => '';
  1         2  
  1         65  
84 1     1   4 use constant DEFAULT_END_TAG => "";
  1         2  
  1         36  
85 1     1   4 use base 'Text::Context::Para';
  1         2  
  1         811  
86 1     1   9356 use HTML::Entities;
  1         2  
  1         692  
87              
88             sub marked_up {
89 1     1   6 my $self = shift;
90 1   50     8 my $start_tag = shift || DEFAULT_START_TAG;
91 1   50     7 my $end_tag = shift || DEFAULT_END_TAG;
92 1         5 my $content = $self->as_text;
93 1         5 my %words = map {$_ => 1} @{$self->{marked_words}};
  2         28  
  1         3  
94 1         4 my $output;
95 1         11 for my $word (split /(\s+)/, $content) {
96 17 100       57 if ($word =~ /\S/) {
97 9         9 my ($stemmed) = @{Lingua::Stem::En::stem({ -words => [ $word ]})};
  9         37  
98 9 100       491 if ($words{$stemmed}) {
99 2         7 $word = $start_tag . encode_entities($word) . $end_tag;
100             } else {
101 7         21 $word = encode_entities($word);
102             }
103             }
104 17         109 $output .= $word;
105             }
106 1         12 return $output;
107             }
108              
109             sub slim {
110 2     2   4 my ($self, $max_weight) = @_;
111 2         5 $self->{content} =~ s/^\s+//;
112 2         9 $self->{content} =~ s/\s+$//;
113 2 50       14 return $self if length $self->{content} <= $max_weight;
114 0           my %words = map {$_ => 1 } @{$self->{marked_words}};
  0            
  0            
115 0           my $old_length = -1;
116 0           my $this_length = length $self->{content};
117 0           do {{
118 0 0         if ($old_length == $this_length) { return $self; } # Give up
  0            
  0            
119 0           $old_length = $this_length;
120              
121 0           $self->{content} =~ /^\W*(\w+)/;
122 0           my $stemmed = Lingua::Stem::En::stem({ -words => [$1]});
123 0           $stemmed = $stemmed->[0];
124 0 0         if (!exists $words{$stemmed}) {
125 0           $self->{content} =~ s/^\W*(\w+)\W*/.../
126             };
127              
128 0           $self->{content} =~ /(\w+)\W*$/;
129 0           $stemmed = Lingua::Stem::En::stem({ -words => [$1]});
130 0 0         if (!exists $words{$stemmed}) {
131 0           $self->{content} =~ s/(\w+)\W*$/.../ ;
132             };
133 0           $this_length = length $self->{content};
134             }} until ($this_length <= $max_weight);
135              
136 0           return $self;
137             }
138             =head1 COPYRIGHT
139              
140             Copyright (C) 2004,2006 Simon Cozens
141              
142             You may use and redistribute this module under the same terms as Perl
143             itself.
144              
145             =cut
146              
147             1;