File Coverage

blib/lib/Text/CommonParts.pm
Criterion Covered Total %
statement 51 51 100.0
branch 12 12 100.0
condition 2 3 66.6
subroutine 8 8 100.0
pod 2 2 100.0
total 75 76 98.6


line stmt bran cond sub pod time code
1             package Text::CommonParts;
2              
3 3     3   47616 use strict;
  3         7  
  3         5699  
4 3     3   19 use vars qw($VERSION @EXPORT_OK);
  3         6  
  3         614  
5 3     3   29 use base qw(Exporter);
  3         7  
  3         2421  
6              
7             $VERSION = 0.5;
8             @EXPORT_OK = qw(common_parts shortest_common_parts);
9              
10             =head1 NAME
11              
12             Text::CommonParts - return the common starting parts of phrases
13              
14             =head1 SYNOPSIS
15              
16             use Text::CommonParts qw(common_parts);
17              
18             # returns "sheep"
19             common_parts("sheep shearing", "sheep dipping", "sheep rustling");
20            
21             # returns "sheep", "sheep shearing"
22             common_parts("sheep shearing", "sheep dipping", "sheep rustling", "sheep shearing shears");
23              
24             # returns "sheep"
25             shortest_common_parts("sheep shearing", "sheep dipping", "sheep rustling");
26            
27             # returns "sheep"
28             shortest_common_parts("sheep shearing", "sheep dipping", "sheep rustling", "sheep shearing shears");
29            
30              
31              
32              
33             =head1 METHODS
34              
35             =head2 common_parts
36              
37             Takes a list of phrases and returns the longest common parts.
38              
39             If a phrase shares no common parts with any other phrases then it will be returned whole.
40              
41             Given a set of phrases which have a common prt and a subset of phrases that have a
42             longer common part then both parts will be returned. e.g given
43              
44             "something good", "something bad", "something in the woodshed", "something in my eye"
45              
46             will return
47              
48             "something", "something in"
49              
50             =cut
51              
52             sub common_parts {
53 6     6 1 26 return _common_parts(1,@_);
54              
55             }
56              
57             =head2 shortest_common_parts
58              
59             Same as common_parts but will not return subsets. e.g given
60              
61             "something good", "something bad", "something in the woodshed", "something in my eye"
62              
63             will just return
64              
65             "something"
66              
67              
68              
69             =cut
70              
71              
72              
73             sub shortest_common_parts {
74 6     6 1 26 return _common_parts(0,@_);
75             }
76              
77             sub _common_parts {
78 12     12   18 my $longest = shift;
79 12         44 my @keys = sort _slength @_;
80 12 100       35 @keys = reverse @keys if !$longest;
81              
82              
83             # this fetches a list of all candidate parts
84 12         31 my %candidates = _get_candidates($longest, @keys);
85              
86 12         20 my %seen;
87             my %results;
88              
89 12         35 my @cand_keys = sort _slength keys %candidates;
90 12 100       29 @cand_keys = reverse @cand_keys if !$longest;
91              
92              
93             # note which phrases we've seen already
94 12         21 foreach my $cand (@cand_keys) {
95 24         25 my @phrases = @{$candidates{$cand}};
  24         53  
96 24         37 foreach my $match (@phrases) {
97 54 100       148 next if $seen{$match}++;
98 38         38 push @{$results{$cand}}, $match;
  38         118  
99             }
100             }
101              
102              
103             # clean up the results hash
104 12         17 my %tmp;
105 12         23 foreach my $result (keys %results) {
106 20         20 my @phrases = @{$results{$result}};
  20         45  
107             # we're golden if it's got more than two phrase attached
108             # Butif there's only one phrase since it will be
109             # the n-1th ngram of the phrases when we want the whole phrase
110             # (we'll deal with that later)
111 20 100       44 if (@phrases>1) {
112 12         38 $tmp{$result}++;
113 12         26 next;
114             }
115             # claim we've never seen it
116 8         39 delete $seen{$_} for @phrases;
117              
118             }
119 12         39 %results = %tmp;
120 12         25 %tmp = ();
121              
122             # now get anything that hasn't been matches already
123             # i.e get singletons
124 12         21 foreach my $key (@keys) {
125 38 100       87 next if $seen{$key};
126 8         20 $results{$key}++;
127             }
128              
129 12         106 return keys %results;
130             }
131              
132             sub _get_candidates {
133 12     12   24 my @keys = @_;
134              
135 12         14 my %cands;
136 12         20 for my $key (@keys) {
137 50         50 my @so_far;
138             # split each phrase up into parts
139 50         116 foreach my $part (split ' ', $key) {
140 104         135 push @so_far, $part;
141             # make the sub part
142 104         162 my $match = join(" ",@so_far);
143             # we don't wnat whole matches yet
144             # since they'll always be the longest match
145             # we'll add whole phrases that share no common parts
146             # in later
147 104 100 66     439 next if $match eq $key && !exists $cands{$match};
148             # keep it for later
149 54         53 push @{$cands{$match}}, $key;
  54         139  
150             }
151             }
152 12         60 return %cands;
153             }
154              
155             # sort by length, longest first
156             sub _slength ($$) {
157 56     56   111 return length($_[1]) <=> length($_[0]);
158             }
159              
160              
161              
162              
163              
164             =head1 AUTHOR
165              
166             Simon Wistow
167              
168             =head1 COPYRIGHT
169              
170             Copyright 2006, Simon Wistow
171              
172             Distributed under the same terms as Perl itself
173              
174             =cut
175              
176             1;