File Coverage

blib/lib/Text/CommonParts.pm

Criterion	Covered	Total	%
statement	51	51	100.0
branch	12	12	100.0
condition	2	3	66.6
subroutine	8	8	100.0
pod	2	2	100.0
total	75	76	98.6

line	stmt	bran	cond	sub	pod	time	code
1							package Text::CommonParts;
2
3	3			3		47616	use strict;
	3					7
	3					5699
4	3			3		19	use vars qw($VERSION @EXPORT_OK);
	3					6
	3					614
5	3			3		29	use base qw(Exporter);
	3					7
	3					2421
6
7							$VERSION = 0.5;
8							@EXPORT_OK = qw(common_parts shortest_common_parts);
9
10							=head1 NAME
11
12							Text::CommonParts - return the common starting parts of phrases
13
14							=head1 SYNOPSIS
15
16							use Text::CommonParts qw(common_parts);
17
18							# returns "sheep"
19							common_parts("sheep shearing", "sheep dipping", "sheep rustling");
20
21							# returns "sheep", "sheep shearing"
22							common_parts("sheep shearing", "sheep dipping", "sheep rustling", "sheep shearing shears");
23
24							# returns "sheep"
25							shortest_common_parts("sheep shearing", "sheep dipping", "sheep rustling");
26
27							# returns "sheep"
28							shortest_common_parts("sheep shearing", "sheep dipping", "sheep rustling", "sheep shearing shears");
29
30
31
32
33							=head1 METHODS
34
35							=head2 common_parts
36
37							Takes a list of phrases and returns the longest common parts.
38
39							If a phrase shares no common parts with any other phrases then it will be returned whole.
40
41							Given a set of phrases which have a common prt and a subset of phrases that have a
42							longer common part then both parts will be returned. e.g given
43
44							"something good", "something bad", "something in the woodshed", "something in my eye"
45
46							will return
47
48							"something", "something in"
49
50							=cut
51
52							sub common_parts {
53	6			6	1	26	return _common_parts(1,@_);
54
55							}
56
57							=head2 shortest_common_parts
58
59							Same as common_parts but will not return subsets. e.g given
60
61							"something good", "something bad", "something in the woodshed", "something in my eye"
62
63							will just return
64
65							"something"
66
67
68
69							=cut
70
71
72
73							sub shortest_common_parts {
74	6			6	1	26	return _common_parts(0,@_);
75							}
76
77							sub _common_parts {
78	12			12		18	my $longest = shift;
79	12					44	my @keys = sort _slength @_;
80	12	100				35	@keys = reverse @keys if !$longest;
81
82
83							# this fetches a list of all candidate parts
84	12					31	my %candidates = _get_candidates($longest, @keys);
85
86	12					20	my %seen;
87							my %results;
88
89	12					35	my @cand_keys = sort _slength keys %candidates;
90	12	100				29	@cand_keys = reverse @cand_keys if !$longest;
91
92
93							# note which phrases we've seen already
94	12					21	foreach my $cand (@cand_keys) {
95	24					25	my @phrases = @{$candidates{$cand}};
	24					53
96	24					37	foreach my $match (@phrases) {
97	54	100				148	next if $seen{$match}++;
98	38					38	push @{$results{$cand}}, $match;
	38					118
99							}
100							}
101
102
103							# clean up the results hash
104	12					17	my %tmp;
105	12					23	foreach my $result (keys %results) {
106	20					20	my @phrases = @{$results{$result}};
	20					45
107							# we're golden if it's got more than two phrase attached
108							# Butif there's only one phrase since it will be
109							# the n-1th ngram of the phrases when we want the whole phrase
110							# (we'll deal with that later)
111	20	100				44	if (@phrases>1) {
112	12					38	$tmp{$result}++;
113	12					26	next;
114							}
115							# claim we've never seen it
116	8					39	delete $seen{$_} for @phrases;
117
118							}
119	12					39	%results = %tmp;
120	12					25	%tmp = ();
121
122							# now get anything that hasn't been matches already
123							# i.e get singletons
124	12					21	foreach my $key (@keys) {
125	38	100				87	next if $seen{$key};
126	8					20	$results{$key}++;
127							}
128
129	12					106	return keys %results;
130							}
131
132							sub _get_candidates {
133	12			12		24	my @keys = @_;
134
135	12					14	my %cands;
136	12					20	for my $key (@keys) {
137	50					50	my @so_far;
138							# split each phrase up into parts
139	50					116	foreach my $part (split ' ', $key) {
140	104					135	push @so_far, $part;
141							# make the sub part
142	104					162	my $match = join(" ",@so_far);
143							# we don't wnat whole matches yet
144							# since they'll always be the longest match
145							# we'll add whole phrases that share no common parts
146							# in later
147	104	100	66			439	next if $match eq $key && !exists $cands{$match};
148							# keep it for later
149	54					53	push @{$cands{$match}}, $key;
	54					139
150							}
151							}
152	12					60	return %cands;
153							}
154
155							# sort by length, longest first
156							sub _slength ($$) {
157	56			56		111	return length($_[1]) <=> length($_[0]);
158							}
159
160
161
162
163
164							=head1 AUTHOR
165
166							Simon Wistow
167
168							=head1 COPYRIGHT
169
170							Copyright 2006, Simon Wistow
171
172							Distributed under the same terms as Perl itself
173
174							=cut
175
176							1;