File Coverage

blib/lib/Text/Compare.pm

Criterion	Covered	Total	%
statement	111	116	95.6
branch	11	20	55.0
condition	1	3	33.3
subroutine	17	17	100.0
pod	10	10	100.0
total	150	166	90.3

line	stmt	bran	cond	sub	pod	time	code
1							########################################################################
2							#
3							# Text::Compare
4							#
5							# Copyright 2005, Marcus Thiesen (marcus@thiesen.org) All rights reserved.
6							# 2007, Serguei Trouchelle (stro@railways.dp.ua)
7							#
8							# This program is free software; you can redistribute it and/or modify
9							# it under the terms of either:
10							#
11							# a) the GNU General Public License as published by the Free Software
12							# Foundation; either version 1, or (at your option) any later
13							# version, or
14							#
15							# b) the "Artistic License" which comes with Perl.
16							#
17							# On Debian GNU/Linux systems, the complete text of the GNU General
18							# Public License can be found in `/usr/share/common-licenses/GPL' and
19							# the Artistic Licence in `/usr/share/common-licenses/Artistic'.
20							#
21							# This program is distributed in the hope that it will be useful,
22							# but WITHOUT ANY WARRANTY; without even the implied warranty of
23							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
24							#
25							########################################################################
26
27							# 2007/06/22 stro v1.02
28							# Fixed bug 15329 (https://rt.cpan.org/Ticket/Display.html?id=15329)
29							# Fixed bug 21587 (https://rt.cpan.org/Ticket/Display.html?id=21587)
30							# Fixed bug 21588 (https://rt.cpan.org/Ticket/Display.html?id=21588)
31							#
32
33							# 2007/06/23 stro v1.03
34							# Fixed POD
35							# License added to meta.yml
36
37							package Text::Compare;
38
39							=pod
40
41							=head1 NAME
42
43							Text::Compare - Language sensitive text comparison
44
45							=head1 SYNOPSIS
46
47							use Text::Compare;
48							# the instant way:
49							my $tc = new Text::Compare( memoize => 1, strip_html => 0 );
50
51							my $sim = $tc->similarity($text_a, $text_b);
52							#$sim will be between 0 and 1
53
54							# second way (cache lists):
55							my $tc2 = new Text::Compare( strip_html => 1 );
56
57							# make a language sensitive word hash:
58							my %wordhash = $tc2->get_words($some_text);
59
60							$tc2->first_list(\%wordhash);
61
62							foreach my $list (@wordlists) {
63							#list is a hashref
64							$tc2->second_list($list);
65
66							print $tc2->similarity();
67							}
68
69							# third way (cache texts)
70							my $tc3 = new Text::Compare();
71
72							$tc3->first($some_text);
73							$tc3->second($some_other_text);
74
75							print $tc3->similarity;
76
77
78
79							=head1 DESCRIPTION
80
81							Text::Compare is an attempt to write a high speed text compare tool
82							based on Vector comparision which uses language dependend stopwords.
83							Text::Compare uses Lingua::Identify to find the language of the
84							given texts, then uses Lingua::StopWords to get the stopwords for the
85							given language and finally uses Linuga::Stem to find word stems.
86
87							=cut
88
89	2			2		51839	use strict;
	2					4
	2					64
90	2			2		11	use warnings;
	2					4
	2					65
91
92	2			2		1995	use Lingua::Identify qw(:language_identification :language_manipulation);
	2					128322
	2					455
93	2			2		1360	use Lingua::StopWords;
	2					569
	2					84
94	2			2		1396	use Lingua::Stem;
	2					11616
	2					89
95
96	2			2		1416	use Sparse::Vector;
	2					13567
	2					84
97
98	2			2		15	use Carp;
	2					4
	2					1990
99
100							our $VERSION = '1.03';
101
102							=head1 METHODS
103
104							=over
105
106							=item new( memoize => , strip_html => )
107
108							Creates a new Text::Compare object. Per default, Text::Compare
109							usese memoize to cache some of the calls. See L for
110							details. If you don't want that to happen, initialize it with memoize
111							=> 0. Furthermore, Text::Compare uses HTML::Strip to stip off the HTML
112							found in the text. If you are sure that you don't have any HTML in
113							your data or simply want to use it, deactivate it with strip_html =>
114							0.
115
116							=cut
117
118							sub new {
119	2			2	1	596	my $class = shift;
120	2					6	my @args = @_;
121
122	2					36	my $self = {
123							'word_count' => 0,
124							'word_index' => {},
125							'word_list' => [],
126							'list' => [],
127							'cache' => {},
128							'memoize' => 0,
129							'strip_html' => 1,
130							'first' => {},
131							'second' => {},
132
133							@args,
134
135							};
136	2					7	$self = bless $self, $class;
137
138	2					12	deactivate_all_languages();
139	2					26	activate_language('da');
140	2					16	activate_language('de');
141	2					20	activate_language('en');
142	2					17	activate_language('fr');
143	2					18	activate_language('it');
144	2					18	activate_language('no');
145	2					21	activate_language('pt');
146	2					22	activate_language('sv');
147
148	2	50				31	if ($self->{'memoize'}) {
149	0					0	require Memoize;
150	0					0	import Memoize;
151	0					0	memoize('get_words');
152	0					0	memoize('langof');
153	0					0	memoize('Lingua::Stem::stem');
154							}
155
156	2	50				9	if ($self->{'strip_html'}) {
157	2					1331	require HTML::Strip;
158							}
159
160	2					9215	return $self;
161							}
162
163							=item similarity($text_a, $text_b)
164
165							Compares both texts and returns a similarity value between 0 and
166							1. Text::Compare does all this language magic, therefore two texts
167							which address the same topic but are in different languages might get
168							relatively high values.
169
170							=cut
171
172							sub similarity {
173	4			4	1	1703	my $self = shift;
174	4					7	my $first = shift;
175	4					6	my $second = shift;
176
177	4	50				15	$self->first($first) if defined $first;
178	4	50				11	$self->second($second) if defined $second;
179
180	4					14	$self->make_word_list();
181	4					8	my $v1 = $self->make_vector( shift @{$self->{'list'}} );
	4					16
182	4					12	my $v2 = $self->make_vector( shift @{$self->{'list'}} );
	4					14
183
184	4					21	return $self->cosine( $v1, $v2 );
185							}
186
187							=item first
188
189							=cut
190
191							sub first {
192	1			1	1	8	my $self = shift;
193	1					3	my $first = shift;
194
195	1					5	$self->first_list($self->get_words($first));
196							}
197
198							=item first_list
199
200							=cut
201
202							sub first_list {
203	2			2	1	52	my $self = shift;
204	2					4	my $list = shift;
205
206	2	50				14	$self->{'first'} = $list if ($list);
207	2					10	return $self->{'first'};
208							}
209
210							=item second
211
212							=cut
213
214							sub second {
215	1			1	1	10	my $self = shift;
216	1					3	my $second = shift;
217
218	1					4	$self->second_list($self->get_words($second));
219							}
220
221							=item second_list
222
223							=cut
224
225							sub second_list {
226	2			2	1	7	my $self = shift;
227	2					5	my $list = shift;
228
229	2	50				10	$self->{'second'} = $list if ($list);
230	2					8	return $self->{'second'};
231							}
232
233							=item cosine
234
235							=cut
236
237							sub cosine {
238	4			4	1	6	my $self = shift;
239
240	4					8	my ( $vec1, $vec2 ) = @_;
241
242	4					13	$vec1->normalize;
243	4					311	$vec2->normalize;
244	4					261	return $vec1->dot( $vec2 ); # inner product
245							}
246
247							=item make_vector
248
249							=cut
250
251							sub make_vector {
252	8			8	1	167	my $self = shift;
253	8					13	my $href = shift;
254	8					38	my %words = %$href;
255	8					41	my $vector = new Sparse::Vector;
256
257	8					77	while (my ($w,$value) = each %words ) {
258	48	100				574	next unless defined $self->{'word_index'}->{$w};
259	40					135	$vector->set($self->{'word_index'}->{$w}, $value);
260							}
261
262	8					94	return $vector;
263							}
264
265							=item make_word_list
266
267							=cut
268
269							sub make_word_list {
270	4			4	1	7	my $self = shift;
271	4					7	my %all_words;
272
273	4					216	$self->{'list'} = [];
274
275	4					8	my %words1 = %{$self->{'first'}};
	4					36
276	4					9	push @{$self->{'list'}}, \%words1;
	4					13
277	4					21	%all_words = %words1;
278
279	4					11	my %words2 = %{$self->{'second'}};
	4					20
280
281	4					8	push @{$self->{'list'}}, \%words2;
	4					10
282	4					16	foreach my $k ( keys %words2 ) {
283	24					48	$all_words{$k} += $words2{$k};
284							}
285
286							# create a lookup hash of word to position
287	4					11	my %lookup;
288	4					30	my @sorted_words = sort keys %all_words;
289	4					27	@lookup{@sorted_words} = (1..$#sorted_words );
290
291	4					10	$self->{'word_index'} = \%lookup;
292	4					13	$self->{'word_list'} = \@sorted_words;
293	4					17	$self->{'word_count'} = scalar @sorted_words;
294							}
295
296							=item get_words
297
298							=cut
299
300							sub get_words {
301	4			4	1	18	my $self = shift;
302	4		33			21	my $text = shift \|\| carp "Need Text as an argument to get_words\n";
303
304	4	50				15	if ($self->{'strip_html'}) {
305	4					24	my $hs = HTML::Strip->new();
306	4					205	$text = $hs->parse($text);
307	4					83	$hs->eof;
308							}
309
310	4					40	my $lang = langof( $text );
311
312	4	50				6375	my $stopwords = Lingua::StopWords::getStopWords(lc($lang ? $lang : 'en'));
313	4	50				2905	my $stemmer = Lingua::Stem->new(-locale => uc($lang ? $lang : 'en'));
314
315	24					306	return { map { $_ => 1 }
	32					4107
316	32					37859	grep { ! exists $$stopwords{$_} }
317	32					50	map { $stemmer->stem( $_ )->[0] }
318	32					86	map { lc $_ }
319	4					317	map { /([a-zA-Z\-']+)/ }
320							split /\s+/s, $text };
321							}
322
323							1;
324
325							=back
326
327							=head1 LANGUAGES
328
329							Text::Compare uses the set of languages which is common to
330							Lingua::Identify, Lingua::Stem and Lingua::StopWords, namely:
331
332							=over 4
333
334							=item da
335
336							=item de
337
338							=item en
339
340							=item fr
341
342							=item it
343
344							=item no
345
346							=item pt
347
348							=item sv
349
350							=back
351
352							=head1 AUTHORS
353
354							Marcus Thiesen, C<< >>
355
356							Serguei Trouchelle C<< >>
357
358							=head1 BUGS
359
360							Please report any bugs or feature requests to
361							C, or through the web interface at
362							L.
363							I will be notified, and then you'll automatically be notified of progress on
364							your bug as I make changes.
365
366							=head1 ACKNOWLEDGEMENTS
367
368							The actual code is heavilly based on Search::VectorSpace by
369							Maciej Ceglowski.
370
371							=head1 COPYRIGHT
372
373							Copyright 2005 Marcus Thiesen, All Rights Reserved.
374
375							Copyright 2007 Serguei Trouchelle
376
377							=head1 LICENSE
378
379							This program is free software; you can redistribute it and/or modify it
380							under the same terms as Perl itself.
381
382							=cut
383