File Coverage

blib/lib/Text/DeDuper.pm

Criterion     Covered   Total       %
statement          21     118    17.8
branch              0      30     0.0
condition           0       6     0.0
subroutine          7      19    36.8
pod                 8       8   100.0
total              36     181    19.8


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::DeDuper - near duplicates detection module
4              
5             =head1 SYNOPSIS
6              
7             use Text::DeDuper;
8              
9             $deduper = new Text::DeDuper();
10             $deduper->add_doc("doc1", $doc1text);
11             $deduper->add_doc("doc2", $doc2text);
12            
13             @similar_docs = $deduper->find_similar($doc3text);
14              
15             ...
16              
17             # delete near duplicates from an array of texts
18             $deduper = new Text::DeDuper();
19             foreach $text (@texts)
20             {
21             next if $deduper->find_similar($text);
22            
23             $deduper->add_doc($i++, $text);
24             push @no_near_duplicates, $text;
25             }
26              
27             =head1 DESCRIPTION
28              
29             This module uses the resemblance measure proposed by Andrei Z. Broder et al.
30             (http://www.ra.ethz.ch/CDstore/www6/Technical/Paper205/Paper205.html) to detect
31             similar (near-duplicate) documents based on their text.
32              
33             A note of caution: the module only works correctly with languages in which texts
34             can be tokenised into words by detecting sequences of alphabetical characters.
35             It is therefore unlikely to give good results for languages such as Chinese.
36              
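The resemblance of two documents is the ratio of the number of n-gram
digests the documents share to the size of the union of their digest sets
(the Jaccard coefficient), which is what C<find_similar> computes against
its database. A minimal self-contained sketch of the measure for two
arrays of unique digests (the C<resemblance> helper below is illustrative
and not part of the module's API):

    # Jaccard-style resemblance of two sets of unique n-gram digests
    sub resemblance
    {
        my ($set_a, $set_b) = @_;      # array refs of unique digests
        my %in_a = map { $_ => 1 } @$set_a;
        # grep in scalar context counts the digests present in both sets
        my $intersection = grep { $in_a{$_} } @$set_b;
        my $union = @$set_a + @$set_b - $intersection;
        return $union > 0 ? $intersection / $union : 0;
    }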
37             =cut
38              
39             package Text::DeDuper;
40              
41 1     1   23183 use strict;
  1         1  
  1         33  
42 1     1   4 use warnings;
  1         2  
  1         26  
43 1     1   4 use vars qw($VERSION);
  1         5  
  1         53  
44              
45 1     1   880 use Digest::MD4;
  1         1108  
  1         64  
46 1     1   1145 use Encode;
  1         13149  
  1         1344  
47             use Carp;       # provides croak(), used for error reporting below
48             $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d", @r };
49              
50             =head1 METHODS
51              
52             =head2 new (CONSTRUCTOR)
53              
54             $deduper = new Text::DeDuper();
55              
56             Create a new DeDuper instance. Supported attributes are described below, in the
57             I<ATTRIBUTES> section.
58              
59             =cut
60              
61             sub new
62             {
63 0     0 1   my $class = shift;
64 0           my %options = @_;
65              
66 0           my $self = bless {
67             ngram_size => 5,
68             sim_trsh => 0.2,
69             encoding => 'utf8',
70             _stoplist => {},
71             _digest_count => {},
72             _doc_ids => {},
73             }, $class;
74              
75 0           $self->stoplist( $options{stoplist});
76 0           $self->ngram_size($options{ngram_size});
77 0           $self->sim_trsh( $options{sim_trsh});
78 0           $self->encoding( $options{encoding});
79              
80 0           return $self;
81             }
82              
83
84              
85             =head2 add_doc
86              
87             $deduper->add_doc($document_id, $document_text);
88              
89             Add a new document to the DeDuper's database. The C<$document_id> must be
90             unique for each document.
91              
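C<add_doc> croaks when the ID is already present, so when the texts have
no natural identifiers a simple counter is a safe source of unique IDs
(an illustrative sketch):

    my $id = 0;
    foreach my $text (@texts)
    {
        $deduper->add_doc('doc' . $id++, $text);
    }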
92             =cut
93              
94             sub add_doc
95             {
96 0     0 1   my $self = shift;
97 0           my $docid = shift;
98 0           my $text = shift;
99              
100 0 0         croak("duplicate document id '$docid'")
101             if defined $self->{_digest_count}->{$docid};
102            
103 0           my @tokens = $self->_tokenise($text);
104 0           my @filtered_tokens = $self->_apply_stoplist(@tokens);
105 0           my @digests = $self->_build_ngram_digests(@filtered_tokens);
106              
107 0           $self->{_digest_count}->{$docid} = scalar(@digests);
108 0           foreach my $digest (@digests)
109             {
110 0 0         if (not defined $self->{_doc_ids}->{$digest})
111 0           { $self->{_doc_ids}->{$digest} = [ $docid ]; }
112             else
113 0           { push @{$self->{_doc_ids}->{$digest}}, $docid; }
  0            
114             }
115             }
116              
117             =head2 find_similar
118              
119             $deduper->find_similar($document_text);
120              
121             Returns a (possibly empty) array of the IDs of documents in the DeDuper's
122             database that are similar to C<$document_text>. This makes it very simple to
123             test whether a near duplicate of a document is already in the database:
124              
125             if ($deduper->find_similar($document_text))
126             {
127             print "at least one near duplicate found";
128             }
129              
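The returned IDs can also be inspected directly, for example to report
which of the stored documents the new text resembles:

    foreach my $docid ($deduper->find_similar($document_text))
    {
        print "near duplicate of document '$docid'\n";
    }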
130             =cut
131              
132             sub find_similar
133             {
134 0     0 1   my $self = shift;
135 0           my $text = shift;
136              
137 0           my @tokens = $self->_tokenise($text);
138 0           my @filtered_tokens = $self->_apply_stoplist(@tokens);
139 0           my @digests = $self->_build_ngram_digests(@filtered_tokens);
140              
141             # compute intersection sizes with all documents in the database
142 0           my %intersection_size;
143 0           foreach my $digest (@digests)
144             {
145             next
146 0 0         unless defined($self->{_doc_ids}->{$digest});
147              
148 0           foreach my $docid (@{$self->{_doc_ids}->{$digest}})
  0            
149             {
150 0 0         if (defined $intersection_size{$docid})
151 0           { $intersection_size{$docid}++; }
152             else
153 0           { $intersection_size{$docid} = 1; }
154             }
155             }
156              
157 0           my @similar;
158 0           foreach my $docid (keys %intersection_size)
159             {
160             # union size
161 0           my $union_size = scalar(@digests) + $self->{_digest_count}->{$docid} -
162             $intersection_size{$docid};
163             # resemblance
164 0 0         my $resemblance = $union_size > 0 ?
165             $intersection_size{$docid} / $union_size : 0;
166             # return docs with resemblance above threshold
167 0 0         push @similar, $docid
168             if $resemblance > $self->{sim_trsh};
169             }
170              
171 0           return @similar;
172             }
173              
174             =head2 clean
175              
176             $deduper->clean()
177              
178             Removes all documents from the DeDuper's database.
179              
180             =cut
181              
182             sub clean
183             {
184 0     0 1   my $self = shift;
185            
186 0           $self->{_doc_ids} = {};
187 0           $self->{_digest_count} = {};
188             }
189              
190             =head1 ATTRIBUTES
191              
192             Attributes can be set using the constructor:
193              
194             $deduper = new Text::DeDuper(
195             ngram_size => 4,
196             encoding => 'iso-8859-1'
197             );
198              
199             ... or using the object methods:
200              
201             $deduper->ngram_size(4);
202             $deduper->encoding('iso-8859-1');
203              
204             The object methods can also be used for retrieving the values of the
205             attributes:
206              
207             $ngram_size = $deduper->ngram_size();
208             @stoplist = $deduper->stoplist();
209              
210             =over
211              
212             =item encoding
213              
214             The character encoding of the processed texts. It must be set to the correct
215             value so that alphabetical characters can be detected. Accepted values are
216             those supported by the L<Encode> module (see L<Encode::Supported>).
217              
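For example, when the processed texts come in a Central European 8-bit
encoding (the variable C<$latin2_text> is illustrative):

    $deduper->encoding('iso-8859-2');
    $deduper->add_doc('doc-cs', $latin2_text);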
218             B<Default value:> 'utf8'
219              
220             =item sim_trsh
221              
222             The similarity threshold defines how similar two documents must be to be
223             considered near duplicates. The boundary values are 0 and 1: a similarity
224             value of 1 indicates that the documents are exactly the same, while a value
225             of 0 means that the documents do not share a single n-gram.
226              
227             Any two documents will have a similarity value below the default threshold
228             unless they share a significant part of their text.
229              
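The threshold can be raised when only documents with a very strong overlap
should be reported (the value below is illustrative):

    $deduper->sim_trsh(0.5);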
230             B<Default value:> 0.2
231              
232             =item ngram_size
233              
234             The document similarity is based on how many n-grams the documents have in
235             common. An n-gram is a sequence of n consecutive words. For example, the
236             text
237              
238             she sells sea shells on the sea shore
239              
240             contains the following 5-grams:
241              
242             she sells sea shells on
243             sells sea shells on the
244             sea shells on the sea
245             shells on the sea shore
246              
247             This attribute specifies the value of n (the size of n-gram).
248              
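The n-grams are produced by sliding a window of C<ngram_size> tokens over
the tokenised text, one token at a time. A minimal sketch of the windowing
(without the digesting that the module performs internally); it prints
exactly the four 5-grams listed above:

    my $n = 5;
    my @tokens = qw(she sells sea shells on the sea shore);
    for my $i (0 .. @tokens - $n)
    {
        print join(' ', @tokens[$i .. $i + $n - 1]), "\n";
    }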
249             B<Default value:> 5
250              
251             =item stoplist
252              
253             A stoplist is a list of very frequent words in a given language (for English
254             e.g. a, the, is, ...). It is a good idea to remove stoplist words from the
255             texts before the similarity is computed, because two documents are quite
256             likely to share n-grams of frequent words even if they are not similar
257             at all.
258              
259             The stoplist can be specified either as an array of words or as the name of
260             a file in which the words are stored one per line:
261              
262             $deduper->stoplist('a', 'the', 'is', @next_stopwords);
263             $deduper->stoplist('/path/to/english_stoplist.txt');
264              
265             Do not worry if you do not have a stoplist for your language. DeDuper will do
266             a pretty good job even without one.
267              
268             B<Default value:> empty
269              
270             =back
271              
272             =cut
273              
274             sub encoding
275             {
276 0     0 1   my $self = shift;
277 0           my $encoding = shift;
278 0 0         $self->{encoding} = $encoding
279             if defined $encoding;
280 0           return $self->{encoding};
281             }
282              
283             sub sim_trsh
284             {
285 0     0 1   my $self = shift;
286 0           my $sim_trsh = shift;
287 0 0         $self->{sim_trsh} = $sim_trsh
288             if defined $sim_trsh;
289 0           return $self->{sim_trsh};
290             }
291              
292             sub ngram_size
293             {
294 0     0 1   my $self = shift;
295 0           my $ngram_size = shift;
296 0 0         $self->{ngram_size} = $ngram_size
297             if defined $ngram_size;
298 0           return $self->{ngram_size};
299             }
300              
301             sub stoplist
302             {
303 0     0 1   my $self = shift;
304 0           my @stoplist = @_;
305 0 0 0       if (@stoplist && defined $stoplist[0])
306             {
307 0 0 0       if (@stoplist == 1 && -f $stoplist[0])
308 0           { $self->_process_stoplist($stoplist[0]); }
309             else
310 0           { $self->_process_stoplist(\@stoplist); }
311             }
312 0           return sort keys %{$self->{_stoplist}};
  0            
313             }
314              
315             # process stoplist attribute value
316             sub _process_stoplist
317             {
318 0     0     my $self = shift;
319 0           my $stoplist = shift;
320              
321 0           $self->{_stoplist} = {};
322              
323             return unless
324 0 0         defined $stoplist;
325              
326             # if not array, treat as filename
327 0 0         if (ref($stoplist) ne 'ARRAY')
328             {
329 0 0         open(STOPLIST, '<', $stoplist)
330             or croak("can't open '$stoplist' for reading: $!");
331 0           while (<STOPLIST>)
332             {
333 0           chomp;
334 0           $self->{_stoplist}->{$_} = 1;
335             }
336 0           close(STOPLIST);
337             }
338             else
339             {
340 0           foreach (@$stoplist)
341 0           { $self->{_stoplist}->{$_} = 1; }
342             }
343             }
344              
345             # convert text into an array of tokens (words)
346             sub _tokenise
347             {
348 0     0     my $self = shift;
349 0           my $text = shift;
350              
351 1     1   11 no warnings;
  1         3  
  1         159  
352 0           my $dec_text = Encode::decode($self->{encoding}, $text);
353 0           my $lc_text = lc($dec_text);
354              
355 0           my @result;
356 0           while ($lc_text =~ /([[:alnum:]]+)/g)
357 0           { push @result, Encode::encode($self->{encoding}, $1); }
358 1     1   6 use warnings;
  1         2  
  1         231  
359              
360 0           return @result;
361             }
362              
363             # apply stoplist to an array of tokens (filter out stop words)
364             sub _apply_stoplist
365             {
366 0     0     my $self = shift;
367 0           my @tokens = @_;
368              
369 0           my @result;
370 0           foreach my $token (@tokens)
371             {
372 0 0         push @result, $token
373             unless $self->{_stoplist}->{$token};
374             }
375              
376 0           return @result;
377             }
378              
379             # convert an array of tokens to an array of unique digests
380             # of n-grams (built out of the tokens)
381             sub _build_ngram_digests
382             {
383 0     0     my $self = shift;
384 0           my @tokens = @_;
385              
386 0           my %digests;
387 0           for my $i (0 .. scalar(@tokens) - $self->{ngram_size})
388             {
389 0           my @ngram = @tokens[$i..($i+$self->{ngram_size}-1)];
390 0           my $digest = Digest::MD4::md4_base64(@ngram);
391 0           $digests{$digest} = 1;
392             }
393              
394 0           return keys(%digests);
395             }
396              
397             1;
398              
399             __END__