File Coverage

blib/lib/File/FindSimilars.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package File::FindSimilars;
2              
3             # @Author: Tong SUN, (c)2001-2008, all right reserved
4             # @Version: $Date: 2009/01/02 21:10:20 $ $Revision: 2.6 $
5             # @HomeURL: http://xpt.sourceforge.net/
6              
7             # {{{ LICENSE:
8              
9             #
10             # Permission to use, copy, modify, and distribute this software and its
11             # documentation for any purpose and without fee is hereby granted, provided
12             # that the above copyright notices appear in all copies and that both those
13             # copyright notices and this permission notice appear in supporting
14             # documentation, and that the names of author not be used in advertising or
15             # publicity pertaining to distribution of the software without specific,
16             # written prior permission. Tong Sun makes no representations about the
17             # suitability of this software for any purpose. It is provided "as is"
18             # without express or implied warranty.
19             #
20             # TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
21             # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE
22             # SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY
23             # SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
24             # RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
25             # CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
26             # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27             #
28              
29             # }}}
30              
31             # {{{ POD, Intro:
32              
33             =head1 NAME
34              
35             File::FindSimilars - Fast similar-files finder
36              
37             =head1 SYNOPSIS
38              
39             use File::FindSimilars;
40              
41             my $similars_finder =
42             File::FindSimilars->new( { fc_level => $fc_level, } );
43             $similars_finder->find_for(\@ARGV);
44             $similars_finder->similarity_check();
45              
46             =head1 DESCRIPTION
47              
48             Extremely fast file similarity checker. Similar-sized and similar-named
49             files are picked out as suspicious candidates of duplicated files.
50              
51             It uses advanced soundex vector algorithm to determine the similarity
52             between files. Generally it means that if there are n files, each having
53             approximately m words in the file name, the degree of calculation is merely
54              
55             O(n^2 * m)
56              
57             which is over thousands times faster than any existing file fingerprinting
58             technology.
59              
60             =head2 ALGORITHM EXPLANATION
61              
62             The self-test output will help you understand what the module do and what
63             would you expect from the outcome.
64              
65             $ make test
66             PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl
67             1..5 todo 2;
68             # Running under perl version 5.010000 for linux
69             # Current time local: Wed Nov 5 17:45:19 2008
70             # Current time GMT: Wed Nov 5 22:45:19 2008
71             # Using Test.pm version 1.25
72             # Testing File::FindSimilars version 2.04
73              
74             . . . .
75              
76             == Testing 2, files under test/ subdir:
77              
78             9 test/(eBook) GNU - Python Standard Library 2001.pdf
79             3 test/Audio Book - The Grey Coloured Bunnie.mp3
80             5 test/ColoredGrayBunny.ogg
81             5 test/GNU - 2001 - Python Standard Library.pdf
82             4 test/GNU - Python Standard Library (2001).rar
83             9 test/LayoutTest.java
84             3 test/PopupTest.java
85             2 test/Python Standard Library.zip
86             ok 2 # (test.pl at line 83 TODO?!)
87              
88             Note:
89              
90             - The findsimilars script will pick out similar files from them in next test.
91             - Let's assume that the number represent the file size in KB.
92              
93             == Testing 3 result should be:
94              
95             ## =========
96             3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
97             5 'ColoredGrayBunny.ogg' 'test/'
98              
99             ## =========
100             4 'GNU - Python Standard Library (2001).rar' 'test/'
101             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
102             ok 3
103              
104             Note:
105              
106             - There are 2 groups of similar files picked out by the script.
107             - The similar files are picked because their file names look similar.
108             Note that the first group looks different and spells differently too,
109             which means that the script is versatile enough to handle file names that
110             don't have space in it, and robust enough to deal with spelling mistakes.
111             - Apart from the file name, the file size plays an important role as well.
112             - There are 2 files in the second similar files group, the book files group.
113             - The file 'Python Standard Library.zip' is not considered to be similar to
114             the group because its size is not similar to the group.
115              
116             == Testing 4, if Python.zip is bigger, result should be:
117              
118             ## =========
119             3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
120             5 'ColoredGrayBunny.ogg' 'test/'
121              
122             ## =========
123             4 'Python Standard Library.zip' 'test/'
124             4 'GNU - Python Standard Library (2001).rar' 'test/'
125             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
126             ok 4
127              
128             Note:
129              
130             - There are now 3 files in the book files group.
131             - The file 'Python Standard Library.zip' is included in the
132             group because its size is now similar to the group.
133              
134             == Testing 5, if Python.zip is even bigger, result should be:
135              
136             ## =========
137             3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
138             5 'ColoredGrayBunny.ogg' 'test/'
139              
140             ## =========
141             4 'GNU - Python Standard Library (2001).rar' 'test/'
142             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
143             6 'Python Standard Library.zip' 'test/'
144             9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/'
145             ok 5
146              
147             Note:
148              
149             - There are 4 files in the book files group now.
150             - The file 'Python Standard Library.zip' is still in the group.
151             - But this time, because it is also considered to be similar to the .pdf
152             file (since their size are now similar, 6 vs 9), a 4th file the .pdf one
153             is now included in the book group.
154             - If the size of file 'Python Standard Library.zip' is 12(KB), then the
155             book files group will be split into two. Do you know why and
156             which files each group will contain?
157              
158             The File::FindSimilars package comes with a fully functional demo
159             script findsimilars. Please refer to its help file for further
160             explanations.
161              
162             This package is highly customizable. Refer to the class method C for
163             details.
164              
165             =head1 DEPENDS
166              
167             This module depends on L, but not L.
168              
169             =cut
170              
171             # }}}
172              
173             # {{{ Global Declaration:
174              
175             # ============================================================== &us ===
176             # ............................................................. Uses ...
177              
178             # -- global modules
179 1     1   102328 use strict; # !
  1         2  
  1         30  
180              
181 1     1   5 use Carp;
  1         2  
  1         63  
182 1     1   5 use Getopt::Long;
  1         5  
  1         8  
183 1     1   136 use File::Basename;
  1         2  
  1         69  
184 1     1   328 use Text::Soundex;
  0            
  0            
185              
186             use base qw(Class::Accessor::Fast);
187              
188             # -- local modules
189              
190             sub dbg_show {};
191             #use MyDbg; $MyDbg::debugging=010;
192              
193             # ============================================================== &gv ===
194             # .................................................. Global Varibles ...
195             #
196              
197             our @EXPORT = ( ); # may even omit this line
198              
199             use vars qw($progname $VERSION $debugging);
200             use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst);
201              
202             # @fileInfo: List of the following list:
203             my (
204             $N_dName, # dir name
205             $N_fName, # file name
206             $N_fSize, # file size
207             $N_fSdxl, # file soundex list, reference
208             ) = (0..9);
209              
210             # ============================================================== &cs ===
211             # ................................................. Constant setting ...
212             #
213             $VERSION = sprintf("%d.%02d", q$Revision: 2.6 $ =~ /(\d+)\.(\d+)/);
214              
215             # }}}
216              
217             # ############################################################## &ss ###
218             # ................................................ Subroutions start ...
219              
220             =head1 METHODS
221              
222             =head2 File::FindSimilars->new(\%config_param)
223              
224             Initialize the object.
225              
226             my $similars_finder = File::FindSimilars->new();
227              
228             or,
229              
230             my $similars_finder = File::FindSimilars->new( {} );
231              
232             which are the same as:
233              
234             my $similars_finder = File::FindSimilars->new( {
235             soundex_weight => 50, # percentage of weight that soundex takes,
236             # the rest is for file size
237             fc_threshold => 75, # over which files are considered similar
238             delimiter => "\n## =========\n", # delimiter between files output
239             format => "%12d '%s' %s'%s'", # file info print format
240             fc_level => 0, # file comparison level
241             verbose => 0,
242              
243             } );
244              
245             What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method.
246              
247             The C is the only class method. All the rest methods are object methods.
248              
249             =head2 Object attribute: soundex_weight([set_val])
250              
251             Percentage of weight that soundex takes, the rest of percentage is for file size.
252              
253             Provide the C to change the attribute, omitting it to retrieve the attribute value.
254              
255             =head2 Object attribute: fc_threshold([set_val])
256              
257             The threshold over which files are considered similar.
258              
259             Provide the C to change the attribute, omitting it to retrieve the attribute value.
260              
261             =head2 Object attribute: delimiter([set_val])
262              
263             Delimiter printed between file info outputs.
264              
265             Provide the C to change the attribute, omitting it to retrieve the attribute value.
266              
267             =head2 Object attribute: format([set_val])
268              
269             Format used to print file info.
270              
271             Provide the C to change the attribute, omitting it to retrieve the attribute value.
272              
273             =head2 Object attribute: fc_level([set_val])
274              
275             File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes.
276              
277             Provide the C to change the attribute, omitting it to retrieve the attribute value.
278              
279             =head2 Object attribute: verbose([set_val])
280              
281             Verbose level. Whether to output progress info: 0, no; 1, yes.
282              
283             Provide the C to change the attribute, omitting it to retrieve the attribute value.
284              
285             =cut
286              
287             File::FindSimilars
288             ->mk_accessors(qw(soundex_weight fc_threshold
289             delimiter format fc_level verbose));
290              
291             %config =
292             (
293              
294             soundex_weight => 50, # percentage of weight that soundex takes,
295             # the rest is for file size
296             fc_threshold => 75, # over which files are considered similar
297             delimiter => "\n## =========\n", # delimiter between files output
298             format => "%12d '%s' %s'%s'", # file info print format
299              
300             fc_level => 0, # file comparison level
301             verbose => 0,
302             );
303              
304              
305             # =========================================================== &s-sub ===
306              
307             sub new {
308             ref(my $class = shift)
309             and croak "new is a class method. class name needed.";
310             my ($arg_ref) = @_;
311             my $self = $class->SUPER::new({%config, %$arg_ref});
312             $config{soundex_weight} = $self->soundex_weight;
313             $config{fc_threshold} = $self->fc_threshold;
314             $config{delimiter} = $self->delimiter;
315             $config{format} = $self->format;
316             $config{fc_level} = $self->fc_level;
317             $config{verbose} = $self->verbose;
318             #$config{} = $self->;
319             return $self;
320             }
321              
322             # =========================================================== &s-sub ===
323              
324             =head2 Object method: find_for($array_ref)
325              
326             Set directory queue for similarity checking. Each entry in C<$array_ref>
327             is a directory to check into. E.g.,
328              
329             $similars_finder->find_for(\@ARGV);
330              
331             =cut
332              
333             sub find_for {
334             my ($self, $init_dirs) = @_;
335              
336             # threshold $config{fc_threshold}
337             print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n"
338             if $config{verbose};
339              
340             @filequeue = @fileInfo = ();
341             @filequeue = (@filequeue, map { [$_, ''] } @$init_dirs);
342             process_entries();
343              
344             dbg_show(100,"\@fileInfo", @fileInfo);
345             dbg_show(100,"%sdxCnt", %sdxCnt);
346             dbg_show(100,"%wrdLst", %wrdLst);
347             }
348              
349             # =========================================================== &s-sub ===
350             # I - Input: global array @filequeue
351             # Input parameters: None
352             #
353             sub process_entries {
354             my($dir, $qf) = ();
355             #warn "] inside process_entries...\n";
356              
357             while ($qf = shift @filequeue) {
358             ($dir, $_) = ($qf->[0], $qf->[1]);
359             #warn "] inside process_entries loop, $dir, $_, ...\n";
360             next if /^..?$/;
361             my $name = "$dir/$_";
362             #warn "] processing file '$name'.\n";
363             if ($name eq '-/') {
364             # get info from stdin
365             process_stdin();
366             }
367             elsif (-d $name) {
368             # a directory, process it recursively.
369             process_dir($name);
370             }
371             else {
372             process_file($dir, $_);
373             }
374             }
375             }
376              
377             # =========================================================== &s-sub ===
378             # D - Process info given from stdin, which should of form same as
379             # find -printf "%p\t%s\n"
380             #
381             sub process_stdin {
382            
383             while (<>){
384             croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$};
385             my ($dn, $fn, $size) = ( $1, $2, $3 );
386             my $fSdxl = [ get_soundex($fn) ]; # file soundex list
387             push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
388              
389             dbg_show(100,"fileInfo",@fileInfo);
390             map { $sdxCnt{$_}++ } @$fSdxl;
391             }
392             }
393              
394             # =========================================================== &s-sub ===
395             # D - Process given dir recursively
396             # N - BFS is more memory friendly than DFS
397             #
398             # T - $dir="/home/tong/tmp"
399             sub process_dir {
400             my($dir) = @_;
401             #warn "] processing dir '$dir'...\n";
402              
403             opendir(DIR,$dir) || die "File::FindSimilars error: Can't open $dir";
404             my @filenames = readdir(DIR);
405             closedir(DIR);
406              
407             # record the dirname/fname pair to queue
408             @filequeue = (@filequeue, map { [$dir, $_] } @filenames);
409             dbg_show(100,"filequeue", @filequeue)
410             }
411              
412             # =========================================================== &s-sub ===
413             # S - process_file($dirname, $fname), process file $fname under $dirname
414             # D - Process one file and update global vars
415             # U -
416             #
417             # I - Input parameters:
418             # $dirname: dir name string
419             # $fname: file name string
420             # O - Global vars get updated
421             # fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ]
422             # T -
423              
424             sub process_file {
425             my ($dn, $fn) = @_;
426             my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) =
427             stat("$dn/$fn");
428             my $fSdxl = [ get_soundex($fn) ]; # file soundex list
429             push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
430              
431             dbg_show(100,"fileInfo",@fileInfo);
432             map { $sdxCnt{$_}++ } @$fSdxl;
433             }
434              
435             # =========================================================== &s-sub ===
436             # S - get_soundex($fname), get soundex for file $fname
437             # D - Return a list of soundex of each individual word in file name
438             # U - $aref = [ get_soundex($fname) ];
439             #
440             # I - Input parameters:
441             # $fname: file name string
442             # O - sorted anonymous soundex array w/ duplications removed
443             # T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz';
444             # @out = get_soundex 'ASuchKindOfFile.tgz';
445              
446             sub get_soundex {
447             my ($fn) = @_;
448             # split to individual words
449             my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn;
450             # discards file extension, if any
451             pop @fn_wlist if @fn_wlist >= 1;
452             # if it is single word, try further decompose SuchKindOfWord
453             @fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g
454             if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/;
455             # wash short
456             dbg_show(100,"wlist 0",@fn_wlist);
457             @fn_wlist = arrwash_short(\@fn_wlist);
458             dbg_show(100,"wlist 1",@fn_wlist);
459              
460             # language specific handling
461             @fn_wlist = arrwash_lang(\@fn_wlist);
462             dbg_show(100,"wlist 2",@fn_wlist);
463            
464             # change word to soundex, record soundex/word in global hash
465             map {
466             if (/[[:alpha:]]/) {
467             my $sdx = soundex($_);
468             $wrdLst{$sdx}{$_}++;
469             s/^.*$/$sdx/;
470             }
471             } @fn_wlist;
472             dbg_show(1,"wrdLst",%wrdLst);
473              
474             # wash empty/duplicates
475             @fn_wlist = grep(!/^$/, @fn_wlist);
476             @fn_wlist = arrwash_dup(\@fn_wlist);
477            
478             return sort @fn_wlist;
479             }
480              
481             # =========================================================== &s-sub ===
482             # S - arrwash_short($arr_ref), wash short from array $arr_ref
483             # D - weed out empty lines and less-than-3-letter words (e.g. ch12)
484             # U - @fn_wlist = arrwash_short(\@fn_wlist);
485             #
486              
487             sub arrwash_short($) {
488             my ($arr_ref) = @_;
489             return @$arr_ref unless @$arr_ref >= 1;
490             my @r= grep tr/a-zA-Z// >=3, @$arr_ref;
491             return @r if @r;
492             return @$arr_ref # for upper ASCII
493             if grep(/[\200-\377]/, @$arr_ref);
494             return @r;
495             }
496              
497             # =========================================================== &s-sub ===
498             # S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref
499             # D - weed out duplicates
500             # U - @fn_wlist = arrwash_dup(\@fn_wlist);
501             #
502              
503             sub arrwash_dup($) {
504             my ($arr_ref) = @_;
505             my %saw;
506             return grep !$saw{$_}++, @$arr_ref;
507             }
508              
509             # =========================================================== &s-sub ===
510             # S - arrwash_lang($arr_ref), language specific washing from array $arr_ref
511             # U - @fn_wlist = arrwash_lang(\@fn_wlist);
512             #
513              
514             sub arrwash_lang($) {
515             my ($arr_ref) = @_;
516            
517             # split Chinese into individual chars
518             my @r;
519             map {
520             if (/[\200-\377]{2}/) {
521             @r = (@r, /[\200-\377]{2}/g);
522             }
523             else {
524             @r = (@r, $_);
525             }
526             } @$arr_ref;
527            
528             return @r;
529             }
530              
531             =head2 Object method: similarity_check()
532              
533             Do similarity check on the queued directories. Print similar files info on
534             stdout according to the configured format and delimiters. E.g.,
535              
536             $similars_finder->similarity_check();
537              
538             =cut
539              
540             # =========================================================== &s-sub ===
541             # S - similarity_check: similarity check on glabal array @fileInfo
542             # U - similarity_check();
543             #
544             # I - Input parameters: None
545             # O - similar files printed on stdout
546              
547             sub similarity_check {
548              
549             # get a ordered (by soundex count and file name) of file Info array
550             # (Use short file names to compare to long file names)
551             # use Schwartzian Transform to sort on 2 fields for efficiency
552             my @fileInfos = map { $_->[0] }
553             sort { $a->[1] cmp $b->[1] }
554             map { [ $_,
555             sprintf "%3d%6s", $#{$_->[$N_fSdxl]}, $_->[$N_fSdxl][0]
556             ] } @fileInfo;
557             dbg_show(100,"\@fileInfos", @fileInfos);
558              
559             my @saw = (0) x ($#fileInfos+1);
560             foreach my $ii (0..$#fileInfos) {
561             #warn "] ii=$ii\n";
562             my @similar = ();
563             my $fnl;
564            
565             dbg_show(100,"\@fileInfos", $fileInfos[$ii]);
566             push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ];
567             foreach my $jj (($ii+1) ..$#fileInfos) {
568             $fnl=0; # 0 is good enough since file at [ii] is
569             # shorter in name than the one at [jj]
570             # don't care about same dir files?
571             next
572             if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName]
573             eq $fileInfos[$jj]->[$N_dName])) ;
574             if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) {
575             push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ];
576             $fnl= length($fileInfos[$jj]->[$N_fName]) if
577             $fnl < length($fileInfos[$jj]->[$N_fName]);
578             }
579             }
580             dbg_show(100,"\@similar", @similar);
581             # output unvisited potential similars by each row, order by fSize
582             @similar = grep {!$saw[$_->[1]]}
583             sort { $a->[2] <=> $b->[2] } @similar;
584             next unless @similar>1;
585             print $config{delimiter};
586             foreach my $similar (@similar) {
587             print file_info(\@fileInfos, $similar->[1], $fnl). "\n";
588             $saw[$similar->[1]]++;
589             }
590             }
591             }
592              
593             # =========================================================== &s-sub ===
594             sub file_info ($$$) {
595             my ($fileInfos, $ndx, $fnl) = @_;
596             return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize],
597             $fileInfos->[$ndx]->[$N_fName],
598             ' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])),
599             "$fileInfos->[$ndx]->[$N_dName]");
600             }
601              
602             # =========================================================== &s-sub ===
603             # S - file_diff: determind how difference two files are by name & size
604             # U - file_diff($fileInfos, $ndx1, $ndx2);
605             #
606             # I - $fileInfos: reference to @fileInfos
607             # $ndx1, $ndx2: index to the two file in @fileInfos
608             # O - 100%: files are identical
609             # 0%: no similarity at all
610             sub file_diff ($$$) {
611             my ($fileInfos, $ndx1, $ndx2) = @_;
612              
613             return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]};
614            
615             # find intersection in two soudex array
616             my %count = ();
617             foreach my $element
618             (@{$fileInfos->[$ndx1]->[$N_fSdxl]},
619             @{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ }
620             # since there is no duplication in each of file soudex
621             my $intersection =
622             grep $count{$_} > 1, keys %count;
623             # return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize))
624             # so the bigger the return value is, the similar the two files are
625             $intersection *= $config{soundex_weight} /
626             (@{$fileInfos->[$ndx1]->[$N_fSdxl]});
627             dbg_show(100,"intersection", $intersection, $ndx1, $ndx2);
628             my $WeightfSzie = 100 - $config{soundex_weight};
629             my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] -
630             $fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie /
631             ($fileInfos->[$ndx1]->[$N_fSize] + 1);
632             $dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize;
633             my $file_diff = $intersection + ($WeightfSzie - $dfSize);
634             if ($file_diff >= $config{fc_threshold}) {
635             dbg_show(010,"file_diff",
636             @{$fileInfos->[$ndx1]},
637             @{$fileInfos->[$ndx2]},
638             $intersection, $dfSize, $file_diff
639             );
640             }
641             return $file_diff;
642             }
643              
644              
645             1;
646             __END__