File Coverage

blib/lib/File/Find/Similars.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package File::Find::Similars;
2              
3             # @Author: Tong SUN, (c)2001-2008, all right reserved
4             # @Version: $Date: 2008/11/03 14:19:45 $ $Revision: 2.4 $
5             # @HomeURL: http://xpt.sourceforge.net/
6              
7             # {{{ LICENSE:
8              
9             #
10             # Permission to use, copy, modify, and distribute this software and its
11             # documentation for any purpose and without fee is hereby granted, provided
12             # that the above copyright notices appear in all copies and that both those
13             # copyright notices and this permission notice appear in supporting
14             # documentation, and that the names of author not be used in advertising or
15             # publicity pertaining to distribution of the software without specific,
16             # written prior permission. Tong Sun makes no representations about the
17             # suitability of this software for any purpose. It is provided "as is"
18             # without express or implied warranty.
19             #
20             # TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
21             # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE
22             # SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY
23             # SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
24             # RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
25             # CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
26             # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27             #
28              
29             # }}}
30              
31             # {{{ POD, Intro:
32              
33             =head1 NAME
34              
35             File::Find::Similars - Fast similar-files finder
36              
37             =head1 SYNOPSIS
38              
39             use File::Find::Similars;
40              
41             my $similars_finder =
42             File::Find::Similars->new( { fc_level => $fc_level, } );
43             $similars_finder->find_for(\@ARGV);
44             $similars_finder->similarity_check();
45              
46             =head1 DESCRIPTION
47              
48             Extremely fast file similarity checker. Similar-sized and similar-named
49             files are picked out as suspicious candidates of duplicated files.
50              
51             It uses advanced soundex vector algorithm to determine the similarity
52             between files. Generally it means that if there are n files, each having
53             approximately m words in the file name, the degree of calculation is merely
54              
55             O(n^2 * m)
56              
57             which is over thousands times faster than any existing file fingerprinting
58             technology.
59              
60             =head2 ALGORITHM EXPLANATION
61              
62             The self-test output will help you understand what the module do and what
63             would you expect from the outcome.
64              
65             $ make test
66             PERL_DL_NONLAZY=1 /usr/bin/perl "-Iblib/lib" "-Iblib/arch" test.pl
67             1..5 todo 2;
68             # Running under perl version 5.010000 for linux
69             # Current time local: Mon Nov 3 08:57:42 2008
70             # Current time GMT: Mon Nov 3 13:57:42 2008
71             # Using Test.pm version 1.25
72             # Testing File::Find::Similars version 2.03
73            
74             . . .
75            
76             == Testing 2, files under test/ subdir:
77            
78             9 test/(eBook) GNU - Python Standard Library 2001.pdf
79             3 test/Audio Book - The Grey Coloured Bunnie.mp3
80             5 test/ColoredGrayBunny.ogg
81             5 test/GNU - 2001 - Python Standard Library.pdf
82             4 test/GNU - Python Standard Library (2001).rar
83             9 test/LayoutTest.java
84             3 test/PopupTest.java
85             2 test/Python Standard Library.zip
86             ok 2 # (test.pl at line 83 TODO?!)
87            
88             Note:
89            
90             - The file-similars script will pick out similar files from them in next test.
91             - Let's assume that the number represent the file size in KB.
92            
93             == Testing 3 result should be:
94            
95             ## =========
96             3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
97             5 'ColoredGrayBunny.ogg' 'test/'
98            
99             ## =========
100             4 'GNU - Python Standard Library (2001).rar' 'test/'
101             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
102             ok 3
103            
104             Note:
105            
106             - There are 2 groups of similar files picked out by the script.
107             - The similar files are picked because their file names look similar.
108             Note that the first group looks different and spells differently too,
109             which means that the script is versatile enough to handle file names that
110             don't have space in it, and robust enough to deal with spelling mistakes.
111             - Apart from the file name, the file size plays an important role as well.
112             - There are 2 files in the second similar files group, the book files group.
113             - The file 'Python Standard Library.zip' is not considered to be similar to
114             the group because its size is not similar to the group.
115            
116             == Testing 4, if Python.zip is bigger, result should be:
117            
118             ## =========
119             4 'Python Standard Library.zip' 'test/'
120             4 'GNU - Python Standard Library (2001).rar' 'test/'
121             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
122            
123             ## =========
124             3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
125             5 'ColoredGrayBunny.ogg' 'test/'
126             ok 4
127            
128             Note:
129            
130             - The previous second similar files group is now the first. I.e.,
131             the order of similar files groups is not important.
132             - There are now 3 files in the book files group.
133             - The file 'Python Standard Library.zip' is included in the
134             group because its size is now similar to the group.
135            
136             == Testing 5, if Python.zip is even bigger, result should be:
137            
138             ## =========
139             4 'GNU - Python Standard Library (2001).rar' 'test/'
140             5 'GNU - 2001 - Python Standard Library.pdf' 'test/'
141             6 'Python Standard Library.zip' 'test/'
142             9 '(eBook) GNU - Python Standard Library 2001.pdf' 'test/'
143            
144             ## =========
145             3 'Audio Book - The Grey Coloured Bunnie.mp3' 'test/'
146             5 'ColoredGrayBunny.ogg' 'test/'
147             ok 5
148            
149             Note:
150            
151             - There are 4 files in the book files group now.
152             - The file 'Python Standard Library.zip' is still in the group.
153             - But this time, because it is also considered to be similar to the .pdf
154             file (since their size are now similar, 6 vs 9), a 4th file the .pdf one
155             is now included in the book group.
156             - If the size of file 'Python Standard Library.zip' is 12(KB), then the
157             book files group will be split into two. Do you know why and
158             which files each group will contain?
159              
160             The File::Find::Similars package comes with a fully functional demo
161             script file-similars. Please refer to its help file for further
162             explanations.
163              
164             This package is highly customizable. Refer to the class method C for
165             details.
166              
167             =head1 DEPENDS
168              
169             This module depends on L, but not L.
170              
171             =cut
172              
173             # }}}
174              
175             # {{{ Global Declaration:
176              
177             # ============================================================== &us ===
178             # ............................................................. Uses ...
179              
180             # -- global modules
181 1     1   105454 use strict; # !
  1         2  
  1         44  
182              
183 1     1   6 use Carp;
  1         1  
  1         85  
184 1     1   8 use Getopt::Long;
  1         5  
  1         9  
185 1     1   172 use File::Basename;
  1         3  
  1         86  
186 1     1   463 use Text::Soundex;
  0            
  0            
187              
188             use base qw(Class::Accessor::Fast);
189              
190             # -- local modules
191              
192             sub dbg_show {};
193             #use MyDbg; $MyDbg::debugging=010;
194              
195             # ============================================================== &gv ===
196             # .................................................. Global Varibles ...
197             #
198              
199             our @EXPORT = ( ); # may even omit this line
200              
201             use vars qw($progname $VERSION $debugging);
202             use vars qw(%config @filequeue @fileInfo %sdxCnt %wrdLst);
203              
204             # @fileInfo: List of the following list:
205             my (
206             $N_dName, # dir name
207             $N_fName, # file name
208             $N_fSize, # file size
209             $N_fSdxl, # file soundex list, reference
210             ) = (0..9);
211              
212             # ============================================================== &cs ===
213             # ................................................. Constant setting ...
214             #
215             $VERSION = sprintf("%d.%02d", q$Revision: 2.4 $ =~ /(\d+)\.(\d+)/);
216              
217             # }}}
218              
219             # ############################################################## &ss ###
220             # ................................................ Subroutions start ...
221              
222             =head1 METHODS
223              
224             =head2 File::Find::Similars->new(\%config_param)
225              
226             Initialize the object.
227              
228             my $similars_finder = File::Find::Similars->new();
229              
230             or,
231              
232             my $similars_finder = File::Find::Similars->new( {} );
233              
234             which are the same as:
235              
236             my $similars_finder = File::Find::Similars->new( {
237             soundex_weight => 50, # percentage of weight that soundex takes,
238             # the rest is for file size
239             fc_threshold => 75, # over which files are considered similar
240             delimiter => "\n## =========\n", # delimiter between files output
241             format => "%12d '%s' %s'%s'", # file info print format
242             fc_level => 0, # file comparison level
243             verbose => 0,
244              
245             } );
246              
247             What shown above are default settings. Any of the C<%config_param> attribute can be omitted when calling the new method.
248              
249             The C is the only class method. All the rest methods are object methods.
250              
251             =head2 Object attribute: soundex_weight([set_val])
252              
253             Percentage of weight that soundex takes, the rest of percentage is for file size.
254              
255             Provide the C to change the attribute, omitting it to retrieve the attribute value.
256              
257             =head2 Object attribute: fc_threshold([set_val])
258              
259             The threshold over which files are considered similar.
260              
261             Provide the C to change the attribute, omitting it to retrieve the attribute value.
262              
263             =head2 Object attribute: delimiter([set_val])
264              
265             Delimiter printed between file info outputs.
266              
267             Provide the C to change the attribute, omitting it to retrieve the attribute value.
268              
269             =head2 Object attribute: format([set_val])
270              
271             Format used to print file info.
272              
273             Provide the C to change the attribute, omitting it to retrieve the attribute value.
274              
275             =head2 Object attribute: fc_level([set_val])
276              
277             File comparison level. Whether to check similar files within the same folder: 0, no; 1, yes.
278              
279             Provide the C to change the attribute, omitting it to retrieve the attribute value.
280              
281             =head2 Object attribute: verbose([set_val])
282              
283             Verbose level. Whether to output progress info: 0, no; 1, yes.
284              
285             Provide the C to change the attribute, omitting it to retrieve the attribute value.
286              
287             =cut
288              
289             File::Find::Similars
290             ->mk_accessors(qw(soundex_weight fc_threshold
291             delimiter format fc_level verbose));
292              
293             %config =
294             (
295              
296             soundex_weight => 50, # percentage of weight that soundex takes,
297             # the rest is for file size
298             fc_threshold => 75, # over which files are considered similar
299             delimiter => "\n## =========\n", # delimiter between files output
300             format => "%12d '%s' %s'%s'", # file info print format
301              
302             fc_level => 0, # file comparison level
303             verbose => 0,
304             );
305              
306              
307             # =========================================================== &s-sub ===
308              
309             sub new {
310             ref(my $class = shift)
311             and croak "new is a class method. class name needed.";
312             my ($arg_ref) = @_;
313             my $self = $class->SUPER::new({%config, %$arg_ref});
314             $config{soundex_weight} = $self->soundex_weight;
315             $config{fc_threshold} = $self->fc_threshold;
316             $config{delimiter} = $self->delimiter;
317             $config{format} = $self->format;
318             $config{fc_level} = $self->fc_level;
319             $config{verbose} = $self->verbose;
320             #$config{} = $self->;
321             return $self;
322             }
323              
324             # =========================================================== &s-sub ===
325              
326             =head2 Object method: find_for($array_ref)
327              
328             Set directory queue for similarity checking. Each entry in C<$array_ref>
329             is a directory to check into. E.g.,
330              
331             $similars_finder->find_for(\@ARGV);
332              
333             =cut
334              
335             sub find_for {
336             my ($self, $init_dirs) = @_;
337              
338             # threshold $config{fc_threshold}
339             print STDERR "Searching in directory(ies): @$init_dirs with level $config{fc_level}...\n\n"
340             if $config{verbose};
341              
342             @filequeue = @fileInfo = ();
343             @filequeue = (@filequeue, map { [$_, ''] } @$init_dirs);
344             process_entries();
345              
346             dbg_show(100,"\@fileInfo", @fileInfo);
347             dbg_show(100,"%sdxCnt", %sdxCnt);
348             dbg_show(100,"%wrdLst", %wrdLst);
349             }
350              
351             # =========================================================== &s-sub ===
352             # I - Input: global array @filequeue
353             # Input parameters: None
354             #
355             sub process_entries {
356             my($dir, $qf) = ();
357             #warn "] inside process_entries...\n";
358              
359             while ($qf = shift @filequeue) {
360             ($dir, $_) = ($qf->[0], $qf->[1]);
361             #warn "] inside process_entries loop, $dir, $_, ...\n";
362             next if /^..?$/;
363             my $name = "$dir/$_";
364             #warn "] processing file '$name'.\n";
365             if ($name eq '-/') {
366             # get info from stdin
367             process_stdin();
368             }
369             elsif (-d $name) {
370             # a directory, process it recursively.
371             process_dir($name);
372             }
373             else {
374             process_file($dir, $_);
375             }
376             }
377             }
378              
379             # =========================================================== &s-sub ===
380             # D - Process info given from stdin, which should of form same as
381             # find -printf "%p\t%s\n"
382             #
383             sub process_stdin {
384            
385             while (<>){
386             croak "Wrong input format: '$_'" unless m{(.*)/(.+?)\t(\d+)$};
387             my ($dn, $fn, $size) = ( $1, $2, $3 );
388             my $fSdxl = [ get_soundex($fn) ]; # file soundex list
389             push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
390              
391             dbg_show(100,"fileInfo",@fileInfo);
392             map { $sdxCnt{$_}++ } @$fSdxl;
393             }
394             }
395              
396             # =========================================================== &s-sub ===
397             # D - Process given dir recursively
398             # N - BFS is more memory friendly than DFS
399             #
400             # T - $dir="/home/tong/tmp"
401             sub process_dir {
402             my($dir) = @_;
403             #warn "] processing dir '$dir'...\n";
404              
405             opendir(DIR,$dir) || die "File::Find::Similars error: Can't open $dir";
406             my @filenames = readdir(DIR);
407             closedir(DIR);
408              
409             # record the dirname/fname pair to queue
410             @filequeue = (@filequeue, map { [$dir, $_] } @filenames);
411             dbg_show(100,"filequeue", @filequeue)
412             }
413              
414             # =========================================================== &s-sub ===
415             # S - process_file($dirname, $fname), process file $fname under $dirname
416             # D - Process one file and update global vars
417             # U -
418             #
419             # I - Input parameters:
420             # $dirname: dir name string
421             # $fname: file name string
422             # O - Global vars get updated
423             # fileInfo [ $dirname, $fname, $fsize, [ file_soundex ] ]
424             # T -
425              
426             sub process_file {
427             my ($dn, $fn) = @_;
428             my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,@rest) =
429             stat("$dn/$fn");
430             my $fSdxl = [ get_soundex($fn) ]; # file soundex list
431             push @fileInfo, [ $dn, $fn, $size, $fSdxl, ];
432              
433             dbg_show(100,"fileInfo",@fileInfo);
434             map { $sdxCnt{$_}++ } @$fSdxl;
435             }
436              
437             # =========================================================== &s-sub ===
438             # S - get_soundex($fname), get soundex for file $fname
439             # D - Return a list of soundex of each individual word in file name
440             # U - $aref = [ get_soundex($fname) ];
441             #
442             # I - Input parameters:
443             # $fname: file name string
444             # O - sorted anonymous soundex array w/ duplications removed
445             # T - @out = get_soundex 'Java_RMI - _Remote_Method_Invocation_ch03.tgz';
446             # @out = get_soundex 'ASuchKindOfFile.tgz';
447              
448             sub get_soundex {
449             my ($fn) = @_;
450             # split to individual words
451             my @fn_wlist = split /[-_[:cntrl:][:blank:][:punct:][:digit:]]/i, $fn;
452             # discards file extension, if any
453             pop @fn_wlist if @fn_wlist >= 1;
454             # if it is single word, try further decompose SuchKindOfWord
455             @fn_wlist = $fn_wlist[0] =~ /[A-Z][^A-Z]*/g
456             if @fn_wlist == 1 && $fn_wlist[0] =~ /^[A-Z]/;
457             # wash short
458             dbg_show(100,"wlist 0",@fn_wlist);
459             @fn_wlist = arrwash_short(\@fn_wlist);
460             dbg_show(100,"wlist 1",@fn_wlist);
461              
462             # language specific handling
463             @fn_wlist = arrwash_lang(\@fn_wlist);
464             dbg_show(100,"wlist 2",@fn_wlist);
465            
466             # change word to soundex, record soundex/word in global hash
467             map {
468             if (/[[:alpha:]]/) {
469             my $sdx = soundex($_);
470             $wrdLst{$sdx}{$_}++;
471             s/^.*$/$sdx/;
472             }
473             } @fn_wlist;
474             dbg_show(1,"wrdLst",%wrdLst);
475              
476             # wash empty/duplicates
477             @fn_wlist = grep(!/^$/, @fn_wlist);
478             @fn_wlist = arrwash_dup(\@fn_wlist);
479            
480             return sort @fn_wlist;
481             }
482              
483             # =========================================================== &s-sub ===
484             # S - arrwash_short($arr_ref), wash short from array $arr_ref
485             # D - weed out empty lines and less-than-3-letter words (e.g. ch12)
486             # U - @fn_wlist = arrwash_short(\@fn_wlist);
487             #
488              
489             sub arrwash_short($) {
490             my ($arr_ref) = @_;
491             return @$arr_ref unless @$arr_ref >= 1;
492             my @r= grep tr/a-zA-Z// >=3, @$arr_ref;
493             return @r if @r;
494             return @$arr_ref # for upper ASCII
495             if grep(/[\200-\377]/, @$arr_ref);
496             return @r;
497             }
498              
499             # =========================================================== &s-sub ===
500             # S - arrwash_dup($arr_ref), wash duplicates from array $arr_ref
501             # D - weed out duplicates
502             # U - @fn_wlist = arrwash_dup(\@fn_wlist);
503             #
504              
505             sub arrwash_dup($) {
506             my ($arr_ref) = @_;
507             my %saw;
508             return grep !$saw{$_}++, @$arr_ref;
509             }
510              
511             # =========================================================== &s-sub ===
512             # S - arrwash_lang($arr_ref), language specific washing from array $arr_ref
513             # U - @fn_wlist = arrwash_lang(\@fn_wlist);
514             #
515              
516             sub arrwash_lang($) {
517             my ($arr_ref) = @_;
518            
519             # split Chinese into individual chars
520             my @r;
521             map {
522             if (/[\200-\377]{2}/) {
523             @r = (@r, /[\200-\377]{2}/g);
524             }
525             else {
526             @r = (@r, $_);
527             }
528             } @$arr_ref;
529            
530             return @r;
531             }
532              
533             =head2 Object method: similarity_check()
534              
535             Do similarity check on the queued directories. Print similar files info on
536             stdout according to the configured format and delimiters. E.g.,
537              
538             $similars_finder->similarity_check();
539              
540             =cut
541              
542             # =========================================================== &s-sub ===
543             # S - similarity_check: similarity check on glabal array @fileInfo
544             # U - similarity_check();
545             #
546             # I - Input parameters: None
547             # O - similar files printed on stdout
548              
549             sub similarity_check {
550              
551             # get a ordered (by soundex count) file Info array
552             # (Use short file names to compare to long file names)
553             my @fileInfos =
554             sort { $#{$a->[$N_fSdxl]} cmp $#{$b->[$N_fSdxl]} } @fileInfo;
555             dbg_show(100,"\@fileInfos", @fileInfos);
556              
557             my @saw = (0) x ($#fileInfos+1);
558             foreach my $ii (0..$#fileInfos) {
559             #warn "] ii=$ii\n";
560             my @similar = ();
561             my $fnl;
562            
563             dbg_show(100,"\@fileInfos", $fileInfos[$ii]);
564             push @similar, [$ii, $ii, $fileInfos[$ii]->[$N_fSize] ];
565             foreach my $jj (($ii+1) ..$#fileInfos) {
566             $fnl=0; # 0 is good enough since file at [ii] is
567             # shorter in name than the one at [jj]
568             # don't care about same dir files?
569             next
570             if (!$config{fc_level} && ($fileInfos[$ii]->[$N_dName]
571             eq $fileInfos[$jj]->[$N_dName])) ;
572             if (file_diff(\@fileInfos, $ii, $jj) >= $config{fc_threshold}) {
573             push @similar, [$ii, $jj, $fileInfos[$jj]->[$N_fSize] ];
574             $fnl= length($fileInfos[$jj]->[$N_fName]) if
575             $fnl < length($fileInfos[$jj]->[$N_fName]);
576             }
577             }
578             dbg_show(100,"\@similar", @similar);
579             # output unvisited potential similars by each row, order by fSize
580             @similar = grep {!$saw[$_->[1]]}
581             sort { $a->[2] <=> $b->[2] } @similar;
582             next unless @similar>1;
583             print $config{delimiter};
584             foreach my $similar (@similar) {
585             print file_info(\@fileInfos, $similar->[1], $fnl). "\n";
586             $saw[$similar->[1]]++;
587             }
588             }
589             }
590              
591             # =========================================================== &s-sub ===
592             sub file_info ($$$) {
593             my ($fileInfos, $ndx, $fnl) = @_;
594             return sprintf($config{format}, $fileInfos->[$ndx]->[$N_fSize],
595             $fileInfos->[$ndx]->[$N_fName],
596             ' ' x ($fnl - length($fileInfos->[$ndx]->[$N_fName])),
597             "$fileInfos->[$ndx]->[$N_dName]");
598             }
599              
600             # =========================================================== &s-sub ===
601             # S - file_diff: determind how difference two files are by name & size
602             # U - file_diff($fileInfos, $ndx1, $ndx2);
603             #
604             # I - $fileInfos: reference to @fileInfos
605             # $ndx1, $ndx2: index to the two file in @fileInfos
606             # O - 100%: files are identical
607             # 0%: no similarity at all
608             sub file_diff ($$$) {
609             my ($fileInfos, $ndx1, $ndx2) = @_;
610              
611             return 0 unless @{$fileInfos->[$ndx1]->[$N_fSdxl]};
612            
613             # find intersection in two soudex array
614             my %count = ();
615             foreach my $element
616             (@{$fileInfos->[$ndx1]->[$N_fSdxl]},
617             @{$fileInfos->[$ndx2]->[$N_fSdxl]}) { $count{$element}++ }
618             # since there is no duplication in each of file soudex
619             my $intersection =
620             grep $count{$_} > 1, keys %count;
621             # return p * normal(\common soudex) + (1-p) * ( 1 - normal(\delta fSize))
622             # so the bigger the return value is, the similar the two files are
623             $intersection *= $config{soundex_weight} /
624             (@{$fileInfos->[$ndx1]->[$N_fSdxl]});
625             dbg_show(100,"intersection", $intersection, $ndx1, $ndx2);
626             my $WeightfSzie = 100 - $config{soundex_weight};
627             my $dfSize = abs($fileInfos->[$ndx1]->[$N_fSize] -
628             $fileInfos->[$ndx2]->[$N_fSize]) * $WeightfSzie /
629             ($fileInfos->[$ndx1]->[$N_fSize] + 1);
630             $dfSize = $dfSize > $WeightfSzie ? $WeightfSzie : $dfSize;
631             my $file_diff = $intersection + ($WeightfSzie - $dfSize);
632             if ($file_diff >= $config{fc_threshold}) {
633             dbg_show(010,"file_diff",
634             @{$fileInfos->[$ndx1]},
635             @{$fileInfos->[$ndx2]},
636             $intersection, $dfSize, $file_diff
637             );
638             }
639             return $file_diff;
640             }
641              
642              
643             1;
644             __END__