| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# ABSTRACT: Public methods for the App::dupfind deduplication engine |
|
2
|
|
|
|
|
|
|
|
|
3
|
8
|
|
|
8
|
|
4197
|
use strict; |
|
|
8
|
|
|
|
|
14
|
|
|
|
8
|
|
|
|
|
268
|
|
|
4
|
8
|
|
|
8
|
|
36
|
use warnings; |
|
|
8
|
|
|
|
|
12
|
|
|
|
8
|
|
|
|
|
349
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package App::dupfind::Common; |
|
7
|
|
|
|
|
|
|
{ |
|
8
|
|
|
|
|
|
|
$App::dupfind::Common::VERSION = '0.140230'; # TRIAL |
|
9
|
|
|
|
|
|
|
} |
|
10
|
|
|
|
|
|
|
|
|
11
|
8
|
|
|
8
|
|
168
|
use 5.010; |
|
|
8
|
|
|
|
|
47
|
|
|
|
8
|
|
|
|
|
268
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
8
|
|
|
8
|
|
36
|
use Moo; |
|
|
8
|
|
|
|
|
18
|
|
|
|
8
|
|
|
|
|
43
|
|
|
14
|
8
|
|
|
8
|
|
8316
|
use Digest::xxHash 'xxhash_hex'; |
|
|
8
|
|
|
|
|
6265
|
|
|
|
8
|
|
|
|
|
512
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
8
|
|
|
8
|
|
47
|
use lib 'lib'; |
|
|
8
|
|
|
|
|
13
|
|
|
|
8
|
|
|
|
|
49
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Program options, used throughout this class as a hash reference (keys read
# here include dir, progress, prompt, format and quiet).  Must be supplied
# when the object is constructed.
has 'opts' =>
(
   is       => 'ro',
   required => 1,
);

# Composed after "opts" has been declared.
with 'App::dupfind::Guts';

# Term::ProgressBar is only loaded when the progress option is switched on,
# right before the methods that may need it.
before [ 'weed_dups', 'digest_dups' ] => sub
{
   my $self = shift;

   require Term::ProgressBar if $self->opts->{progress};
};

# Term::Prompt is only referenced by delete_dups, so load it just in time.
before 'delete_dups' => sub
{
   require Term::Prompt;
};
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Count the files held in a duplicates table.
#
#   $dups - hash reference whose values are array references of file names
#
# Returns the sum of the sizes of all those arrays.
sub count_dups
{
   my ( $self, $dups ) = @_;

   my $total = 0;

   for my $key ( keys %$dups )
   {
      $total += scalar @{ $dups->{ $key } };
   }

   return $total;
}
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Scan $self->opts->{dir} recursively and group the files found by size.
#
# Symbolic links and files whose size cannot be determined are ignored.
# Groupings that end up with a single file are discarded.
#
# Returns a three element list:
#   - hash reference mapping a size in bytes to an array reference of files
#   - the number of files reported by the directory scan
#   - the number of files left in the returned groupings
sub get_size_dups
{
   my $self = shift;

   my ( $size_dups, $scan_count ) = ( {}, 0 );

   $self->ftl->list_dir
   (
      $self->opts->{dir} =>
      {
         recurse  => 1,
         callback => sub
         {
            ## my ( $selfdir, $subdirs, $files ) = @_;

            my $files = $_[2]; # only the file list is needed

            $scan_count += @$files;

            for my $file ( @$files )
            {
               next if -l $file;

               my $size = -s $file;

               next unless defined $size;

               # -s yields the empty string (not 0) for an empty file; force
               # a numeric key so that empty files are filed under "0", the
               # key the other methods of this class look for
               push @{ $size_dups->{ $size + 0 } }, $file;
            }
         }
      }
   );

   # a size shared by no other file cannot have duplicates
   my @lonely = grep { @{ $size_dups->{ $_ } } == 1 } keys %$size_dups;

   delete @{ $size_dups }{ @lonely };

   return $size_dups, $scan_count, $self->count_dups( $size_dups );
}
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# Collapse hard links inside every same-size grouping.
#
#   $size_dups - hash reference mapping a size to an array reference of files
#
# Files that share a device and inode number are one and the same file on
# disk, so only one name per device/inode pair is kept.  Files that cannot be
# stat'ed are dropped from their grouping, and groupings left with fewer than
# two files are removed.  The hash is modified in place and also returned.
sub toss_out_hardlinks
{
   my ( $self, $size_dups ) = @_;

   for my $size ( keys %$size_dups )
   {
      my %file_for;

      # this will automatically throw out hardlinks, with the only surviving
      # file being the first asciibetically-sorted entry (names are visited
      # in reverse order, so the first sorted name is assigned last)
      for my $file ( reverse sort @{ $size_dups->{ $size } } )
      {
         my ( $dev, $inode ) = stat $file
            or next;

         # the separator keeps e.g. dev 1/inode 23 apart from dev 12/inode 3
         $file_for{ "$dev:$inode" } = $file;
      }

      if ( keys %file_for < 2 )
      {
         delete $size_dups->{ $size };
      }
      else
      {
         $size_dups->{ $size } = [ values %file_for ];
      }
   }

   return $size_dups;
}
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Run every planned weeding pass over the same-size groupings.
#
#   $size_dups - hash reference mapping a size to an array reference of files
#
# The grouping stored under key 0 is taken out while the passes run and put
# back afterwards.  Each pass returned by _plan_weed_passes is handed to
# _do_weed_pass together with the table and a pass number counting from 1.
# Returns the same hash reference.
sub weed_dups
{
   my ( $self, $size_dups ) = @_;

   # keep the zero-size grouping away from the weeding passes
   my $empties = delete $size_dups->{0};

   my $pass_number = 0;

   for my $pass ( $self->_plan_weed_passes )
   {
      $pass_number++;

      $self->_do_weed_pass( $size_dups, $pass, $pass_number );
   }

   $size_dups->{0} = $empties if ref $empties;

   return $size_dups;
}
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# Group files by the xxHash digest of their content.
#
#   $size_dups - hash reference mapping a size to an array reference of files
#
# The grouping under key 0, if any, is removed from $size_dups and filed
# under the digest of the empty string without reading any file.  Every other
# file is read whole and hashed; files that cannot be opened or read are
# skipped.  When the ramcache option is on, digests are remembered per size
# grouping (keyed by file content) for at most opts->{cachesize} entries and
# only for sizes up to opts->{cachestop}.
#
# Side effects: stores cache_hits and cache_misses in $self->stats, and
# drives a Term::ProgressBar when the progress option is on.
#
# Returns a hash reference mapping a digest to an array reference of files,
# holding only digests shared by more than one file.
sub digest_dups
{
   my ( $self, $size_dups ) = @_;

   my ( $digests, $progress, $i ) = ( {}, undef, 0 );

   my $cache_stop   = $self->opts->{cachestop};
   my $max_cache    = $self->opts->{cachesize};
   my $ram_caching  = !! $self->opts->{ramcache};
   my $cache_hits   = 0;
   my $cache_misses = 0;

   # don't bother to hash zero-size files
   $digests->{ xxhash_hex( '', 0 ) } = delete $size_dups->{0}
      if exists $size_dups->{0};

   if ( $self->opts->{progress} )
   {
      $progress = Term::ProgressBar->new
      (
         {
            name   => ' ...PROGRESS',
            count  => $self->count_dups( $size_dups ),
            remove => 1,
         }
      );
   }

   SIZES: for my $size ( keys %$size_dups )
   {
      # the cache is only worthwhile per-size-grouping, so start afresh
      my %digest_cache;
      my $cache_size = 0;

      GROUPING: for my $file ( @{ $size_dups->{ $size } } )
      {
         # read raw bytes so the digest does not depend on I/O layers
         open my $fh, '<:raw', $file or next GROUPING;

         my $data = do { local $/; <$fh> };

         close $fh;

         # a failed read must not be hashed as if the file were empty
         next GROUPING unless defined $data;

         my $digest = $ram_caching ? $digest_cache{ $data } : undef;

         if ( defined $digest )
         {
            $cache_hits++;
         }
         else
         {
            $digest = xxhash_hex( $data, 0 );

            if ( $ram_caching
               && $cache_size < $max_cache
               && $size <= $cache_stop )
            {
               $digest_cache{ $data } = $digest;

               $cache_size++;

               $cache_misses++;
            }
         }

         push @{ $digests->{ $digest } }, $file;
      }
      continue
      {
         # also reached through "next", so skipped files still advance the bar
         $progress->update( ++$i ) if $progress;
      }
   }

   # a digest owned by a single file is not a duplicate
   my @unique = grep { @{ $digests->{ $_ } } == 1 } keys %$digests;

   delete @{ $digests }{ @unique };

   $self->stats->{cache_hits}   = $cache_hits;
   $self->stats->{cache_misses} = $cache_misses;

   return $digests;
}
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# Sort the file names of every grouping in string order.
#
#   $dups - hash reference whose values are array references of file names
#
# Each value is replaced by a new, sorted array reference; the arrays that
# were passed in are left as they were.  Returns the same hash reference.
sub sort_dups
{
   my ( $self, $dups ) = @_;

   # sort dup groupings
   $dups->{ $_ } = [ sort @{ $dups->{ $_ } } ] for keys %$dups;

   return $dups;
}
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# Print every duplicate grouping on the selected output handle.
#
#   $digests - hash reference mapping a digest to an array reference of files
#
# The table is first passed through $self->sort_dups.  Groupings are printed
# ordered by their first file name, in the human readable layout when
# opts->{format} is 'human' and as one tab separated line per grouping
# otherwise.
#
# Returns the number of files beyond the first one, summed over all groupings.
sub show_dups # also calls $self->sort_dups before displaying output
{
   my ( $self, $digests ) = @_;

   my $surplus = 0;

   $digests = $self->sort_dups( $digests );

   my %printer_for =
   (
      # human-readable output
      human => sub
      {
         my ( $digest, $group ) = @_;

         say sprintf 'DUPLICATES (digest: %s | size: %db)',
            $digest, -s $group->[0];

         for my $file ( @$group )
         {
            say " $file";
         }

         say '';
      },

      # machine parseable output
      robot => sub
      {
         my $group = $_[-1];

         say join "\t", @$group;
      },
   );

   my $style = $self->opts->{format} eq 'human' ? 'human' : 'robot';
   my $print = $printer_for{ $style };

   my @ordered =
      sort { $digests->{ $a }[0] cmp $digests->{ $b }[0] } keys %$digests;

   for my $digest ( @ordered )
   {
      my $group = $digests->{ $digest };

      $print->( $digest, $group );

      $surplus += @$group - 1;
   }

   return $surplus;
}
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Delete every file of each grouping except the first one.
#
#   $digests - hash reference mapping a digest to an array reference of files
#
# The first file of a grouping is reported as the original and is shifted off
# the caller's array.  When opts->{prompt} is set, Term::Prompt asks before
# each removal and a declined file is reported as kept.  A failed unlink is
# reported with warn and does not count as removed.  Progress lines and the
# final total are printed on the selected output handle.
sub delete_dups
{
   my ( $self, $digests ) = @_;

   my $removed = 0;

   for my $digest ( keys %$digests )
   {
      my $group    = $digests->{ $digest };
      my $original = $group->[0];

      say sprintf 'ORIGINAL (%s) %s', $digest, $original;

      shift @$group;

      DUP: for my $dup ( @$group )
      {
         if ( $self->opts->{prompt}
            && !Term::Prompt::prompt( 'y', "REMOVE DUPE? $dup", '', 'n' ) )
         {
            say sprintf 'KEPT (%s) %s', $digest, $dup;

            next DUP;
         }

         unless ( unlink $dup )
         {
            warn "COULD NOT REMOVE $dup! $!";

            next DUP;
         }

         $removed++;

         say sprintf 'REMOVED (%s) %s', $digest, $dup;
      }

      say '--';
   }

   say "** TOTAL DUPLICATE FILES REMOVED: $removed";
}
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# Return the cache_hits and cache_misses entries of $self->stats, in that
# order.
sub cache_stats
{
   my ( $self ) = @_;

   my $stats = $self->stats;

   return ( $stats->{cache_hits}, $stats->{cache_misses} );
}
|
312
|
|
|
|
|
|
|
|
|
313
|
0
|
0
|
|
0
|
1
|
|
# Emit each given message through warn, one per line, unless opts->{quiet}
# is set.
sub say_stderr
{
   my ( $self, @messages ) = @_;

   return if $self->opts->{quiet};

   for my $message ( @messages )
   {
      warn "$message\n";
   }
}
|
|
0
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
1; |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
__END__ |