-
Notifications
You must be signed in to change notification settings - Fork 0
/
dupes.pl
executable file
·166 lines (131 loc) · 4.58 KB
/
dupes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/perl -w
######################################################################
#
# Perl Duplicate File Finder
# Copyright (C) 2001-2012 Doug Mitchell
#
# This module is free software. You can redistribute it and/or
# modify it under the terms of the Artistic License 2.0.
#
# This program is distributed in the hope that it will be useful,
# but without any warranty; without even the implied warranty of
# merchantability or fitness for a particular purpose.
#
######################################################################
use strict;
use warnings;
use File::Find;
use Digest::MD5;
# minimum size (in bytes) of files to consider; can be overridden with -s
my $min_size = 65536;
######################################################################
#
# get_md5_hashes
#
# Returns a list of "<md5 hex digest> <filename>" strings, one for each
# of the provided filenames whose digest was computed
#
######################################################################
sub get_md5_hashes {

    # Parameters: a list of file names to fingerprint.
    # Returns:    a list of strings of the form "<md5 hex digest> <filename>",
    #             in the same order as the input, omitting any file that
    #             could not be opened or read (a warning is emitted for
    #             each such file instead of aborting the run).
    my @filenames = @_;
    my @hashvals;

    foreach my $filename (@filenames) {

        # Three-argument open with a lexical handle: the file name is taken
        # literally, so names with leading/trailing whitespace or characters
        # such as '>' or '|' are neither mangled nor treated as a mode.
        my $fh;
        if ( !open( $fh, '<', $filename ) ) {
            warn "Can't open '$filename': $!";
            next;
        }

        # Hash the raw bytes, without any newline or encoding translation.
        binmode($fh);

        # addfile croaks when reading fails (for instance when the path is
        # a directory); trap that so one bad file does not end the scan.
        my $hashval = eval { Digest::MD5->new->addfile($fh)->hexdigest };
        my $read_error = $@;
        close($fh);

        if ( !defined $hashval ) {
            warn "Can't read '$filename': $read_error";
            next;
        }

        if ( $hashval ne '' ) {
            push @hashvals, ( $hashval . " " . $filename );
        }
    }

    return @hashvals;
}
######################################################################
#
# main
#
# finds files with matching sizes and compares MD5 hashes
# to determine which files are exact duplicates
#
######################################################################
{
    # Counters reported once the traversal has finished.
    my $items_scanned       = 0;
    my $regular_files_found = 0;

    # $filesizes{$size}{$path} exists for every candidate file of that size.
    my %filesizes;

    # $size_inode{$size}{"$dev:$ino"} exists for every distinct inode of
    # that size.  The device is part of the key because inode numbers are
    # only unique within a single filesystem and several starting
    # directories may be given.
    my %size_inode;

    die "provide starting directories on command line"
        if ( !defined $ARGV[0] );

    # An optional leading "-s <bytes>" overrides the minimum file size.
    if ( $ARGV[0] eq '-s' ) {
        shift @ARGV;
        $min_size = shift @ARGV;
        die "-s requires a non-negative integer size in bytes\n"
            if ( !defined $min_size || $min_size !~ /\A\d+\z/ );
        die "provide starting directories on command line"
            if ( !@ARGV );
    }

    # File::Find callback: records every regular file of at least
    # $min_size bytes, and prunes '.svn' entries as well as anything whose
    # device differs from $File::Find::topdev.
    sub wanted {
        my ( $dev, $ino, $size ) = ( lstat($_) )[ 0, 1, 7 ];

        # lstat returns an empty list on failure (entry vanished or is
        # inaccessible); there is nothing to record for such an entry.
        return if ( !defined $dev );

        if ( $dev == $File::Find::topdev && $_ ne '.svn' ) {
            $items_scanned++;

            # "_" reuses the result of the lstat above, so symbolic links
            # are not followed and are not counted as regular files.
            if ( -f _ && ( $size >= $min_size ) ) {
                $filesizes{$size}{$File::Find::name} = "";
                $size_inode{$size}{"$dev:$ino"}      = 0;
                $regular_files_found++;
            }
        }
        else {
            $File::Find::prune = 1;
        }
    }

    # Traverse desired filesystems
    foreach my $start_dir (@ARGV) {
        File::Find::find( \&wanted, $start_dir );
    }
    print "scanned $items_scanned filesystem items\n";
    print "found $regular_files_found regular files\n";

    # Remove non-duplicate sizes from filesize hash.  A size is only worth
    # hashing when it is shared by at least two distinct inodes; several
    # names for a single inode are hard links, not copies.
    my $same_size_files = 0;
    foreach my $size ( keys %filesizes ) {
        if ( scalar( keys %{ $size_inode{$size} } ) <= 1 ) {
            delete $filesizes{$size};
            delete $size_inode{$size};
        }
        else {
            $same_size_files += scalar( keys %{ $filesizes{$size} } );
        }
    }
    print "found $same_size_files files sharing "
        . scalar( keys %filesizes )
        . " sizes\n";

    my $total_dupe_bytes = 0;
    my $total_dupe_count = 0;
    foreach my $size ( sort { $a <=> $b } keys %filesizes ) {
        my @filenames = keys %{ $filesizes{$size} };
        my %fingerprints;

        # index filenames by hash value
        foreach my $entry ( get_md5_hashes(@filenames) ) {

            # Split on the first single space only.  The pattern / / is
            # used instead of ' ' because the latter discards leading
            # whitespace and would corrupt file names that start with it.
            my ( $md5, $filename ) = split( / /, $entry, 2 );
            push @{ $fingerprints{$md5} }, $filename;
        }

        # print out hash values with multiple filenames, in a stable order
        foreach my $md5 ( sort keys %fingerprints ) {
            my @dupes     = sort @{ $fingerprints{$md5} };
            my $filecount = scalar @dupes;
            next if ( $filecount <= 1 );

            print "\nmd5 $md5 / $size bytes:\n";
            foreach my $filename (@dupes) {

                # The file may have disappeared since the scan; show inode
                # 0 in that case rather than formatting an undefined value.
                my $ino = ( lstat($filename) )[1];
                printf " %-9d %s\n", ( defined $ino ? $ino : 0 ), $filename;
            }

            # Every file beyond the first in a group counts as a duplicate.
            $total_dupe_count += ( $filecount - 1 );
            $total_dupe_bytes += ( $size * ( $filecount - 1 ) );
        }
    }
    print "\n$total_dupe_bytes bytes in $total_dupe_count duplicate files\n";
}
# vim: set autoindent expandtab tabstop=4 shiftwidth=4 shiftround: