How to search duplicate files within two directories?

Printer-friendly versionPDF version

#!/usr/bin/perl
use strict;
use warnings;

# findDupeFiles:
# This script attempts to identify which files might be duplicates.
# It searches specified directories for files with a given suffix
# and reports on files that have the same MD5 digest.
# The suffix or suffixes to be searched for are specified by the first
# command-line argument - each suffix separated from the next by a vertical bar.
# The subsequent command-line arguments specify the directories to be searched.
# If no directories are specified on the command-line,
# it searches the current directory.
# Files whose names start with "._" are ignored.
#
# Cameron Hayne (macdev@hayne.net)  January 2006
#
#
# Examples of use:
# ----------------
# findDupeFiles '.aif|.aiff' AAA BBB CCC
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the directories AAA, BBB, and CCC
#
# findDupeFiles '.aif|.aiff'
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the current directory
#
# findDupeFiles '' AAA BBB CCC
# would look for duplicates among all the files (no matter what suffix)
# under the directories AAA, BBB, and CCC
#
# findDupeFiles
# would look for duplicates among all the files (no matter what suffix)
# under the current directory
# -----------------------------------------------------------------------------


use File::Find;
use File::stat;
use Digest::MD5;

# makeSuffixMatcher: builds the subroutine used for matching suffixes
# Argument: a string holding the desired suffixes separated by vertical bars
#           - e.g. ".mp3|.aiff"
# Returns:  a reference to a subroutine that takes no arguments and reports
#           whether $_ ends with one of the suffixes (ignoring case),
#           or undef if the string contains no suffixes at all
sub makeSuffixMatcher
{
  my ($suffixSpec) = @_;

  my @suffixes = split(/\|/, $suffixSpec);
  return unless scalar(@suffixes) > 0;

  # quotemeta makes every suffix match literally - otherwise the "." in ".aif"
  # would act as a regex wildcard (so "songxaif" would match) and a suffix
  # containing characters like "+" or "(" would be misparsed as regex syntax
  my $alternatives = join('|', map { quotemeta($_) } @suffixes);

  # compile the pattern once; the returned closure just applies it to $_
  my $suffixRegex = qr/(?:$alternatives)$/i;
  return sub { $_ =~ $suffixRegex };
}

my $matchSomeSuffix; # reference to a subroutine for matching suffixes
if (defined($ARGV[0]))
{
  # the list of desired suffixes is supplied in $ARGV[0]
  # separated by vertical bars - e.g. ".mp3|.aiff"
  # Note that if $ARGV[0] is '', then no matcher is created
  # and all files will be looked at
  $matchSomeSuffix = makeSuffixMatcher($ARGV[0]);

  # the remaining command-line arguments are the directories to be searched
  shift @ARGV;
}

# the directories to search are whatever is in @ARGV at this point;
# when it is empty we fall back to the current directory
my @searchDirs = @ARGV;
@searchDirs = (".") unless @searchDirs;

# refuse to go any further if one of them is not a directory
for my $candidate (@searchDirs)
{
  -d $candidate or die "\"$candidate\" is not a directory\n";
}

# global variable: maps each MD5 digest to a reference to an array
# holding one record per file that has that digest
my %filesByMd5;

# calcMd5: returns the MD5 digest of the given file
# Argument: the name of the file (opened exactly as given)
# Returns:  the digest as a string of hexadecimal digits,
#           or the string "unsupported" if the name refers to a directory
# Dies if the file cannot be opened.
sub calcMd5($)
{
  my ($filename) = @_;

  if (-d $filename)
  {
    # doing MD5 on a directory is not supported
    return "unsupported"; # we need to return something
  }

  # The three-argument form of open takes the name literally, so names with
  # leading or trailing whitespace, a file called "-" (which two-argument
  # open treats as STDIN) or a name starting with "&" are all opened as
  # ordinary files. The lexical filehandle avoids clobbering a global FILE.
  open(my $fh, '<', $filename)
     or die "Unable to open file \"$filename\": $!\n";
  binmode($fh); # just in case we're on Windows!
  my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
  close($fh);
  return $md5;
}

# checkFile: the callback handed to 'find', which invokes it on each file
# or directory in turn.
# It reads the entry's name from $_ and its directory from $File::Find::dir.
# Entries that are not plain files, whose names start with "._", or that are
# rejected by the suffix matcher (when there is one) are skipped.
# For every other file a record holding its directory, name, size and MD5
# digest is appended to the list kept in %filesByMd5 under that digest.
sub checkFile()
{
  my $filename = $_;
  my $dirname = $File::Find::dir;

  # only plain files are of interest, not directories
  return unless -f $filename;

  # files whose names start with "._" are ignored
  return if $filename =~ /^\._/;

  # apply the suffix filter, which tests $_, if one was set up
  return if defined($matchSomeSuffix) && !$matchSomeSuffix->();

  my $statInfo = stat($filename);
  if (!$statInfo)
  {
    warn "Can't stat file \"$dirname/$filename\": $!\n";
    return;
  }

  my $md5 = calcMd5($filename);

  # file the record under its digest - files that share a digest
  # end up in the same list
  push(@{$filesByMd5{$md5}}, {
    'dirname'  => $dirname,
    'filename' => $filename,
    'size'     => $statInfo->size,
    'md5'      => $md5,
  });
}

# MAIN: scans the search directories, then for every set of files that
# share an MD5 digest prints one "rm <dir>/<file>" line per file to STDOUT.
# Note that a line is printed for every copy in a set, not just the extras.
# NOTE(review): the paths are printed unquoted, so names containing spaces
# or shell metacharacters will not survive being run as commands.
MAIN:
{
  # traverse the directories and build up lists of dupes in %filesByMd5
  find(\&checkFile, @searchDirs);

  # for each set of dupes, print the full path to the files
  # (the digests are visited in sorted order so that the order of the sets
  # in the output is the same from one run to the next)
  my $numDupes = 0;
  my $numDupeBytes = 0;
  foreach my $key (sort keys %filesByMd5)
  {
    my @dupList = @{$filesByMd5{$key}};
    my $numCopies = scalar(@dupList);
    next unless $numCopies > 1;

    my $size = -1; # -1 means "size of the first file not yet recorded"
    foreach my $fileInfo (@dupList)
    {
      my $dirname = $fileInfo->{dirname};
      my $filename = $fileInfo->{filename};
      if ($size == -1)
      {
        $size = $fileInfo->{size};
      }
      elsif ($size != $fileInfo->{size}) # sanity check
      {
        # diagnostics go to STDERR: STDOUT holds only the "rm" lines
        # and is meant to be redirected into a script
        warn "File sizes don't match!\n";
        warn "previous: $size current: $fileInfo->{size}\n";
      }

      print "rm $dirname/$filename\n";
    }
    #print "----------\n";

    $numDupes += ($numCopies - 1);
    $numDupeBytes += ($size * ($numCopies - 1));
  }

  # the totals are only used by the summary lines below, which are disabled
  # so that the output consists of nothing but "rm" lines
  my $numDupeMegabytes = sprintf("%.1f", $numDupeBytes / (1024 * 1024));
  #print "Number of duplicate files: $numDupes\n";
  #print "Megabytes duplicated: $numDupeMegabytes\n";
}

Source: http://hayne.net/MacDev/Perl/findDupeFiles

I used it to match test-case output against a predefined set and find out which cases failed. I modified the script to remove the "-----" line after every match and to prepend rm to every file name, so that I can redirect the output to a .bat file and remove the common files.
Usage:
perl findDupeFiles.pl '.aif|.aiff' AAA BBB CCC > rm.bat



AttachmentSize
findDupeFiles.pl_.txt4.73 KB
Your rating: None Average: 1 (1 vote)