#!/usr/bin/env perl
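# dupfind - find (and optionally remove) duplicate files in a directory
# tree by grouping files on size, weeding out near-misses with quick
# byte sampling, and confirming what remains with xxHash digests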
use strict;
use warnings;
use 5.010;
BEGIN { $|++ }
use File::Util;
use Digest::xxHash 'xxhash_hex';
use Getopt::Long;
use Term::Prompt 'prompt';
use Term::ProgressBar;
use Time::HiRes 'usleep';
use Benchmark ':hireswallclock';
my $opts =
{
dir => undef,
help => undef,
bytes => 1024 ** 3, # 1 GB max read
maxdepth => 10,
prompt => 0,
remove => 0,
links => 0,
format => 'human', # formatting options are either "human" or "robot"
weed => 1,
};
GetOptions
(
'dir|d=s' => \$opts->{dir},
'bytes|b=i' => \$opts->{bytes},
'maxdepth|m=i' => \$opts->{maxdepth},
'links|l' => \$opts->{links},
'prompt|p' => \$opts->{prompt},
'remove|x' => \$opts->{remove},
'delete' => \$opts->{remove}, # not a typo: --delete is an alias for --remove
'help|h|?' => \$opts->{help},
'format|f=s' => \$opts->{format},
'weed|w=s' => \$opts->{weed},
) or die usage();
print usage() and exit if $opts->{help};
die usage() unless defined $opts->{dir};
$opts->{remove}++ if $opts->{prompt};
$opts->{weed} = 0 if $opts->{weed} =~ /^(n|no)$/i;
my $ftl = File::Util->new
(
{
use_flock => 0,
diag => 1,
read_limit => $opts->{bytes},
abort_depth => $opts->{maxdepth},
onfail => 'undefined',
}
);
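# wallclock timings for each pipeline stage, recorded by bench_this()
# and rendered by calculate_bench_times()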
my $benchmarks =
{
scanfs => { },
prune => { },
weed => { },
digest => { },
remove => { },
run => { },
};
my ( $scan_count, $size_dup_count, $dup_count ) = run();
say <<__SUMMARY__;
------------------------------
** THREADS...............No threading
** TOTAL FILES SCANNED...$scan_count
** TOTAL SAME SIZE.......$size_dup_count
** TOTAL ACTUAL DUPES....$dup_count
-- TIMES --
** TREE SCAN TIME........$benchmarks->{scanfs}->{result}
** HARDLINK PRUNE TIME...$benchmarks->{prune}->{result}
** WEED-OUT TIME.........$benchmarks->{weed}->{result}
** HASHING TIME..........$benchmarks->{digest}->{result}
** DELETION TIME.........$benchmarks->{remove}->{result}
** TOTAL RUN TIME........$benchmarks->{run}->{result}
__SUMMARY__
exit;
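# top-level pipeline: scan for same-size files, drop hardlinks, weed
# out obvious non-duplicates, hash what remains, then report (and
# optionally delete) the confirmed duplicates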
sub run
{
bench_this( run => 'start' );
my ( $size_dups, $scan_count, $size_dup_count ) = scanfs();
$size_dups = prune_hardlinks( $size_dups );
$size_dups = weed_out( $size_dups ) if $opts->{weed}; # honor --weed no
my $digest_dups = digest_the_rest( $size_dups );
undef $size_dups; # free up some RAM
say '** DISPLAYING OUTPUT'; say '-' x 30;
my $dup_count = show_dups( $digest_dups );
delete_dups( $digest_dups ) if $opts->{remove};
bench_this( run => 'end' );
calculate_bench_times();
return $scan_count, $size_dup_count, $dup_count;
}
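# the next four subs are thin, benchmarked wrappers around the pipeline
# stages; each exits early if no duplicate candidates remain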
sub scanfs
{
bench_this( scanfs => 'start' );
say '** SCANNING ALL FILES FOR SIZE DUPLICATES';
my ( $size_dups, $scan_count, $size_dup_count ) = get_size_dups();
bench_this( scanfs => 'end' );
say '** NO DUPLICATES FOUND' and exit unless keys %$size_dups;
return $size_dups, $scan_count, $size_dup_count;
}
sub prune_hardlinks
{
my $size_dups = shift;
say '** PRUNING HARD LINKS';
bench_this( prune => 'start' );
$size_dups = toss_out_hardlinks( $size_dups );
bench_this( prune => 'end' );
say '** NO DUPLICATES FOUND' and exit unless keys %$size_dups;
return $size_dups;
}
sub weed_out
{
my $size_dups = shift;
say '** WEEDING-OUT FILES THAT ARE OBVIOUSLY DIFFERENT';
bench_this( weed => 'start' );
$size_dups = remove_obviously_different( $size_dups );
bench_this( weed => 'end' );
say '** NO DUPLICATES FOUND' and exit unless keys %$size_dups;
return $size_dups;
}
sub digest_the_rest
{
my $size_dups = shift;
say '** CHECKSUMMING SIZE DUPLICATES';
bench_this( digest => 'start' );
my $digest_dups = get_dup_digests( $size_dups );
bench_this( digest => 'end' );
say '** NO DUPLICATES FOUND' and exit unless keys %$digest_dups;
return $digest_dups;
}
sub remove_obviously_different
{
# weed out files that are obviously different, based on the first and
# last bytes of each file. This saves us from unnecessary hashing
my $size_dups = shift;
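# zero-byte files are all trivially identical, so set them aside and
# skip the byte-sampling passes for them (re-added at the bottom)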
my $zero_sized = delete $size_dups->{0};
my $dup_count = 0;
$dup_count += @$_ for map { $size_dups->{ $_ } } keys %$size_dups;
my $progress_bar = Term::ProgressBar->new
(
{
name => ' ...FIRST PASS',
count => $dup_count,
remove => 1,
}
);
my $i = 0;
for my $same_size ( keys %$size_dups )
{
my @group = sort { $a cmp $b } @{ $size_dups->{ $same_size } };
my $same_first_bytes = {};
for my $file ( @group )
{
my $first_bytes = get_file_first_bytes( $file => $same_size );
push @{ $same_first_bytes->{ $first_bytes } }, $file;
$progress_bar->update( $i++ );
}
# drop obvious non-dupes from the group of same-size files: any
# first-bytes sample seen only once ends up in a single-element arrayref
delete $same_first_bytes->{ $_ }
for grep { @{ $same_first_bytes->{ $_ } } == 1 }
keys %$same_first_bytes;
# recompose the arrayref of filenames for the same-size file grouping
# but leave out the files we just weeded out from the group
$size_dups->{ $same_size } = []; # start fresh
push @{ $size_dups->{ $same_size } },
map { @{ $same_first_bytes->{ $_ } } }
keys %$same_first_bytes;
}
$progress_bar->update( $i );
$dup_count = 0; # reset before recounting what survived the first pass
$dup_count += @$_ for map { $size_dups->{ $_ } } keys %$size_dups;
$i = 0; # reset
$progress_bar = Term::ProgressBar->new
(
{
name => ' ...SECOND PASS',
count => $dup_count,
remove => 1,
}
);
for my $same_size ( keys %$size_dups )
{
my @group = @{ $size_dups->{ $same_size } };
my $same_last_bytes = {};
for my $file ( @group )
{
my $last_bytes = get_file_last_bytes( $file => $same_size );
push @{ $same_last_bytes->{ $last_bytes } }, $file;
$progress_bar->update( $i++ );
}
# drop obvious non-dupes from the group of same-size files: any
# last-bytes sample seen only once ends up in a single-element arrayref
delete $same_last_bytes->{ $_ }
for grep { @{ $same_last_bytes->{ $_ } } == 1 }
keys %$same_last_bytes;
# recompose the arrayref of filenames for the same-size file grouping
# but leave out the files we just weeded out from the group
$size_dups->{ $same_size } = []; # start fresh
push @{ $size_dups->{ $same_size } },
map { @{ $same_last_bytes->{ $_ } } }
keys %$same_last_bytes;
}
$progress_bar->update( $i );
$size_dups->{0} = $zero_sized if ref $zero_sized;
return $size_dups;
}
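# sample the first 64 bytes of a file; returns undef on open failure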
sub get_file_first_bytes
{
my ( $file, $len ) = @_;
my $buff;
sysopen my $fh, $file, 0;
return unless defined $fh;
sysread $fh, $buff, 64;
close $fh or return;
return $buff;
}
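# sample the last 1024 bytes of a file; returns undef on open failure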
sub get_file_last_bytes
{
my ( $file, $len ) = @_;
my $buff;
sysopen my $fh, $file, 0;
return unless defined $fh;
sysseek $fh, $len > 1024 ? $len - 1024 : 0, 0; # never seek to a negative offset on small files
sysread $fh, $buff, 1024;
close $fh or return;
return $buff;
}
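# hardlinked paths share a device+inode pair and aren't true duplicates,
# so keep only one path per unique dev/inode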
sub toss_out_hardlinks
{
my $size_dups = shift;
for my $size ( keys %$size_dups )
{
my $group = $size_dups->{ $size };
my %dev_inodes;
# this will automatically throw out hardlinks, with the only surviving
# file being the first asciibetically-sorted entry
# join with a separator so stringified dev/inode pairs can't collide
$dev_inodes{ join ',', ( stat $_ )[0,1] } = $_ for reverse sort @$group;
if ( scalar keys %dev_inodes == 1 )
{
delete $size_dups->{ $size };
}
else
{
$size_dups->{ $size } = [ values %dev_inodes ];
}
}
return $size_dups;
}
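# recursively scan the target directory, bucketing regular files by
# size; buckets holding a single file can't contain duplicates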
sub get_size_dups
{
my ( $size_dups, $scan_count, $size_dup_count ) = ( {}, 0, 0 );
$ftl->list_dir
(
$opts->{dir} =>
{
recurse => 1,
callback => sub
{
my ( $selfdir, $subdirs, $files ) = @_;
$scan_count += @$files;
push @{ $size_dups->{ -s $_ } }, $_
for grep { !-l $_ && defined -s $_ } @$files;
}
}
);
delete $size_dups->{ $_ }
for grep { @{ $size_dups->{ $_ } } == 1 }
keys %$size_dups;
$size_dup_count += @$_ for map { $size_dups->{ $_ } } keys %$size_dups;
return $size_dups, $scan_count, $size_dup_count;
}
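# hash every remaining candidate file and group the names by digest;
# digests seen only once belong to unique files and get dropped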
sub get_dup_digests
{
my $size_dups = shift;
my $digests = {};
my $dup_count = 0;
my $prgs_iter = 0;
$dup_count += @$_ for map { $size_dups->{ $_ } } keys %$size_dups;
my $progress = Term::ProgressBar->new
(
{
name => ' ...PROGRESS',
count => $dup_count,
remove => 1,
}
);
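# slurp mode: each candidate file is read whole for hashing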
local $/;
for my $size ( keys %$size_dups )
{
my $group = $size_dups->{ $size };
for my $file ( @$group )
{
open my $fh, '<:raw', $file or next; # raw read so the digest reflects on-disk bytes
my $data = <$fh>;
close $fh;
my $digest = xxhash_hex $data, 0;
push @{ $digests->{ $digest } }, $file;
$progress->update( $prgs_iter++ );
}
}
delete $digests->{ $_ }
for grep { @{ $digests->{ $_ } } == 1 }
keys %$digests;
my $priv_digests = {};
# sort dup groupings
for my $digest ( keys %$digests )
{
my @group = @{ $digests->{ $digest } };
$priv_digests->{ $digest } = [ sort { $a cmp $b } @group ];
}
$progress->update( $dup_count );
undef $digests;
return $priv_digests;
}
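# print each duplicate group with the selected formatter and return
# the number of redundant copies (group size minus the one kept)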
sub show_dups
{
my $digests = shift;
my $dupes = 0;
my $for_humans = sub # human-readable output
{
my ( $digest, $files ) = @_;
say sprintf 'DUPLICATES (digest: %s | size: %db)', $digest, -s $$files[0];
say " $_" for @$files;
say '';
};
my $for_robots = sub # machine parseable output
{
my ( $digest, $files ) = @_;
say join "\t", @$files;
};
my $formatter = $opts->{format} eq 'human' ? $for_humans : $for_robots;
for my $digest
(
sort { $digests->{ $a }->[0] cmp $digests->{ $b }->[0] } keys %$digests
)
{
my $files = $digests->{ $digest };
$formatter->( $digest => $files );
$dupes += @$files - 1;
}
return $dupes;
}
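# keep the first file in each duplicate group and unlink the rest,
# asking first for each file when --prompt is in effect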
sub delete_dups
{
my $digests = shift;
bench_this( remove => 'start' );
my $removed = 0;
for my $digest ( keys %$digests )
{
my $group = $digests->{ $digest };
say sprintf 'KEPT (%s) %s', $digest, $group->[0];
shift @$group;
for my $dup ( @$group )
{
if ( $opts->{prompt} )
{
unless ( prompt 'y', "REMOVE DUPLICATE? $dup", '', 'n' )
{
say sprintf 'KEPT (%s) %s', $digest, $dup;
next;
}
}
unlink $dup or warn "COULD NOT REMOVE $dup! $!" and next;
$removed++;
say sprintf 'REMOVED (%s) %s', $digest, $dup;
}
say '--';
}
say "** TOTAL DUPLICATE FILES REMOVED: $removed";
bench_this( remove => 'end' );
}
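# stamp the named benchmark with a start or end Benchmark object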
sub bench_this
{
my ( $mark, $start_end ) = @_;
$benchmarks->{ $mark }->{ $start_end } = Benchmark->new();
}
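# turn each start/end Benchmark pair into a printable elapsed time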
sub calculate_bench_times
{
for my $mark ( keys %$benchmarks )
{
next unless $benchmarks->{ $mark }->{start};
$benchmarks->{ $mark }->{result} =
timestr timediff
(
$benchmarks->{ $mark }->{end},
$benchmarks->{ $mark }->{start}
);
}
$benchmarks->{weed}->{result} ||= 'did not weed';
$benchmarks->{remove}->{result} ||= 'no deletions';
}
# This is just the help message:
sub usage { <<'__USAGE__' }
USAGE:
dupfind [ --options ] --dir ./path/to/search/
EXAMPLES:
dupfind --format robot --maxdepth 100 --bytes 1099511627776 --dir /dedup
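dupfind --prompt --dir ~/Pictures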
DESCRIPTION:
Finds duplicate files in a directory tree. Options are explained
in detail below. Options marked with an asterisk (*) are not yet
implemented and are planned for a future release.
ARGUMENTS AND FLAGS:
-b, --bytes Maximum size in bytes that you are willing to compare.
The current default maximum is 1 gigabyte.
Sizing guide:
1 kilobyte = 1024
1 megabyte = 1048576 or 1024 ** 2
1 gigabyte = 1073741824 or 1024 ** 3
1 terabyte = 1099511627776 or 1024 ** 4
-d, --dir Name of the directory you want to search for duplicates.
-f, --format Specify either "human" or "robot". Human-readable output
is generated for easy viewing by default. If you want output
that is machine-parseable, specify "robot".
* -l, --links Follow symlinks (disabled by default). Because this
has safety implications and is a complex matter, it
is not yet supported. Sorry, check back later.
-m, --maxdepth The maximum directory depth to which the comparison
scan will recurse. Note that this limits how deep the
scan goes, not the total number of directories scanned.
-p, --prompt Interactively prompt before deleting each detected duplicate.
-x, --remove Delete (WITHOUT PROMPTING) all but the first copy if
duplicate files are found. This will leave you with no
duplicate files when execution is finished.
-w, --weed Either yes or no (default yes). Tries to avoid unnecessary
file hashing by weeding out potential duplicates with a
quick comparison of the first 64 bytes and the last 1024
bytes of same-size files. This typically produces very
significant performance gains, especially with large
numbers of files.
__USAGE__