#!/usr/bin/perl
# Copyright (c) 2000 Flavio Glock <fglock@pucrs.br>. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# repetidos.pl
# Finds duplicate directories and files.
# Makes a "cache" if asked.
#
# USAGE:
#	See: repetidos.pl --help
#
# TODO:
#	pod
#	--outfile   to store report
#	use compare_text for text files (?)
#	--load-index=file --verify   to check file integrity
#	--base-dir=dir --relative    to make non-absolute filenames
#	save file dates (?)
#	invert compare - show what is different
#	
$VERSION = "0.012";
#
# 0.012
#	glob
#
# 0.011
#	doesn't change files atime
#	don't show files if they are in duplicate dirs already, unless some of them are in other places
#	--verbose   to see "read: ..."
#
# 0.010
#	corrected dir splice error
#	if parent dirs match, doesn't show subdirs
#
# 0.009
#	save and load index files: can be used to make a "cache" for CD-ROMs
#	use Digest::MD5 instead of File::Compare
#	much more processing required!
#	safer separator: \t
#
# 0.008
#	Last version using File::Compare
#	always group compares
#
# 0.007
#	prints filenames using "\" in Windows
#
# 0.006
#	more uniform compare
#	doesn't miss compares
#	Directories can match even if filenames are different
#
# 0.005
#	nicer output
#	Uses more pointers
#
# 0.004
#   Corrected Subdir size
#
# 0.003
#	Sorts numerically by directory size
#
# 0.002
#	Subdir compare is exact
#
# 0.001
# 	Subdir compare is uncertain - will show "?"
# 	File compare is exact
#

use Digest::MD5;
use Getopt::Long;

GetOptions(
    'save-index=s'   => \$SAVE_INDEX,
    'load-index=s'   => \@LOAD_INDEX,
	'verbose!'	     => \$VERBOSE,
    'version'        => \&print_version,
    'help'           => \&usage,
    ) || usage();

$dots = 20;
$join = '/';
$separator = "\t";
$nFields = 3;

@b = ();
@subdir = ();
%visited = ();

foreach (@LOAD_INDEX) {
foreach (glob $_) {
	print "Loading: $_\n\n" if $VERBOSE;
	open (FILE, "<$_");
	$type = 0;
	foreach(<FILE>) {
		chomp($_);
		if ($_ eq "DIRECTORIES") {
			$type = 1;
		}
		elsif ($_ eq "FILES") {
			$type = 2;
		}
		elsif ($type == 1) {
			push @subdir, $_;
			($a) = /\d$separator(.*?)$separator/;
			$visited{$a} = 1;
		}
		elsif ($type == 2) {
			push @b, $_;
		}
	}
	close (FILE);
}
}

$a  = shift;
usage() unless $a or @b or @subdir;

print "Reading:" if $VERBOSE;
while ($a) {
foreach (glob $a) {
	$b = $_;
	# $b =~ s|\\|\/|g;
	$b =~ s/([^:])\/$/$1/;
	if ($visited{$b}) {
		print "\n\t(", &print_filename($b), " done)" if $VERBOSE;
	}
	else {
		$size = &readmydir ($b);
		if (-d $b) {
			push @subdir, "$size$separator$b$separator$md5";
			$visited{$b} = 1;
		}
	}
}
	$a  = shift;
}
undef %visited;
print "\n\n" if $VERBOSE;

print "Sorting: " if $VERBOSE;
# sort files in reverse order of filesize
@b = sort  {$b <=> $a} @b;
# sort dirs in reverse order of the sum of filesizes
@subdir = sort {$b <=> $a} @subdir;

if ($SAVE_INDEX) {
	print "Saving: $SAVE_INDEX\n" if $VERBOSE;
	open (FILE, ">$SAVE_INDEX");
	print FILE "DIRECTORIES\n";
	print FILE join("\n", @subdir);
	print FILE "\nFILES\n";
	print FILE join("\n", @b);
	close (FILE);
}

foreach(0 .. $#b) {
	($filesize[$_], $filename[$_], $filemd5[$_]) = split($separator, $b[$_], $nFields);
	# print "($filesize[$_], $filename[$_], $filemd5[$_])\n";
}
undef @b;
foreach(0 .. $#subdir) {
	($dirsize[$_], $dirname[$_], $dirmd5[$_]) = split($separator, $subdir[$_], $nFields);
}
undef @subdir;
print "done.\n\n" if $VERBOSE;

print "Duplicate Subdirs:\n";
foreach (0 .. $#dirname) {
	$this = $_;
	$last = $_ + 1;
	$match = 0;
	while (($last <= $#dirname) and	($dirsize[$this] == $dirsize[$last])) {
		if ( &compare_dir ($this, $last) == 0 ) {
			$report_size = $dirsize[$this];
			$report_size =~ s/(.*)\.(.*)/$1 bytes in $2 file/;
			$report_size .= 's' if $2 > 1;
			print "\n\t", &print_filename($dirname[$this]), " \t$report_size\n" unless $match;
			print "\t", &print_filename($dirname[$last]), "\n";

			# remove subdirs from list

			for ($i = $#dirname; $i > $last; $i--) {
				if ((substr($dirname[$i], 0, length($dirname[$this]))  eq $dirname[$this]) or
					(substr($dirname[$i], 0, length($dirname[$last]))  eq $dirname[$last]) )
				{
					# print "  $dirname[$i] is subdir\n";
					splice(@dirname, $i, 1);
					splice(@dirsize, $i, 1);
					splice(@dirmd5,  $i, 1);
				}
			}

			splice(@dirname, $last, 1);
			splice(@dirsize, $last, 1);
			splice(@dirmd5,  $last, 1);
			$match = 1;
		}
		else {
			$last++;
		}
	}
}
print "\n";

print "Duplicate Files:\n";
foreach(0 .. $#filename) {
	$last = $_ + 1;
	$match = 0;
	$unmarked = 0;
	$report = '';
nextfile:
	while (($last < $#filename) and	($filesize[$_] == $filesize[$last])) {
		if ( &compare_file ($_, $last) == 0 ) {
			$unmarked = 1 if not $files_marked[$_];
			$unmarked = 1 if not $files_marked[$last];
			$report .= "\n\t" . &print_filename($filename[$_]) . " \t$filesize[$_] bytes\n" unless $match;
			$report .= "\t" . &print_filename($filename[$last]) . "\n";
			splice(@filename, $last, 1);
			splice(@filesize, $last, 1);
			splice(@filemd5,  $last, 1);
			splice(@files_marked, $last, 1);
			$match = 1;
		}
		else {
			$last++;
		}
	}
	print $report if $unmarked;
}

sub readmydir {
	# globals: @subdir, @b
	my ($a) = @_;
	my ($size, $atime, $mtime);
	my ($md5, $count);

	if (-d $a) {
		# is-subdir
		my @dir;
		my ($tmpsize, $tmpmd5);
		$size = 0;

		if ($visited{$a}) {
			print "\n\t(", &print_filename($a), " done)" if $VERBOSE;
		}
		else {
			print "\n\tread:\t", &print_filename($a), " \t" if $VERBOSE;
	        my (@file_stat) = stat($a);
			if (@file_stat) {
				# $size  = $file_stat[7];
				$atime = $file_stat[8];
				$mtime = $file_stat[9];
				opendir(DIR,$a);
				@dir = readdir(DIR);
				closedir DIR;
				$count = 0;
				foreach(@dir) {
					if (($_ ne ".") and ($_ ne "..")) {
						$item = "$a$join$_";
						($tmpsize, $tmpmd5) = readmydir ($item);
						$size += $tmpsize;
						$count ++;
						# tmpmd5 ??
					}
				}
				push @subdir, "$size.$count$separator$a$join$separator$md5";
				$visited{$a} = 1;
				utime $atime, $mtime, $a;
				return $size;
			}
			else {
				return -1;
			}
		}
	} 
	else {
		# is-file
        my (@file_stat) = stat($a);
		if (@file_stat) {
			$size  = $file_stat[7];
			$atime = $file_stat[8];
			$mtime = $file_stat[9];
			open(FILE,$a);
		    binmode(FILE);
			# $digest = md5_base64(*FILE);
			$md5 = Digest::MD5->new->addfile(*FILE)->b64digest;
		    # print "\n $md5 $a ";
			close(FILE);
			utime $atime, $mtime, $a;
			# print "FILE: $a = $md5\n";
			if ($k++ > $dots) { 
				print "." if $VERBOSE; 
				$k = 0; 
			}
			push @b, "$size$separator$a$separator$md5";
			return ($size, $md5);
		}
		else {
			return (-1,"");
		}
	}
}

sub compare_file {
	# globals: @filesize, @filename
	my ($this, $other) = @_;

	return  1 if ($filesize[$this] < 1) or ($filesize[$other] < 1);
	return  0 if ($filemd5[$this] eq $filemd5[$other]);
	return  1;
}

sub compare_file_zero {
	# globals: @filesize, @filename
	my ($this, $other) = @_;

	return  0 if ($filemd5[$this] eq $filemd5[$other]);
	return  1;
}

sub compare_dir {
	# globals: @dirname, @dirsize, @filename
	my ($this, $other) = @_;
	my (@files_this, @files_other, $tmp, $tmp_this, $tmp_other);
	my (@files_matched);
	return -2 if ($this == $other);
	return  1 if ($dirsize[$this] != $dirsize[$other]);
	return -3 if ($dirsize[$this] < 1);
	return -4 if ($dirname[$this] eq $dirname[$other]);
	return -5 if
		( substr($dirname[$other], 0, length($dirname[$this]))  eq $dirname[$this]) or
		( substr($dirname[$this],  0, length($dirname[$other])) eq $dirname[$other]);
	# get list of files in subdirs
		$tmp_this = $dirname[$this];
		$tmp_other = $dirname[$other];
		@files_this = ();
		@files_other = ();
		foreach(0 .. $#filename) {
			$tmp = substr($filename[$_], 0, length($tmp_this));
			if ($tmp eq $tmp_this) {
				push @files_this, $_;
			}
			$tmp = substr($filename[$_], 0, length($tmp_other));
			if ($tmp eq $tmp_other) {
				push @files_other, $_;
			}
		}
	# compare files
		CMPFILE: foreach $this (0 .. $#files_this) {
			foreach $other (0 .. $#files_other) {
				if ( &compare_file_zero ($files_this[$this], $files_other[$other]) == 0 ) {
					# match
					push @files_matched, $files_this[$this];
					push @files_matched, $files_other[$other];
					splice(@files_other, $other, 1);
					next CMPFILE;
				}
			}
			# no match
			return 1;
		}
	# equal
		foreach (@files_matched) {
			# print " * $_ ";
			$files_marked[$_] = 1;
		}
	# report
		return 0;
}

sub print_filename {
    if ($^O =~ /win32/i) { 
        my $a = $_[0];
		$a =~ tr|\/|\\|;
		return $a;
    }
	return $_[0];
}

sub print_version {
	print "repetidos.pl version $VERSION\n";
	exit 0;
}

sub usage {
	print <<EOT;
usage:
    repetidos.pl [options] [directory|file] ([directory|file] ...)

options:
    -s --save-index=file
    -l --load-index=file ([load-index=file] ...)
    -h --help
    --version
    --verbose

example:
    repetidos.pl win* --verbose
EOT
	exit 0;
}

1;
