• Re: program to remove duplicates

    From Josef Möllers@josef@invalid.invalid to comp.lang.c on Tue Oct 1 16:34:47 2024
    From Newsgroup: comp.lang.c

    On 21.09.24 20:53, fir wrote:


    i think if to write a simple comandline program
    that remove duplicates in a given folder
    [...]

    I have had the same problem. My solution was to use extended file
    attributes and some file checksum, eg sha512sum, also, I wrote this in
    PERL (see code below). Using the file attributes, I can re-run the
    program after a while without having to re-calculate the checksums.
    So, this solution only works for filesystems that have extended file attributes, but you could also use some simple database (sqlite3?) to
    map checksums to pathnames.

    What I did was to walk through the directory tree and check if the file
    being considered already has a checksum in an extended attribute. If
    not, I'll calculate the checksum and store that in the extended
    attribute. Also, I store the pathname in a hash (remember, this is
    PERL), key is the checksum.
    If there is a collision (checksum already in the hash), I remove the new
    file (and link the new filename to the old file). One could be paranoid
    and do a byte-by-byte file comparison then.

    If I needed to do this in a C program, I'd probably use a GList to store
    the hash, but otherwise the code logic would be the same.

    HTH,

    Josef

    #! /usr/bin/perl

    use warnings;
    use strict;
    use File::ExtAttr ':all'; # In case of problems, maybe insert "use Scalar:Utils;" in /usr/lib/x86_64-linux-gnu/perl5/5.22/File/ExtAttr.pm
    use Digest::SHA;
    use File::Find;
    use Getopt::Std;

    # OPTIONS:
    # s: force symlink
    # n: don't do the actual removing/linking
    # v: be more verbose
    # h: print short help
    my %opt = (
        s => undef,
        n => undef,
        v => undef,
        h => undef,
    );

    # One usage text, shared by -h and by the bad-option path below.
    my $usage = join '',
        "usage: lndup [-snvh] [dirname..]\n",
        "\t-s: use symlink rather than hard link\n",
        "\t-n: don't remove/link, just show what would be done\n",
        "\t-v: be more verbose (show pathname and SHA512 sum)\n",
        "\t-h: show this text\n";

    # getopts() returns false when it meets a switch that is not in the
    # list. Stop in that case instead of carrying on: a mistyped -n would
    # otherwise turn a dry run into a real one.
    if (!getopts('hnsv', \%opt)) {
        print STDERR $usage;
        exit(2);
    }

    if ($opt{h}) {
        print STDERR $usage;
        exit(0);
    }

    # Maps SHA512 hex digest => pathname of the file kept for that digest;
    # filled in and consulted by lndup().
    my %file;

    # Walk the directories named on the command line (default: the current
    # directory). With no_chdir File::Find does not chdir() into each
    # directory, so the pathnames lndup() sees stay valid relative to the
    # directory the program was started in.
    find({ wanted => \&lndup, no_chdir => 1 }, @ARGV ? @ARGV : '.');

    # NAME: lndup
    # PURPOSE: To handle a single file (File::Find "wanted" callback)
    # ARGUMENTS: None, pathname is taken from $File::Find::name
    # RETURNS: Nothing
    # NOTE: Only non-empty regular files are considered.
    # The SHA512 sum of a file is taken from its "SHA512" extended
    # attribute or, if that is missing, calculated and stored there.
    # IF a file with the same sum was already found earlier, AND
    # IF both files are NOT the same (device and inode differ)
    # THEN the second occurrence is replaced by a hard link to the
    # first occurrence, or by a symbolic link if the files are on
    # different disks or -s was given.
    # With -n the replacement is only printed, not performed.
    # NOTE(review): a checksum found in the extended attribute is trusted
    # as-is; nothing here notices that a file was modified after the
    # attribute was written. Confirm that this is acceptable, or add a
    # byte-by-byte comparison before replacing.
    sub lndup {
        my $pathname = $File::Find::name;

        return if ! -f $pathname;
        return if ! -s $pathname;    # empty files are never linked

        my $sha512sum = getfattr($pathname, 'SHA512');
        if (!defined $sha512sum) {
            my $ctx = Digest::SHA->new(512);
            $ctx->addfile($pathname);
            $sha512sum = $ctx->hexdigest;
            print STDERR "$pathname $sha512sum\n" if $opt{v};
            setfattr($pathname, "SHA512", $sha512sum);
        } elsif ($opt{v}) {
            print STDERR "Using sha512sum from attributes\n";
        }

        # First file with this content: remember it and be done.
        if (!exists $file{$sha512sum}) {
            $file{$sha512sum} = $pathname;
            return;
        }

        # Already two names for one inode: nothing to gain.
        return if same_file($pathname, $file{$sha512sum});

        my $links1 = (stat($pathname))[3];
        my $links2 = (stat($file{$sha512sum}))[3];
        # If one of them is a symbolic link, make sure it's $pathname
        if (is_symlink($file{$sha512sum})) {
            print STDERR "Swapping $pathname and $file{$sha512sum}\n" if $opt{v};
            swap($file{$sha512sum}, $pathname);
        }
        # If $pathname has more links than $file{$sha512sum},
        # exchange the two names.
        # This ensures that $file{$sha512sum} has the most links.
        # Not done when $pathname is a symbolic link: that would move the
        # symbolic link into $file{$sha512sum}, which the test above is
        # there to prevent.
        elsif (!is_symlink($pathname) && $links1 > $links2) {
            print STDERR "Swapping $pathname and $file{$sha512sum}\n" if $opt{v};
            swap($file{$sha512sum}, $pathname);
        }

        my $keep        = $file{$sha512sum};
        my $use_symlink = $opt{s} || !same_disk($pathname, $keep);

        # The target of a symbolic link is resolved relative to the
        # directory that holds the link, not relative to our working
        # directory, so a relative $keep has to be made absolute first.
        require File::Spec;
        my $target = $use_symlink ? File::Spec->rel2abs($keep) : $keep;
        my $how    = $use_symlink ? 'symlink' : 'link';
        my $ln     = $use_symlink ? 'ln -s' : 'ln';

        print "rm \"$pathname\"; $ln \"$target\" \"$pathname\"\n";
        return if $opt{n};

        # Create the new link under a temporary name and rename() it over
        # $pathname. If creating the link fails, $pathname is still there,
        # whereas unlinking first would already have removed it.
        my $tmpname = "$pathname.lndup.$$";
        my $created = $use_symlink ? symlink($target, $tmpname)
                                   : link($target, $tmpname);
        if (!$created) {
            print STDERR "Failed to $how($target, $pathname): $!\n";
            return;
        }
        if (!rename($tmpname, $pathname)) {
            print STDERR "Failed to replace $pathname: $!\n";
            unlink($tmpname);
        }
    }

    # NAME: same_disk
    # PURPOSE: To tell whether two pathnames live on the same filesystem
    # ARGUMENTS: pn1, pn2: the two pathnames to compare
    # RETURNS: true if both have the same device number, else false
    # NOTE: Only field 0 (the device number) of stat() is looked at.
    # If a stat() fails its device number is undef; two failed
    # stat()s therefore compare as equal.
    sub same_disk {
        my ($pn1, $pn2) = @_;

        my $dev1 = (stat($pn1))[0];
        my $dev2 = (stat($pn2))[0];

        return $dev1 == $dev2;
    }

    # NAME: same_file
    # PURPOSE: To tell whether two pathnames refer to one and the same file
    # ARGUMENTS: pn1, pn2: the two pathnames to compare
    # RETURNS: true if both pathnames refer to the same file, else false
    # NOTE: "Same" means that stat() reports the same device number AND
    # the same inode number for both pathnames, as is the case for
    # two hard links to one file.
    sub same_file {
        my ($pn1, $pn2) = @_;

        # stat() in list context: the first two fields are device and inode.
        my ($dev1, $ino1) = stat($pn1);
        my ($dev2, $ino2) = stat($pn2);

        return $dev1 == $dev2 && $ino1 == $ino2;
    }

    # NAME: is_symlink
    # PURPOSE: To check if a pathname is a symbolic link
    # ARGUMENTS: path: pathname to examine
    # RETURNS: result of the -l file test on path:
    # true if path is a symbolic link, else false
    sub is_symlink {
        my $path = shift;

        return -l $path;
    }

    # NAME: swap
    # PURPOSE: To exchange the values of two variables of the caller
    # ARGUMENTS: two scalars (variables or hash/array elements)
    # RETURNS: Nothing the caller is meant to use
    # NOTE: The elements of @_ are aliases of the caller's arguments,
    # so assigning to $_[0] and $_[1] changes the caller's data.
    sub swap {
        # Copy both values first, then write them back crosswise.
        my ($first, $second) = @_;

        $_[0] = $second;
        $_[1] = $first;
    }
    --- Synchronet 3.20a-Linux NewsLink 1.114
  • From gazelle@gazelle@shell.xmission.com (Kenny McCormack) to comp.lang.c on Tue Oct 1 20:38:23 2024
    From Newsgroup: comp.lang.c

    In article <lm2fk7FpccjU1@mid.individual.net>,
    Josef Möllers <josef@invalid.invalid> wrote:
    ...
    I have had the same problem. My solution was to use extended file
    attributes and some file checksum, eg sha512sum, also, I wrote this in
    PERL (see code below). Using the file attributes, I can re-run the

    And is thus entirely OT here. Keith will tell the same.
    --
    "You can safely assume that you have created God in your own image when
    it turns out that God hates all the same people you do." -- Anne Lamott

    --- Synchronet 3.20a-Linux NewsLink 1.114