CheckDuplicates

From wiki
Revision as of 12:21, 29 June 2020 by Hdridder (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Python script to check for files with the same content.

#!/usr/bin/env python3

def usage():
    """Print usage information and the script version, then exit with status 1."""
    # Note the space before <directory>: the original concatenation omitted it,
    # producing a garbled usage line.
    print(f"Usage: {__file__} <directory>")
    print("Check for duplicate files using their hash")
    version = 'R1A, 19620224, Initial release'
    print("Version: " + version)
    sys.exit(1)


# USER CONFIGURABLE ITEMS

BLOCKSIZE = 2**16

# END USER CONFIGURABLE ITEMS, DO NOT CHANGE ANYTHING BELOW THIS LINE

import os
import sys
import hashlib
from collections import defaultdict

def main():
    """Scan the directory given as the first CLI argument and print duplicates.

    Every regular file under the directory is hashed; files sharing a hash
    are printed as a group, with groups separated by a blank line (the
    trailing ``print('\\n')`` emits two newlines, matching the original
    output format). Exits via usage() when no directory argument is given.
    """
    if len(sys.argv) < 2:
        usage()
    # Map content-hash -> list of file paths with that hash. This single
    # mapping replaces the original's redundant hashset/duphashes sets.
    filedict = defaultdict(list)
    for directory, _subdirs, filenames in os.walk(sys.argv[1]):
        for filename in filenames:
            # os.path.join is portable; '/'.join breaks on Windows.
            filepath = os.path.join(directory, filename)
            # os.walk can list broken symlinks etc.; hash only regular files.
            if os.path.isfile(filepath):
                filedict[get_hash(filepath)].append(filepath)
    # Iterating the dict (insertion order) makes output deterministic,
    # unlike iterating a set of hashes.
    for paths in filedict.values():
        if len(paths) > 1:
            for path in paths:
                print(path)
            print('\n')
    return


def get_hash(filespec):
    """Return the SHA-256 hex digest of the file at *filespec*.

    The file is consumed in BLOCKSIZE-byte chunks so arbitrarily large
    files can be hashed without being loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(filespec, 'rb') as infile:
        # iter() with a sentinel yields chunks until read() returns b''.
        for chunk in iter(lambda: infile.read(BLOCKSIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


main()