Difference between revisions of "CheckDuplicates"

From wiki
Jump to navigation Jump to search
(Created page with "Category:Python Python script to check for files with the same content. <syntaxhighlight> #!/usr/bin/env python3 def usage(): print ("Usage: "+__file__ +'<direct...")
 
m
 
Line 2: Line 2:
 
[[Python]] script to check for files with the same content.
 
[[Python]] script to check for files with the same content.
  
<syntaxhighlight>
+
<syntaxhighlight lang=python>
 
#!/usr/bin/env python3
 
#!/usr/bin/env python3
  

Latest revision as of 12:21, 29 June 2020

Python script to check for files with the same content.

#!/usr/bin/env python3

def usage():
    """Print usage/version information and exit with status 1."""
    # Original message concatenated __file__ directly against '<directory>'
    # with no separating space; fixed here.
    print("Usage: " + __file__ + " <directory>")
    print("Check for duplicate files using their hash")
    version = 'R1A, 19620224, Initial release'
    print("Version: " + version)
    sys.exit(1)


# USER CONFIGURABLE ITEMS

# Chunk size (bytes) used when reading files for hashing in get_hash();
# 2**16 = 64 KiB per read, so large files never have to fit in memory.
BLOCKSIZE = 2**16

# END USER CONFIGURABLE ITEMS, DO NOT CHANGE ANYTHING BELOW THIS LINE

import os
import sys
import hashlib
from collections import defaultdict

def main():
    """Walk the directory named in sys.argv[1], hash every regular file,
    and print each group of files with identical content, groups separated
    by a blank line.

    Exits via usage() (status 1) when no directory argument is given.
    """
    if len(sys.argv) < 2:
        usage()
    # Group file paths by content hash; a group with more than one entry
    # is a set of duplicates.  (Replaces the original's redundant
    # hashset/duphashes bookkeeping.)
    files_by_hash = defaultdict(list)
    for directory, _subdirs, filenames in os.walk(sys.argv[1]):
        for filename in filenames:
            # os.path.join is portable, unlike '/'.join on Windows.
            filepath = os.path.join(directory, filename)
            # os.walk can list broken symlinks, sockets, etc. — hash
            # regular files only.
            if os.path.isfile(filepath):
                files_by_hash[get_hash(filepath)].append(filepath)
    for paths in files_by_hash.values():
        if len(paths) > 1:
            for path in paths:
                print(path)
            print('\n')  # separator between duplicate groups (as original)
    return


def get_hash(filespec, blocksize=None):
    """Return the SHA-256 hex digest of the file at *filespec*.

    The file is read in chunks of *blocksize* bytes so arbitrarily large
    files never have to fit in memory.  When *blocksize* is None (the
    default, preserving the original call signature's behavior) the
    module-level BLOCKSIZE constant is used, resolved at call time so the
    user-configurable value is still honored.
    """
    if blocksize is None:
        blocksize = BLOCKSIZE
    hasher = hashlib.sha256()
    with open(filespec, 'rb') as fh:
        buf = fh.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = fh.read(blocksize)
    return hasher.hexdigest()


# Guard the entry point so importing this module does not immediately
# walk a directory and exit; the original called main() unconditionally.
if __name__ == '__main__':
    main()