Difference between revisions of "CheckDuplicates"
Jump to navigation
Jump to search
(Created page with "Category:Python Python script to check for files with the same content. <syntaxhighlight> #!/usr/bin/env python3 def usage(): print ("Usage: "+__file__ +'<direct...") |
m |
||
Line 2: | Line 2: | ||
[[Python]] script to check for files with the same content. | [[Python]] script to check for files with the same content. | ||
− | <syntaxhighlight> | + | <syntaxhighlight lang=python> |
#!/usr/bin/env python3 | #!/usr/bin/env python3 | ||
Latest revision as of 12:21, 29 June 2020
Python script to check for files with the same content.
#!/usr/bin/env python3
def usage():
    """Print usage and version information to stdout, then exit.

    Raises:
        SystemExit: always, with exit status 1.
    """
    # The space before "<directory>" keeps the placeholder from being glued
    # onto the script path in the printed command line.
    print("Usage: " + __file__ + " <directory>")
    print("Check for duplicate files using their hash")
    version = 'R1A, 19620224, Initial release'
    print("Version: " + version)
    sys.exit(1)
# --- Settings the user may adjust ---
# Number of bytes requested per read() call while a file is being hashed.
BLOCKSIZE = 1 << 16
# --- End of user settings; nothing below this line should need changing ---
import os
import sys
import hashlib
from collections import defaultdict
def main():
    """Report groups of files under ``sys.argv[1]`` that share a content hash.

    Walks the directory tree rooted at the first command-line argument,
    hashes every regular file with ``get_hash`` and prints the paths of each
    group of two or more files whose hashes are equal. Every group is
    followed by the output of ``print('\\n')``.

    If no directory argument is given, ``usage()`` is called, which prints
    help text and exits with status 1.

    Files that cannot be opened or read are reported on stderr and skipped,
    so one unreadable file does not abort the whole scan.
    """
    if len(sys.argv) < 2:
        usage()

    # Maps each content hash to every path that produced it. Dict insertion
    # order makes the order of the reported groups deterministic.
    filedict = defaultdict(list)

    for directory, _subdirs, filenames in os.walk(sys.argv[1]):
        for filename in filenames:
            # os.path.join uses the platform separator and does not double
            # it when the directory already ends with one.
            filepath = os.path.join(directory, filename)
            if not os.path.isfile(filepath):
                continue
            try:
                filehash = get_hash(filepath)
            except OSError as err:
                print("Skipping " + filepath + ": " + str(err),
                      file=sys.stderr)
                continue
            filedict[filehash].append(filepath)

    for paths in filedict.values():
        # A hash seen only once has no duplicate to report.
        if len(paths) < 2:
            continue
        for path in paths:
            print(path)
        print('\n')
def get_hash(filespec):
    """Return the SHA-256 hex digest of the contents of the file *filespec*.

    The file is opened in binary mode and fed to the hasher in pieces of at
    most ``BLOCKSIZE`` bytes, so the whole file is never read in one call.
    """
    digest = hashlib.sha256()
    with open(filespec, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns the
        # empty bytes object, which signals end of file.
        for chunk in iter(lambda: stream.read(BLOCKSIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()
# Run the duplicate scan only when this file is executed as a script, so that
# importing it (for example to reuse get_hash) does not start a scan.
if __name__ == "__main__":
    main()