# duplicate_remover.py
# Remove duplicate files from a folder by comparing MD5 hashes of their contents.
import os
from hashlib import md5
import time
# generate hash value for file
def file_hash(filepath):
    """
    Return the MD5 hex digest of a file's contents.

    The file is read in fixed-size chunks so that large files are hashed
    without being loaded into memory all at once. MD5 is used here only as
    a content fingerprint for spotting identical files, not for security.

    Parameters:
        filepath: string of a file path
    Returns:
        string: hexadecimal MD5 digest of the file's bytes
    Raises:
        OSError: if the file cannot be opened or read
    """
    digest = md5()
    with open(filepath, 'rb') as f:
        # 1 MiB blocks keep I/O efficient while bounding memory use;
        # iter() stops when read() returns b'' at end of file.
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()
# delete all duplicate files
def remove_duplicates(path):
    """
    Delete every file in a folder whose contents duplicate an earlier file.

    Only regular files directly inside the folder are considered; other
    entries (such as sub-directories) are skipped and not descended into.
    Files are compared by the value returned from file_hash(). Within each
    group of identical files, the first one encountered in os.listdir()
    order is kept and the rest are deleted.

    Parameters:
        path: string path of the folder to scan
    Returns:
        None; progress is reported with print()
    Raises:
        OSError: if the folder cannot be listed, or a file cannot be
            read or removed
    """
    print(f"Scanning path {path}")
    files_list = os.listdir(path)
    print(f"{len(files_list)} files found")

    seen_hashes = set()
    duplicates = []
    for filename in files_list:
        # os.path.join uses the platform's separator, so this works on
        # Windows and POSIX alike (a literal "\\" only works on Windows).
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            continue
        filehash = file_hash(filepath)
        if filehash in seen_hashes:  # a duplicate file
            duplicates.append(filename)
        else:  # a novel file
            seen_hashes.add(filehash)

    # Deletion is deferred until the scan has finished, so an error while
    # hashing leaves the folder untouched.
    for filename in duplicates:
        os.remove(os.path.join(path, filename))
        print(f"File {filename} removed successfully")
    print(f"All {len(duplicates)} duplicates have been removed")
# edit path below
PATH = r"E:\dataset15\St Paul's Cathedral Melbourne"

if __name__ == "__main__":
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump if the system clock is
    # adjusted while the scan is running.
    starttime = time.perf_counter()
    remove_duplicates(PATH)
    elapsed = time.perf_counter() - starttime
    print(f"\nTotal time: {elapsed:f} seconds.")