"""
"Find the needle from the haystack"
Author : Arun Aniyan
Contact : [email protected]
Version : V.2.0
Takes a set of reference (training) images from one directory and
searches for the same or similar images in another directory.

To run the code:
    python duplicate_finder.py -r <training_directory> -t <test_directory>

The results are written to output.txt in the current directory.
Result format : <reference image>,<duplicate test images ...>
"""
""" Imports """
import argparse
import os
from time import time
import imagehash
from PIL import Image
""" Helper Functions """
# Traverse Directory and get list of files
def traverse_dir(dirname):
fl_list = []
for filename in os.listdir(dirname):
if (
filename.endswith(".jpg")
or filename.endswith(".png")
or filename.endswith(".jpeg")
):
fl_list.append(filename)
else:
continue
return fl_list
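
# Note (assumption about intended behaviour): only lowercase extensions are
# matched above, so files such as "IMG_001.JPG" would be skipped.  A
# case-insensitive variant would be:
#     filename.lower().endswith((".jpg", ".png", ".jpeg"))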
""" Extract pHash and dHash for images"""
# Hasher function
def hasher(basedir, files):
pdb = {}
ddb = {}
pdb = pdb.fromkeys(files)
ddb = ddb.fromkeys(files)
for infile in list(pdb.keys()):
try:
pdb[infile] = imagehash.phash(
Image.open(os.path.join(basedir, infile))
) # pHash
ddb[infile] = imagehash.dhash(
Image.open(os.path.join(basedir, infile))
) # dHash
except:
print(f"Error with file {infile} in {basedir}")
return pdb, ddb
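
# Note (informational sketch, not part of the original script): imagehash
# hash objects compare with == for exact matches, which is what the set
# intersection in find_duplicates() below relies on.  The library also
# supports approximate matching via the Hamming distance between two hashes,
# e.g. (hypothetical filenames, threshold chosen only for illustration):
#
#     h1 = imagehash.phash(Image.open("a.jpg"))
#     h2 = imagehash.phash(Image.open("b.jpg"))
#     if h1 - h2 <= 5:  # small distance -> visually similar images
#         print("near-duplicate")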


# Map hash values back to filenames
def find_key(dic, val):
    return [k for k, v in dic.items() if v == val]


# Find duplicate test images and their reference originals
def find_duplicates(traindb, testdb):
    # Hash values that appear in both dictionaries
    shared_items = list(set(traindb.values()) & set(testdb.values()))
    # Reference images that produced a shared hash
    originals = [find_key(traindb, item) for item in shared_items]
    # Test images that produced the same hash
    duplicates = [find_key(testdb, item) for item in shared_items]
    return originals, duplicates
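
# Example (hypothetical data, for illustration only): with
#   traindb = {"ref1.jpg": hA, "ref2.jpg": hB}
#   testdb  = {"copy1.jpg": hA, "other.jpg": hC}
# the only shared hash is hA, so find_duplicates(traindb, testdb) would
# return ([["ref1.jpg"]], [["copy1.jpg"]]) -- lists of filename lists,
# one entry per shared hash value.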


# Pretty reformatting - build one comma-separated output line from a
# reference filename and its list of duplicate filenames
def reformat(reflist, duplist):
    line = reflist + " " + str(duplist)
    line = line.replace("[", "")
    line = line.replace("]", "")
    line = line.replace("'", "")
    line = line.replace(",", "")
    line = line.replace(" ", ",")
    return line
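
# Example (hypothetical filenames): reformat("ref1.jpg", ["dup1.jpg", "dup2.jpg"])
# returns "ref1.jpg,dup1.jpg,dup2.jpg", i.e. one comma-separated output line.
# Filenames containing spaces or commas would be mangled by the character
# replacements above.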
""" Main """
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument(
"-r",
"--train_directory",
required=True,
help="Location of reference / train images",
)
ap.add_argument(
"-t", "--test_directory", required=True, help="Location of test images"
)
args = vars(ap.parse_args())
refdir = args["train_directory"]
testdir = args["test_directory"]
start_time = time() # Start time for calculations
# Get file list
ref_files = traverse_dir(refdir)
test_files = traverse_dir(testdir)
# Get hashes for test and train
ref_pdb, ref_ddb = hasher(refdir, ref_files)
test_pdb, test_ddb = hasher(testdir, test_files)
# Find duplicates and originals from hashcode
p_originals, p_duplicates = find_duplicates(ref_pdb, test_pdb)
d_originals, d_duplicates = find_duplicates(ref_ddb, test_ddb)

    # Exit if neither hash type found any copies
    if len(p_originals) == 0 and len(d_originals) == 0:
        print("No duplicates found...")
        print(f"Search Time was {time() - start_time} seconds")
        exit(0)

    # Names of unique originals (prefer pHash matches, fall back to dHash)
    originals = p_originals or d_originals
    originals = [str(i[0]) for i in originals]

    # Names of unique duplicate files
    duplicates = p_duplicates or d_duplicates

    end_time = time()  # End time for calculations

    # Save results to output.txt (append mode, opened once)
    with open("output.txt", "a") as text_file:
        for original, duplicate in zip(originals, duplicates):
            text_file.write(reformat(original, duplicate) + "\n")

    print(f"Took {end_time - start_time} seconds for search.")
    print(f"Found duplicates for {len(originals)} images.")
    print("Result written to output.txt")