-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimagehash_module_image_comparison.py
237 lines (202 loc) · 11.2 KB
/
imagehash_module_image_comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/local/bin/python3
##################################################################################
# “AS-IS” Clause
#
# Except as represented in this agreement, all work produced by Developer is
# provided “AS IS”. Other than as provided in this agreement, Developer makes no
# other warranties, express or implied, and hereby disclaims all implied warranties,
# including any warranty of merchantability and warranty of fitness for a particular
# purpose.
##################################################################################
##################################################################################
#
# Date Completed: July 24, 2019
# Author: John Bumgarner
#
# Date Revised: September 27, 2020
# Revised by: John Bumgarner
#
# This Python script is designed to use the ImageHash module to calculate the
# variance difference between two images. If these images have similarity
# score of 0 then the images are identical. Similar images will similarity scores
# that fall within a specific threshold range and images that are dissimilar will
# have scores that fall outside this threshold.
#
##################################################################################
#############################################################################################
# The OS module in provides functions for interacting with the operating system.
#
# OS.walk() generate the file names in a directory tree by walking the tree.
#############################################################################################
import os
from os import walk
#############################################################################################
# The pandas module is used to generate DataFrames containing the hashing results for each
# algorithm.
#############################################################################################
import pandas as pd
######################################################################################
# The Python module Pillow is the folk of PIL, the Python Imaging Library
# reference: https://pillow.readthedocs.io/en/3.0.x/index.html
######################################################################################
# This module is used to load images
from PIL import Image
#######################################################################################################################
# This Python module was developed by Johannes Bucher
# source: https://github.com/JohannesBuchner/imagehash
#
# The module has 4 hashing methods:
#
# 1. aHash - average hash, for each of the pixels output 1 if the pixel is bigger or equal to the average and
# 0 otherwise.
#
# 2. pHash - perceptive hash, does the same as aHash, but first it does a Discrete Cosine Transformation
#
# 3. dHash - gradient hash, calculate the difference for each of the pixel and compares the difference with the
# average differences.
#
# 4. wavelet - wavelet hashing, works in the frequency domain as pHash but it uses Discrete Wavelet Transformation
# (DWT) instead of DCT
#
# aHash, pHash and dHash all use the same approach:
# 1. Scale an image into a grayscale 8x8 image
# 2. Performs some calculations for each of these 64 pixels and assigns a binary 1 or 0 value. These 64 bits form
# the output of algorithm
#
#######################################################################################################################
import imagehash
def get_image_files(directory_of_images):
"""
This function is designed to traverse a directory tree and extract all
the image names contained in the directory.
:param directory_of_images: the name of the target directory containing
the images to be trained on.
:return: list of images to be processed.
"""
images_to_process = []
accepted_extensions = ('.bmp', '.gif', '.jpg', '.jpeg', '.png', '.svg', '.tiff')
for (dirpath, dirnames, filenames) in walk(directory_of_images):
for filename in filenames:
if filename.endswith(accepted_extensions):
images_to_process.append(os.path.join(dirpath, filename))
return images_to_process
def compute_average_hash_computational_score(base_image, comparison_image):
"""
Calculates the average hash (aHash) for 2 given images. The aHash (also called Mean Hash) algorithm
crunches the an image into a grayscale 8x8 image and sets the 64 bits in the hash based on whether
the pixel's value is greater than the average color for the image.
:param base_image: primary image to evaluate against the comparison images
:param comparison_image: images used in the aHash comparison process
:return: computational score and image names
"""
# load the images and compute their average hashes
hash0 = imagehash.average_hash(Image.open(base_image))
hash1 = imagehash.average_hash(Image.open(comparison_image))
# Methods to generate computational score:
#
# Method 1: computational_score = (hash0 - hash1) / len(hash0.hash) **2
# this method generate a computational_score as a percentage, such as 0.125000
# this percentage can be multiplied (.125000 x 100) to obtain the value of 12.50%
# which can be rounded to 13%, which is the average value per hash bit.
#
# Method 2: computational_score = (hash0 - hash1)
# this method generate a computational_score as an int, such as 8.
#
computational_score = (hash0 - hash1)
return computational_score
def compute_perception_hash_computational_score(base_image, comparison_image):
"""
Calculates the perception hashing (pHash) for 2 given images. The phash algorithm is similar to aHash algorithm
but use a discrete cosine transform (DCT) and compares based on frequencies rather than color values.
:param base_image: primary image to evaluate against the comparison images
:param comparison_image: images used in the pHash comparison process
:return: computational score and image names
"""
# load the images and compute their perception hashes
hash0 = imagehash.phash(Image.open(base_image))
hash1 = imagehash.phash(Image.open(comparison_image))
# Methods to generate computational score:
#
# Method 1: computational_score = (hash0 - hash1) / len(hash0.hash) **2
# this method generate a computational_score as a percentage, such as 0.125000
# this percentage can be multiplied (.125000 x 100) to obtain the value of 12.50%
# which can be rounded to 13%, which is the perception value per hash bit.
#
# Method 2: computational_score = (hash0 - hash1)
# this method generate a computational_score as an int, such as 8.
#
computational_score = (hash0 - hash1)
return computational_score
def compute_difference_hash_computational_score(base_image, comparison_image):
"""
Calculates the difference hashing (dHash) for 2 given images. The dHash algorithm is nearly identical to aHash, but it
tracks gradients instead of evaluating frequency patterns like aHash.
:param base_image: primary image to evaluate against the comparison images
:param comparison_image: images used in the dHash comparison process
:return: computational score and image names
"""
# load the images and compute their difference hashes
hash0 = imagehash.dhash(Image.open(base_image))
hash1 = imagehash.dhash(Image.open(comparison_image))
# Methods to generate computational score:
#
# Method 1: computational_score = (hash0 - hash1) / len(hash0.hash) **2
# this method generate a computational_score as a percentage, such as 0.125000
# this percentage can be multiplied (.125000 x 100) to obtain the value of 12.50%
# which can be rounded to 13%, which is the difference value per hash bit.
#
# Method 2: computational_score = (hash0 - hash1)
# this method generate a computational_score as an int, such as 8.
#
computational_score = (hash0 - hash1)
return computational_score
def compute_wavelet_hash_computational_score(base_image, comparison_image):
"""
Calculates the wavelet hashing (wHash) for 2 given images. The wHash algorithm uses Discrete Wavelet Transform to transform image
pixels into wavelets, which are then used for wavelet-based compression and coding.
:param base_image: primary image to evaluate against the comparison images
:param comparison_image: images used in the (wHash comparison process
:return: computational score and image names
"""
hash0 = imagehash.whash(Image.open(base_image))
hash1 = imagehash.whash(Image.open(comparison_image))
# Methods to generate computational score:
#
# Method 1: computational_score = (hash0 - hash1) / len(hash0.hash) **2
# this method generate a computational_score as a percentage, such as 0.125000
# this percentage can be multiplied (.125000 x 100) to obtain the value of 12.50%
# which can be rounded to 13%, which is the wavelet value per hash bit.
#
# Method 2: computational_score = (hash0 - hash1)
# this method generate a computational_score as an int, such as 8.
#
computational_score = (hash0 - hash1)
return computational_score
# pandas DataFrames used for the algorithm hashing results
df_average_hash = pd.DataFrame(columns=['base_image', 'comparison_image', 'similarity score'])
df_perception_hash = pd.DataFrame(columns=['base_image', 'comparison_image', 'similarity score'])
df_difference_hash = pd.DataFrame(columns=['base_image', 'comparison_image', 'similarity score'])
df_wavelet_hash = pd.DataFrame(columns=['base_image', 'comparison_image', 'similarity score'])
# options to disable certain display limits
pd.options.display.max_columns = None
pd.options.display.max_rows = None
target_image = 'jennifer_aniston.jpeg'
image_directory = 'female_headshots_with_earrings'
images = get_image_files(image_directory)
for image in images:
ahash_result = compute_average_hash_computational_score(target_image, image)
df_average_hash = df_average_hash.append({'base_image': target_image, 'comparison_image': image.split('/')[1],
'similarity score': ahash_result}, ignore_index=True)
phash_result = compute_perception_hash_computational_score(target_image, image)
df_perception_hash = df_perception_hash.append({'base_image': target_image, 'comparison_image': image.split('/')[1],
'similarity score': phash_result}, ignore_index=True)
dhash_result = compute_difference_hash_computational_score(target_image, image)
df_difference_hash = df_difference_hash.append({'base_image': target_image, 'comparison_image': image.split('/')[1],
'similarity score': dhash_result}, ignore_index=True)
whash_result = compute_wavelet_hash_computational_score(target_image, image)
df_wavelet_hash = df_wavelet_hash.append({'base_image': target_image, 'comparison_image': image.split('/')[1],
'similarity score': whash_result}, ignore_index=True)
# Display the results in a pandas DataFrame, which is sorted by similarity score
# the index numbers are removed from this output
final_df = df_difference_hash.sort_values(by=['similarity score'], ascending=True)
print(final_df.to_string(index=False))