I have working Python code that finds duplicate images anywhere in a folder hierarchy, but it takes a long time to run. To decide whether two images out of thousands are duplicates, I compare their MD5 hash digests and then do a per-pixel color diff. I intend to run it over my whole hard disk, but it is very CPU-intensive and keeps CPU utilization high for a long time.
Here is the script that does this (a few sketches of possible speed-ups follow it):
import datetime
import hashlib
import json
import os

import cv2
def is_duplicate(image_obj_1, image_obj_2):
    # Two images are exact duplicates when they have the same dimensions
    # and every pixel matches in all three BGR channels.
    if image_obj_1.shape == image_obj_2.shape:
        difference = cv2.subtract(image_obj_1, image_obj_2)
        b, g, r = cv2.split(difference)
        if cv2.countNonZero(b) == 0 and cv2.countNonZero(g) == 0 and cv2.countNonZero(r) == 0:
            return True
    return False
def generate_image_hash(image_path):
    # MD5 of the raw file bytes; byte-identical files always share a digest.
    with open(image_path, "rb") as file_obj:
        return hashlib.md5(file_obj.read()).hexdigest()
def get_folder_content(scan_folder_path, images_in_folder):
    def get_images_from_folder():
        images = [] if images_in_folder is None else images_in_folder
        try:
            for file_name in os.listdir(scan_folder_path):
                file_path = os.path.join(scan_folder_path, file_name)
                # cv2.imread returns None for anything that is not a readable image.
                file_obj = cv2.imread(file_path)
                if file_obj is not None:
                    images.append({
                        'image_name': file_name,
                        'image_path': file_path,
                        'md5_hash_digest': generate_image_hash(file_path),
                    })
        except (PermissionError, FileNotFoundError):
            pass  # skip folders and files we cannot read
        return images
    def get_subfolders():
        # Collect sub-folders, skipping hidden ('.') and system ('$') entries.
        try:
            subfolders = [os.path.join(scan_folder_path, content_path)
                          for content_path in os.listdir(scan_folder_path)
                          if os.path.isdir(os.path.join(scan_folder_path, content_path))
                          and not content_path.startswith(('.', '$'))]
        except (PermissionError, FileNotFoundError):
            subfolders = []
        return subfolders
    images_in_folder = get_images_from_folder()
    # Recurse into every sub-folder, accumulating results in the same list.
    for subfolder_path in get_subfolders():
        images_in_folder = get_folder_content(subfolder_path, images_in_folder)
    return images_in_folder
if __name__ == '__main__':
    start_time = datetime.datetime.now()
    scan_folder_path, duplicate_json_path = "<Source Folder>", "<Where to Put Duplicate Image Info>"
    log_file_path = os.path.join(duplicate_json_path, "all_duplicate_images.json")
    if not os.path.exists(duplicate_json_path):
        os.makedirs(duplicate_json_path)
    elif os.path.exists(log_file_path):
        os.remove(log_file_path)
    all_images_in_folder = get_folder_content(scan_folder_path, None)
    first_image_index, duplicate_images_info = 1, []
    # Compare every pair once; the per-pixel check only runs when digests match.
    for image_obj1 in all_images_in_folder:
        for image_obj2 in all_images_in_folder[first_image_index:]:
            if image_obj1.get('md5_hash_digest') == image_obj2.get('md5_hash_digest'):
                if is_duplicate(cv2.imread(image_obj1['image_path']),
                                cv2.imread(image_obj2['image_path'])):
                    image_obj1['duplicate_image_path'] = image_obj2['image_path']
                    duplicate_images_info.append(image_obj1)
        first_image_index += 1
    with open(log_file_path, "a") as file_obj:
        file_obj.write(json.dumps(duplicate_images_info))
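Since two files can only be pixel-identical when their MD5 digests match, one idea for cutting down the quadratic scan is to group the images by digest first and only run the pixel comparison within a group. A minimal sketch of that grouping (it reuses is_duplicate from the script above; group_by_digest and find_duplicate_pairs are hypothetical helpers, not part of my script):

from collections import defaultdict

import cv2

def group_by_digest(images):
    # Bucket image records by digest; images in different buckets can
    # never be duplicates, so those pairs are never compared at all.
    buckets = defaultdict(list)
    for image in images:
        buckets[image['md5_hash_digest']].append(image)
    return buckets

def find_duplicate_pairs(images):
    duplicates = []
    for group in group_by_digest(images).values():
        if len(group) < 2:
            continue  # a unique digest has nothing to compare against
        for index, first in enumerate(group):
            for second in group[index + 1:]:
                if is_duplicate(cv2.imread(first['image_path']),
                                cv2.imread(second['image_path'])):
                    duplicates.append({**first,
                                       'duplicate_image_path': second['image_path']})
    return duplicates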
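The recursive get_folder_content could also be expressed with os.walk, which handles the hierarchy traversal and makes the hidden/system-folder filtering explicit. A sketch under the same assumptions as above (it reuses generate_image_hash; collect_images is an illustrative name):

import os

import cv2

def collect_images(root_folder):
    images = []
    for folder, subfolders, file_names in os.walk(root_folder):
        # Prune hidden ('.') and system ('$') folders in place so
        # os.walk never descends into them.
        subfolders[:] = [name for name in subfolders
                         if not name.startswith(('.', '$'))]
        for file_name in file_names:
            file_path = os.path.join(folder, file_name)
            try:
                if cv2.imread(file_path) is not None:  # readable as an image
                    images.append({
                        'image_name': file_name,
                        'image_path': file_path,
                        'md5_hash_digest': generate_image_hash(file_path),
                    })
            except (PermissionError, FileNotFoundError):
                continue  # skip unreadable files
    return images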
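And since every file is hashed independently, the hashing step could in principle be spread across cores with multiprocessing. A sketch of that pattern, with placeholder paths in the style of the script above (hash_one is an illustrative helper, not something from my script):

import hashlib
from multiprocessing import Pool

def hash_one(file_path):
    # Runs in a worker process; returns (path, digest) so the results
    # can be collected straight into a dict.
    with open(file_path, "rb") as file_obj:
        return file_path, hashlib.md5(file_obj.read()).hexdigest()

if __name__ == '__main__':
    image_paths = ["<image 1>", "<image 2>"]  # replace with the gathered 'image_path' values
    with Pool() as pool:
        digest_by_path = dict(pool.map(hash_one, image_paths))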