
I have working Python code that finds duplicate images, but it takes a long time to scan the whole folder hierarchy and determine which images are duplicates.

I am using an MD5 hash digest plus a per-pixel color diff to check whether two images, out of thousands, are duplicates.

I intend to run it across my entire hard disk, but it is very CPU-intensive and keeps CPU utilization high.

Here is the script that does this:

import datetime
import hashlib
import json
import os

import cv2


def is_duplicate(image_obj_1, image_obj_2):
    # Two images are duplicates only if they have the same shape and zero per-pixel difference
    if image_obj_1.shape == image_obj_2.shape:
        difference = cv2.subtract(image_obj_1, image_obj_2)
        b, g, r = cv2.split(difference)
        if cv2.countNonZero(b) == 0 and cv2.countNonZero(g) == 0 and cv2.countNonZero(r) == 0:
            return True
    return False


def generate_image_hash(image_path):
    # Hash the encoded file bytes; identical digests mean identical files on disk.
    with open(image_path, "rb") as image_file:
        return hashlib.md5(image_file.read()).hexdigest()


def get_folder_content(scan_folder_path, images_in_folder):
    def get_images_from_folder():
        # Collect a record for every readable image directly inside this folder.
        images = [] if images_in_folder is None else list(images_in_folder)
        try:
            for file_name in os.listdir(scan_folder_path):
                file_path = os.path.join(scan_folder_path, file_name)
                if cv2.imread(file_path) is not None:
                    images.append({
                        'image_name': file_name,
                        'image_path': file_path,
                        'md5_hash_digest': generate_image_hash(file_path)
                    })
        except (PermissionError, FileNotFoundError):
            pass
        return images

    def get_subfolders():
        # List immediate subfolders, skipping hidden and system folders.
        try:
            subfolders = [
                os.path.join(scan_folder_path, content_path)
                for content_path in os.listdir(scan_folder_path)
                if os.path.isdir(os.path.join(scan_folder_path, content_path))
                and not content_path.startswith(('.', '$'))
            ]
        except (PermissionError, FileNotFoundError):
            subfolders = []
        return subfolders

    images_in_folder = get_images_from_folder()

    # Recurse into each subfolder, accumulating the records found so far.
    for subfolder_path in get_subfolders():
        images_in_folder = get_folder_content(subfolder_path, images_in_folder)
    return images_in_folder



if __name__ == '__main__':
    start_time = datetime.datetime.now()
    scan_folder_path, duplicate_json_path = "<Source Folder>", "<Where to Put Duplicate Image Info>"

    log_file_path = os.path.join(duplicate_json_path, "all_duplicate_images.json")
    if not os.path.exists(duplicate_json_path):
        os.makedirs(duplicate_json_path)
    elif os.path.exists(log_file_path):
        os.remove(log_file_path)

    all_images_in_folder = get_folder_content(scan_folder_path, None)

    # Compare every pair once; the cheap digest check gates the expensive pixel diff.
    first_image_index, duplicate_images_info = 1, []
    for image_obj1 in all_images_in_folder:
        for image_obj2 in all_images_in_folder[first_image_index:]:
            if image_obj1.get('md5_hash_digest') == image_obj2.get('md5_hash_digest'):
                if is_duplicate(cv2.imread(image_obj1['image_path']),
                                cv2.imread(image_obj2['image_path'])):
                    image_obj1['duplicate_image_path'] = image_obj2['image_path']
                    duplicate_images_info.append(image_obj1)
        first_image_index += 1

    with open(log_file_path, "w") as log_file:
        json.dump(duplicate_images_info, log_file)
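
The script already treats a matching MD5 digest as a precondition for the per-pixel check, so the pairwise loop can instead group the collected records by digest and only compare files within the same group. That avoids the quadratic number of digest comparisons, while the expensive pixel diff still runs only on files whose digests collide. Below is a minimal sketch of that idea; the function name find_duplicates_by_hash_group is made up for illustration, and it reuses is_duplicate and the record dictionaries produced by get_folder_content:

from collections import defaultdict

import cv2


def find_duplicates_by_hash_group(all_images):
    # Bucket the image records by their MD5 digest; only files sharing a digest
    # can be byte-identical, so only they need the per-pixel confirmation.
    groups = defaultdict(list)
    for record in all_images:
        groups[record['md5_hash_digest']].append(record)

    duplicates = []
    for records in groups.values():
        if len(records) < 2:
            continue
        for index, record_1 in enumerate(records):
            for record_2 in records[index + 1:]:
                image_1 = cv2.imread(record_1['image_path'])
                image_2 = cv2.imread(record_2['image_path'])
                if image_1 is not None and image_2 is not None \
                        and is_duplicate(image_1, image_2):
                    duplicates.append({**record_1,
                                       'duplicate_image_path': record_2['image_path']})
    return duplicates

It would be called in place of the nested loop, e.g. duplicate_images_info = find_duplicates_by_hash_group(all_images_in_folder).
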
    learn about https://en.wikipedia.org/wiki/Content-based_image_retrieval because your problem can't be solved without the required background knowledge -- your current approach will always be O(n^2) with a huge constant factor because you compare all pixels of any image pair. this is not how it's done. -- cryptographic hashes on the encoded file only give you exact file matches, not content matches (where files may differ) -- the cryptographic hashing gains you nothing at all. you're still going to have to compare per-pixel against all non-matches – Christoph Rackwitz Jun 11 '23 at 15:41
  • I have reduced this significantly now. Firstly, we can use os.walk to walk over the directory and its contents, which removes the recursion. And the per-pixel check needs to stay, to be certain that two images really are duplicates. – Manish Kumar Jun 12 '23 at 11:28
  • I tried using Pandas, but when you get into the difference between each pixel of each file, it gets messy. Any better solution would be appreciated. – Manish Kumar Jun 12 '23 at 11:33
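
As mentioned in the comments, os.walk can replace the recursive get_folder_content. A minimal sketch under that assumption follows; collect_images is a hypothetical name, the filter for folder names starting with '.' or '$' mirrors the original, and generate_image_hash is reused from the question:

import os

import cv2


def collect_images(scan_folder_path):
    # Walk the whole tree iteratively instead of recursing per folder.
    images = []
    for root, dirs, files in os.walk(scan_folder_path):
        # Prune hidden/system folders in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if not d.startswith(('.', '$'))]
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                if cv2.imread(file_path) is None:
                    continue  # not a readable image
                images.append({
                    'image_name': file_name,
                    'image_path': file_path,
                    'md5_hash_digest': generate_image_hash(file_path),
                })
            except (PermissionError, FileNotFoundError):
                continue
    return images

This keeps the traversal to a single loop with no recursion, which is what the comment above suggests.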

0 Answers