How to optimize detection of duplicate images using cv2 and hash digests in Python?

I have working code in Python that finds duplicate images, but it takes a long time to execute and to determine which duplicates exist anywhere in the folder hierarchy.

I am using an MD5 hash digest and a per-pixel color diff to check whether two images, out of thousands, are duplicates.

I intend to run it over my whole hard disk, but it is very CPU-intensive and uses a lot of CPU.
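To give a sense of the scale involved (the exact count here is only an illustrative assumption), with around 5,000 images the pairwise loop in the script below performs roughly 5,000 × 4,999 / 2 ≈ 12.5 million hash comparisons, on top of a cv2.imread call for every file during the folder scan.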

Here is the script that does this:

import os
import json
import datetime
import hashlib

import cv2
import numpy as np


def is_duplicate(image_obj_1, image_obj_2):
    # 1) Check if 2 images are equals
    if image_obj_1.shape == image_obj_2.shape:
        difference = cv2.subtract(image_obj_1, image_obj_2)
        b, g, r = cv2.split(difference)
        if cv2.countNonZero(b) == 0 and cv2.countNonZero(g) == 0 and cv2.countNonZero(r) == 0:
            return True
    return False


def generate_image_hash(image_path):
    # MD5 digest of the raw file bytes; used as a cheap first-pass equality check
    with open(image_path, "rb") as image_file:
        return hashlib.md5(image_file.read()).hexdigest()


def get_folder_content(scan_folder_path, images_in_folder):
    def get_images_from_folder():
        try:
            images = np.array([] if images_in_folder is None else images_in_folder)
            for file_name in os.listdir(scan_folder_path):
                file_path = os.path.join(scan_folder_path, file_name)
                file_obj = cv2.imread(file_path)
                if file_obj is not None:
                    images = np.append(
                        images,
                        {
                            'image_name': file_name,
                            'image_path': file_path,
                            'md5_hash_digest': generate_image_hash(file_path)
                        }
                    )
        except (PermissionError, FileNotFoundError) as error:
            pass
        return images

    def get_subfolders():
        try:
            subfolders = [os.path.join(scan_folder_path, content_path)
                          for content_path in os.listdir(scan_folder_path)
                          if os.path.isdir(os.path.join(scan_folder_path, content_path))
                          and not content_path.startswith(('.', '$'))]
        except (PermissionError, FileNotFoundError):
            subfolders = []
        return subfolders

    images_in_folder = get_images_from_folder()
    subfolders_in_folder = get_subfolders()

    if subfolders_in_folder:
        for scan_folder_path in subfolders_in_folder:
            images_in_folder = get_folder_content(scan_folder_path, images_in_folder)
    return images_in_folder



if __name__ == '__main__':
    start_time = datetime.datetime.now()
    scan_folder_path, duplicate_json_path = "<Source Folder>", "<Where to Put Duplicate Image Info>"

    log_file_path = os.path.join(duplicate_json_path, "all_duplicate_images.json")
    if not os.path.exists(duplicate_json_path):
        os.makedirs(duplicate_json_path)
    elif os.path.exists(log_file_path):
        os.remove(log_file_path)

    all_images_in_folder = get_folder_content(scan_folder_path, None)

    first_image_index, duplicate_images_info = 1, []
    for image_obj1 in all_images_in_folder:
        for image_obj2 in all_images_in_folder[first_image_index:]:
            if image_obj1.get('md5_hash_digest') == image_obj2.get('md5_hash_digest'):
                # Only do the expensive per-pixel comparison when the MD5 digests match
                if is_duplicate(cv2.imread(image_obj1['image_path']),
                                cv2.imread(image_obj2['image_path'])):
                    image_obj1['duplicate_image_path'] = image_obj2['image_path']
                    duplicate_images_info.append(image_obj1)
        first_image_index += 1

    with open(log_file_path, "a") as file_obj:
        file_obj.write(json.dumps(duplicate_images_info))
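
For reference, this is the kind of restructuring I have been considering but have not benchmarked (a rough sketch only; the function name find_duplicates_by_hash_groups is a placeholder I made up, and it reuses is_duplicate from the script above). The idea is to group the collected entries by MD5 digest first, so the per-pixel check only runs inside groups that already share a hash, instead of across every pair:

from collections import defaultdict

import cv2


def find_duplicates_by_hash_groups(all_images_in_folder):
    # Bucket the collected entries by MD5 digest
    groups = defaultdict(list)
    for image_info in all_images_in_folder:
        groups[image_info['md5_hash_digest']].append(image_info)

    duplicate_images_info = []
    for same_hash_images in groups.values():
        if len(same_hash_images) < 2:
            continue  # unique hash, nothing to compare against
        # Only pay the cost of reading pixels inside a group with identical hashes
        for index, image_info_1 in enumerate(same_hash_images):
            image_1 = cv2.imread(image_info_1['image_path'])
            if image_1 is None:
                continue
            for image_info_2 in same_hash_images[index + 1:]:
                image_2 = cv2.imread(image_info_2['image_path'])
                if image_2 is not None and is_duplicate(image_1, image_2):
                    image_info_1['duplicate_image_path'] = image_info_2['image_path']
                    duplicate_images_info.append(image_info_1)
    return duplicate_images_info

This would replace the nested loop at the end of the main block, but I am not sure whether it is the right direction, or whether the repeated cv2.imread during the folder scan is the bigger cost, hence the question.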
