# dedup/dedup.py

import sys
import cv2
# My Custom Library called delibs.py
import delibs
# coordinates is an OPTIONAL module!!!!!!!!! Feel free to comment it out.
import coordinates

"""
Copyright (c) 2025 by Robert Strutts
License: MIT

Key Optimizations:

Multi-Scale Processing:
First alignment at low resolution (faster)
Final refinement at full resolution (accurate)

Matrix Scaling:
The translation components of the transformation matrix are scaled up
Rotation and scaling components remain the same

Smart Downscaling:
Uses INTER_AREA interpolation which is ideal for size reduction
Maintains aspect ratio

Performance Benefits:
Processing time scales with area, so 4x downscale = ~16x faster initial alignment
Memory usage significantly reduced
"""

# When enabled, GPS points that are not exactly equal are compared by
# distance (see is_same_location) instead of being reported as different.
within_feet_check = True
# Distance threshold, in feet; handle_GPS uses a strict "<" against it.
withinFeet = 10
# Photo file size limits, in bytes. Both are handed to
# delibs.check_file_size_bytes() from main().
too_small = 1 << 10  # 1 KiB
too_large = 10 << 20  # 10 MiB

def not_a_dup():
    """Report that the images are not duplicates and finish the run.

    Prints the verdict and then passes status ``0`` to
    ``delibs.exit_timer``.
    """
    verdict = "👌Not a Duplicate"
    print(verdict)
    delibs.exit_timer(0)

def is_same_location(point1, point2):
    """Compare two GPS points.

    Args:
        point1: First point; ``handle_GPS`` passes ``(latitude, longitude)``.
        point2: Second point, same form as ``point1``.

    Returns:
        ``True`` when the points compare equal. Otherwise, when the
        module-level ``within_feet_check`` flag is on, whatever
        ``coordinates.haversine_distance_feet`` returns for the pair
        (``handle_GPS`` treats that as a distance in feet). Otherwise
        ``False``.
    """
    # Exact match: no distance computation needed.
    if point1 == point2:
        return True

    # Proximity mode: hand back the measured distance, not a boolean.
    if within_feet_check == True:
        return coordinates.haversine_distance_feet(point1, point2)

    return False

def is_same_camera(cam1, cam2):
    """Return the result of comparing the two camera descriptors with ``==``.

    ``handle_GPS`` passes ``(Make, Model)`` tuples for both arguments.
    """
    cameras_match = cam1 == cam2
    return cameras_match

def handle_GPS(location1, location2):
    """Use location and camera metadata to settle the comparison early.

    Args:
        location1: Unpacked as ``(camera_info, latitude, longitude)`` for the
            first image; ``camera_info`` is indexed with ``'Make'`` and
            ``'Model'``.
        location2: Same structure, for the second image.

    Outcomes visible in this function:
        * points compare equal and cameras match -> ``delibs.exit_timer(5)``
        * distance is below ``withinFeet`` and cameras match ->
          ``delibs.exit_timer(6)``
        * cameras differ -> ``not_a_dup()``
        * anything else -> returns ``None`` so the caller keeps going.
    """
    info_a, lat_a, lon_a = location1
    info_b, lat_b, lon_b = location2

    camera1 = (info_a['Make'], info_a['Model'])
    camera2 = (info_b['Make'], info_b['Model'])
    same_cams = is_same_camera(camera1, camera2)

    # is_same_location yields True, False, or a distance; the identity
    # checks keep a float result from being mistaken for a boolean.
    the_location = is_same_location((lat_a, lon_a), (lat_b, lon_b))

    if the_location is True:
        print("Images are both from same exact Location")
        if same_cams:
            print("Cameras are the same.")
            delibs.exit_timer(5)
    elif the_location is False:
        print("Images are from different Locations")
    elif isinstance(the_location, float):
        print(f"Images distance in feet: {the_location:.2f}")
        if the_location < withinFeet:
            print(f"With in requirements of {withinFeet}")
            if same_cams:
                print("Cameras are the same.")
                delibs.exit_timer(6)

    # Photos taken with different cameras are never reported as duplicates.
    if not same_cams:
        print("Different Cameras detected.")
        not_a_dup()

def is_module_imported(module_name):
    """Return True when ``module_name`` is a key of ``sys.modules``."""
    loaded_modules = sys.modules
    return module_name in loaded_modules

def is_same_resolution(w, h, w2, h2):
    """Return True when two images have the same pixel dimensions.

    A 90-degree rotation swaps width and height, so ``(w, h)`` is also
    considered equal to ``(h, w)``.
    """
    return (w, h) in ((w2, h2), (h2, w2))


def main():
    """Command-line entry point: decide whether two image files are duplicates.

    Usage: ``python3 dedup.py file1.jpg file2.jpg [-ansi|-noansi] [-scores]``

    The checks run from cheapest to most expensive: file size, quick file
    hash, GPS/camera metadata (when the ``coordinates`` module is loaded),
    image resolution, and finally image alignment with similarity scoring.

    Status codes passed to ``delibs.exit_timer`` (``sys.exit`` for the usage
    error) by this function:
        0  not a duplicate
        1  identical images (hash match or perfect scores)
        2  near-perfect alignment
        3  usage error, or the second file failed the size check
        4  the second image could not be loaded
        8  the first image could not be loaded
        9  the first file failed the size check
    ``handle_GPS`` may end the run earlier with its own codes.

    NOTE(review): the flow below assumes ``delibs.exit_timer`` terminates
    the process; that is not visible from this file -- confirm.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 dedup.py file1.jpg file2.jpg")
        sys.exit(3)

    if "-noansi" in sys.argv:
        delibs.disable_ansi()
    elif "-ansi" in sys.argv:
        delibs.enable_ansi()

    file1 = sys.argv[1]
    file2 = sys.argv[2]

    with delibs.Timer("Getting File Size"):
        size1 = delibs.check_file_size_bytes(file1, too_small, too_large)
        size2 = delibs.check_file_size_bytes(file2, too_small, too_large)

    # A non-None result is printed as the error for that file.
    if size1 is not None:
        print(f"ERROR: {size1}")
        delibs.exit_timer(9)
    if size2 is not None:
        print(f"ERROR: {size2}")
        delibs.exit_timer(3)  # Mark as Skipped

    with delibs.Timer("Hashing"):
        # Quick hashes
        hash1 = delibs.quick_file_hash(file1)
        hash2 = delibs.quick_file_hash(file2)

    if hash1 == hash2:
        print("xxHash found duplicates")
        print("❌ Perfect match - images are identical - Duplicate Found!")
        delibs.exit_timer(1)
    else:
        print("Done hashing...")

    if is_module_imported('coordinates'):
        print("Using Pillow GPS Coordinates module")
        coordinates1 = coordinates.get_coordinates_from_image(file1)
        if coordinates1 is not None:
            coordinates2 = coordinates.get_coordinates_from_image(file2)
            if coordinates2 is not None:
                handle_GPS(coordinates1, coordinates2)
    else:
        print("Not using Coordinates module")

    with delibs.Timer("Loading Images"):
        # Load large images
        large_img1 = cv2.imread(file1)  # e.g., 4000x3000 pixels
        large_img2 = cv2.imread(file2)  # e.g., 4000x3000 pixels

    w, h = delibs.get_image_dimensions_cv(large_img1)
    w2, h2 = delibs.get_image_dimensions_cv(large_img2)
    if w is None or h is None:
        print("❌Aborting❌...Invalid Image!")
        delibs.exit_timer(8)
    if w2 is None or h2 is None:
        print("❌Aborting❌...Invalid Image!")
        delibs.exit_timer(4)  # Mark as Skipped

    # The previous test chained four "!=" with "and", so it only fired when
    # no dimension coincided at all (4000x3000 vs 4000x2000 slipped through).
    # Resolutions match only when both dimensions agree, possibly swapped.
    if not is_same_resolution(w, h, w2, h2):
        print("Diffent Resolutions")
        not_a_dup()

    print("Done loading images...")
    with delibs.Timer("Module - Aligning with downscaling 1/4 size - Total Time"):
        # Align with downscaling (initially process at 1/4 size)
        aligned, matrix, angle = delibs.align_with_downscaling(
            large_img1, large_img2,
            downscale_factor=4,
            try_common_rotations=True
        )

    # Save result
    # cv2.imwrite('aligned_large.jpg', aligned)

    # Print debug info
    print(f"Detected rotation: {angle}°")
    print(f"Final transformation matrix:\n{matrix}")

    # Calculate scores
    matrix_score = delibs.matrix_similarity_score(matrix)

    is_score = "-scores" in sys.argv

    # Without "-scores", a perfect matrix score is enough to call it.
    if matrix_score == 1.0 and not is_score:
        print("❌ Perfect match score, images should be identical - Duplicate Found!")
        delibs.exit_timer(1)
    # NOTE(review): this early exit only fires when "-scores" IS given, which
    # skips the score report the flag asks for. The condition may be inverted;
    # behavior is kept as-is pending confirmation.
    if matrix_score < 0.3 and is_score:
        print("👌Not a Duplicate, best guess!")
        delibs.exit_timer(0)
    if is_score:
        score = delibs.find_duplicate_with_rotation(large_img1, aligned)
        print(f"Score: {score}")

    decomposed_score = delibs.decomposed_similarity_score(matrix, large_img1.shape[1])
    combined_score = delibs.comprehensive_similarity(large_img1, aligned, matrix)

    # Check for perfect alignment
    if matrix_score == 1.0 and decomposed_score == 1.0 and combined_score == 1.0:
        print("No transformation needed")
        print("❌ Perfect match - images are identical - Duplicate Found!")
        exit_code = 1
    elif matrix_score > 0.9 and decomposed_score > 0.9 and combined_score > 0.7:
        print("✅ Near-perfect alignment - minor differences detected")
        exit_code = 2
    else:
        print("👌Not a Duplicate")
        exit_code = 0

    print(f"Matrix deviation score: {matrix_score:.4f}")
    print(f"Decomposed similarity: {decomposed_score:.4f}")
    print(f"Combined similarity: {combined_score:.4f}")
    delibs.exit_timer(exit_code)

# Script entry point: run the comparison only when executed directly.
if __name__ == "__main__":
    main()

"""
Matrix-based scores are fast but don't consider image content
Decomposed analysis gives more interpretable results (separate rotation/scale/translation)
Combined approaches with pixel comparison are most accurate but slower
Normalization is crucial - translation should be relative to image size
"""