Photo De-Duplication
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
dedup/dedup.py

215 lines
7.0 KiB

import sys
import cv2
# My Custom Library called delibs.py
import delibs
# coordinates is an OPTIONAL module! Feel free to comment it out.
import coordinates
"""
Copyright (c) 2025 by Robert Strutts
License: MIT
Key Optimizations:
Multi-Scale Processing:
First alignment at low resolution (faster)
Final refinement at full resolution (accurate)
Matrix Scaling:
The translation components of the transformation matrix are scaled up
Rotation and scaling components remain the same
Smart Downscaling:
Uses INTER_AREA interpolation which is ideal for size reduction
Maintains aspect ratio
Performance Benefits:
Processing time scales with area, so 4x downscale = ~16x faster initial alignment
Memory usage significantly reduced
"""
# --- GPS proximity settings ---
# When enabled, is_same_location() falls back to a haversine distance
# for points that are not exactly equal.
within_feet_check = True
# Distance threshold compared against that result in handle_GPS().
withinFeet = 10  # < 10 feet

# --- Photo file size limits, in bytes ---
# Both bounds are handed to delibs.check_file_size_bytes() in main().
too_small = 1 * 1024  # 1KB
too_large = 10 * 1024 ** 2  # 10MB
def not_a_dup():
    """Report that the images are not duplicates and finish with code 0.

    Prints the "not a duplicate" message, then calls
    ``delibs.exit_timer(0)``.
    NOTE(review): callers rely on ``exit_timer`` terminating the process;
    confirm against delibs.
    """
    print("👌Not a Duplicate")
    delibs.exit_timer(0)
def is_same_location(point1, point2) -> "bool | float":
    """Compare two GPS points.

    Args:
        point1: ``(latitude, longitude)`` pair of the first image.
        point2: ``(latitude, longitude)`` pair of the second image.

    Returns:
        ``True`` when the two points compare equal. Otherwise, when the
        module-level ``within_feet_check`` switch is on, whatever
        ``coordinates.haversine_distance_feet`` returns (presumably the
        distance in feet as a float -- confirm against the coordinates
        module). ``False`` when the switch is off.

    Callers must distinguish the three outcomes by type, not truthiness:
    a non-zero distance is truthy even though the points differ.
    """
    # Exact match needs no distance computation.
    if point1 == point2:
        return True
    if within_feet_check:
        return coordinates.haversine_distance_feet(point1, point2)
    return False
def is_same_camera(cam1, cam2) -> bool:
    """Return whether two camera descriptors compare equal.

    Args:
        cam1: Descriptor of the first camera; handle_GPS passes a
            ``(Make, Model)`` tuple.
        cam2: Descriptor of the second camera, same form.

    Returns:
        The result of ``cam1 == cam2``.
    """
    return cam1 == cam2
def handle_GPS(location1, location2):
camera_info1, latitude1, longitude1 = location1
camera_info2, latitude2, longitude2 = location2
point1 = (latitude1, longitude1)
point2 = (latitude2, longitude2)
camera1 = (camera_info1['Make'], camera_info1['Model'])
camera2 = (camera_info2['Make'], camera_info2['Model'])
same_cams = is_same_camera(camera1, camera2)
the_location = is_same_location(point1, point2)
match the_location:
case True:
print("Images are both from same exact Location")
if same_cams:
print("Cameras are the same.")
delibs.exit_timer(5)
case False:
print("Images are from different Locations")
case float() if isinstance(the_location, float): # Checks if it's a float
print(f"Images distance in feet: {the_location:.2f}")
if the_location < withinFeet:
print(f"With in requirements of {withinFeet}")
if is_same_camera(camera1, camera2):
print("Cameras are the same.")
delibs.exit_timer(6)
if same_cams == False:
print("Different Cameras detected.")
not_a_dup()
def is_module_imported(module_name: str) -> bool:
    """Return whether ``module_name`` is a key in ``sys.modules``.

    Used by main() to detect whether the optional ``coordinates`` module
    was imported.

    Args:
        module_name: Module name to look up, e.g. ``'coordinates'``.
    """
    return module_name in sys.modules
def main() -> None:
    """Compare two image files and finish with a code describing the result.

    Command line: ``python3 dedup.py file1.jpg file2.jpg [flags]``

    Flags (looked up anywhere in ``sys.argv``):
        -noansi / -ansi   call ``delibs.disable_ansi`` / ``delibs.enable_ansi``.
        -scores           compute and print the extra similarity score.

    Codes passed to ``sys.exit`` / ``delibs.exit_timer`` in this function:
        0  not a duplicate
        1  identical (hash match or perfect alignment scores)
        2  near-perfect alignment
        3  usage error, or second file rejected by the size check
        4  second image could not be read
        8  first image could not be read
        9  first file rejected by the size check
    handle_GPS() may additionally finish with 5 or 6.

    NOTE(review): the original indentation was lost. Which statements sit
    inside each ``with delibs.Timer(...)`` block is reconstructed and only
    affects what the timers measure -- confirm against the original file.
    NOTE(review): the flow assumes ``delibs.exit_timer`` terminates the
    process; confirm against delibs.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 dedup.py file1.jpg file2.jpg")
        sys.exit(3)

    if "-noansi" in sys.argv:
        delibs.disable_ansi()
    elif "-ansi" in sys.argv:
        delibs.enable_ansi()

    file1 = sys.argv[1]
    file2 = sys.argv[2]

    with delibs.Timer("Getting File Size"):
        # A non-None result is an error message for that file.
        size1 = delibs.check_file_size_bytes(file1, too_small, too_large)
        size2 = delibs.check_file_size_bytes(file2, too_small, too_large)
        if size1 is not None:
            print(f"ERROR: {size1}")
            delibs.exit_timer(9)
        if size2 is not None:
            print(f"ERROR: {size2}")
            delibs.exit_timer(3)  # Mark as Skipped

    with delibs.Timer("Hashing"):
        # Quick hashes
        hash1 = delibs.quick_file_hash(file1)
        hash2 = delibs.quick_file_hash(file2)
        if hash1 == hash2:
            print("xxHash found duplicates")
            print("❌ Perfect match - images are identical - Duplicate Found!")
            delibs.exit_timer(1)
        else:
            print("Done hashing...")

    if is_module_imported('coordinates'):
        print("Using Pillow GPS Coordinates module")
        # The GPS shortcut only applies when both images carry coordinates.
        coordinates1 = coordinates.get_coordinates_from_image(file1)
        if coordinates1 is not None:
            coordinates2 = coordinates.get_coordinates_from_image(file2)
            if coordinates2 is not None:
                handle_GPS(coordinates1, coordinates2)
    else:
        print("Not using Coordinates module")

    with delibs.Timer("Loading Images"):
        # Load large images
        large_img1 = cv2.imread(file1)  # e.g., 4000x3000 pixels
        large_img2 = cv2.imread(file2)  # e.g., 4000x3000 pixels
        w, h = delibs.get_image_dimensions_cv(large_img1)
        w2, h2 = delibs.get_image_dimensions_cv(large_img2)
        if w is None or h is None:
            print("❌Aborting❌...Invalid Image!")
            delibs.exit_timer(8)
        if w2 is None or h2 is None:
            print("❌Aborting❌...Invalid Image!")
            delibs.exit_timer(4)  # Mark as Skipped
        # NOTE(review): this only rejects pairs where no dimension matches
        # in either orientation; e.g. 4000x3000 vs 4000x2000 passes.
        # Confirm that leniency is intended.
        if w != w2 and w != h2 and h != h2 and h != w2:
            print("Diffent Resolutions")
            not_a_dup()
        print("Done loading images...")

    with delibs.Timer("Module - Aligning with downscaling 1/4 size - Total Time"):
        # Align with downscaling (initially process at 1/4 size)
        aligned, matrix, angle = delibs.align_with_downscaling(
            large_img1, large_img2,
            downscale_factor=4,
            try_common_rotations=True
        )

    # Save result
    # cv2.imwrite('aligned_large.jpg', aligned)
    # Print debug info
    print(f"Detected rotation: {angle}°")
    print(f"Final transformation matrix:\n{matrix}")

    # Calculate scores
    matrix_score = delibs.matrix_similarity_score(matrix)
    is_score = "-scores" in sys.argv

    if matrix_score == 1.0 and not is_score:
        print("❌ Perfect match score, images should be identical - Duplicate Found!")
        delibs.exit_timer(1)
    # NOTE(review): this early exit fires only WITH -scores, while the one
    # above fires only WITHOUT it; confirm the condition is not inverted.
    if matrix_score < 0.3 and is_score:
        print("👌Not a Duplicate, best guess!")
        delibs.exit_timer(0)
    if is_score:
        score = delibs.find_duplicate_with_rotation(large_img1, aligned)
        print(f"Score: {score}")

    decomposed_score = delibs.decomposed_similarity_score(matrix, large_img1.shape[1])
    combined_score = delibs.comprehensive_similarity(large_img1, aligned, matrix)

    # Check for perfect alignment
    if matrix_score == 1.0 and decomposed_score == 1.0 and combined_score == 1.0:
        print("No transformation needed")
        print("❌ Perfect match - images are identical - Duplicate Found!")
        exit_code = 1
    elif matrix_score > 0.9 and decomposed_score > 0.9 and combined_score > 0.7:
        print("✅ Near-perfect alignment - minor differences detected")
        exit_code = 2
    else:
        print("👌Not a Duplicate")
        exit_code = 0

    print(f"Matrix deviation score: {matrix_score:.4f}")
    print(f"Decomposed similarity: {decomposed_score:.4f}")
    print(f"Combined similarity: {combined_score:.4f}")
    delibs.exit_timer(exit_code)
# --- Script entry point: run the comparison only when executed directly ---
if __name__ == "__main__":
    main()
"""
Matrix-based scores are fast but don't consider image content
Decomposed analysis gives more interpretable results (separate rotation/scale/translation)
Combined approaches with pixel comparison are most accurate but slower
Normalization is crucial - translation should be relative to image size
"""