diff --git a/.gitignore b/.gitignore index 79399a2..fc9ec1d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ myenv __pycache__ -dups.txt -alike.txt -invalid.txt +*.txt diff --git a/BadImageFormat.jpg b/BadImageFormat.jpg new file mode 100644 index 0000000..26380bf Binary files /dev/null and b/BadImageFormat.jpg differ diff --git a/README.md b/README.md index fda3fc9..6573a14 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,6 @@ python dedup.py 0.jpg 1.jpg scores ``` ### Files made by get_dups Scripts: -error level: 0 = NOT a Dup, 1 = Duplicate, 2 = Close Match, 5 = Same GPS GEO-location, 8 = Invalid Image. Possible files: dups.txt, alike.txt, sameGPS.txt, and invalid.txt +error level: 0 = NOT a Dup, 1 = Duplicate, 2 = Close Match, 4 = Skipped (second image invalid or too small/big), 5 = Same GPS GEO-location, 8 = Invalid Image, 9 = File Too small/big. Possible files: dups.txt, alike.txt, sameGPS.txt, invalid.txt, size.txt. [![Image of ScreenShot](Screenshot2025-04-26.png)] diff --git a/TooSmall.jpg b/TooSmall.jpg new file mode 100644 index 0000000..b51ea21 --- /dev/null +++ b/TooSmall.jpg @@ -0,0 +1 @@ +adsfkjnhdsfljdkfsljdsklfdsjflds diff --git a/dedup.py b/dedup.py index f0038f8..762a173 100644 --- a/dedup.py +++ b/dedup.py @@ -29,6 +29,8 @@ Memory usage significantly reduced """ within_one_mile_check = True +too_small = 1024 # 1KB +too_large = 10 * 1024 * 1024 # 10MB def handle_GPS(location1, location2): camera_info1, latitude1, longitude1 = location1 @@ -71,6 +73,17 @@ def main(): file1 = sys.argv[1] file2 = sys.argv[2] + with delibs.Timer("Getting File Size"): + size1 = delibs.check_file_size_bytes(file1, too_small, too_large) + size2 = delibs.check_file_size_bytes(file2, too_small, too_large) + + if size1 is not None: + print(f"ERROR: {size1}") + delibs.exit_timer(9) # First file failed the size check + if size2 is not None: + print(f"ERROR: {size2}") + delibs.exit_timer(4) # Mark as Skipped + with delibs.Timer("Hashing"): # Quick hashes hash1 = delibs.quick_file_hash(file1) @@ -102,15 +115,14 @@ def main(): w, h = 
delibs.get_image_dimensions_cv(large_img1) w2, h2 = delibs.get_image_dimensions_cv(large_img2) - if w == None or w2 == None or h == None or h2 == None: + if w is None or h is None: print("❌Aborting❌...Invalid Image!") delibs.exit_timer(8) - if w != w2 and w != h2: - print("Diffent Resolutions") - print("👌Not a Duplicate") - delibs.exit_timer(0) - - if h != h2 and h != w2: + if w2 is None or h2 is None: + print("❌Aborting❌...Invalid Image!") + delibs.exit_timer(4) # Mark as Skipped + + if (w, h) != (w2, h2) and (w, h) != (h2, w2): # Neither the same nor the swapped (rotated) dimensions print("Diffent Resolutions") print("👌Not a Duplicate") delibs.exit_timer(0) diff --git a/delibs.py b/delibs.py index 93cbd6a..e923207 100644 --- a/delibs.py +++ b/delibs.py @@ -392,7 +392,24 @@ def get_image_dimensions_cv(img): if img is not None: height, width = img.shape[:2] return width, height - return None, None + return None, None + +def check_file_size_bytes(file_path, too_small, too_large): + """Return an error message if the file is missing, unreadable, or its size is outside [too_small, too_large] bytes; otherwise return None.""" + try: + file_size = os.path.getsize(file_path) + + if file_size < too_small: + return f"File is ({file_size} bytes) must be at least {too_small}" + elif file_size > too_large: + return f"File is ({file_size} bytes) must be at most {too_large}" + else: + return None + + except FileNotFoundError: + return "File not found" + except Exception as e: + return f"Error checking file size: {e}" """ xxhash is about 5–10x faster than SHA256, non-cryptographic. diff --git a/get_dups.bat b/get_dups.bat index 3f9a0a4..3360d4b 100644 --- a/get_dups.bat +++ b/get_dups.bat @@ -46,6 +46,9 @@ for /l %%i in (1,1,%count%) do ( echo %~1\!outer_image!>> alike.txt goto :break_inner ) + if !errorlevel! equ 4 ( + goto :break_inner + ) if !errorlevel! equ 5 ( echo %~1\!outer_image!>> sameGPS.txt goto :break_inner @@ -54,11 +57,14 @@ for /l %%i in (1,1,%count%) do ( echo %~1\!outer_image!>> sameGPSmile.txt goto :break_inner ) - if !errorlevel! equ 8 ( echo %~1\!outer_image!>> invalid.txt goto :break_inner ) + if !errorlevel! 
equ 9 ( + echo %~1\!outer_image!>> size.txt + goto :break_inner + ) ) :break_inner ) diff --git a/get_dups.sh b/get_dups.sh index a49f73e..07a4e94 100755 --- a/get_dups.sh +++ b/get_dups.sh @@ -29,7 +29,7 @@ for ((i = 0; i < ${#images[@]}; i++)); do for ((j = i + 1; j < ${#images[@]}; j++)); do inner_image="${images[$j]}" - echo -e "Compairing files: $outer_image TO $inner_image \n" + echo -e "\nCompairing files: $outer_image TO $inner_image" python3 dedup.py "$1/$outer_image" "$1/$inner_image" "$2" exit_code=$? if [ $exit_code -eq 1 ]; then @@ -40,6 +40,9 @@ for ((i = 0; i < ${#images[@]}; i++)); do echo "$1/$outer_image # $inner_image" >> alike.txt break # No need to check more once found close match to duplicate fi + if [ $exit_code -eq 4 ]; then + continue # Skip invalid inner image; keep comparing the outer image against the rest + fi if [ $exit_code -eq 5 ]; then echo "$1/$outer_image # $inner_image" >> sameGPS.txt break # No need to check more once found matching GPS image @@ -48,11 +51,14 @@ for ((i = 0; i < ${#images[@]}; i++)); do echo "$1/$outer_image # $inner_image" >> sameGPSmile.txt break # No need to check more once found matching GPS image fi - if [ $exit_code -eq 8 ]; then - echo "$1/$outer_image # $inner_image" >> invalid.txt + echo "$1/$outer_image" >> invalid.txt break # No need to check more once found bad image fi + if [ $exit_code -eq 9 ]; then + echo "$1/$outer_image" >> size.txt + break # No need to check more once found image too Small or Large + fi done done