From a90c542d9fd31ec680e86bf192524d1ec473d6a0 Mon Sep 17 00:00:00 2001
From: Ferdinand
Date: Mon, 13 Apr 2026 15:59:12 +0200
Subject: [PATCH] feat: detect exact copies via MD5 hash, separate from perceptual duplicates

Co-Authored-By: Claude Sonnet 4.6
---
 analyzer.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/analyzer.py b/analyzer.py
index 73ab5b3..eb4df75 100644
--- a/analyzer.py
+++ b/analyzer.py
@@ -33,6 +33,28 @@ def is_underexposed(path: str, threshold: float = 30.0) -> bool:
     return _mean_brightness(path) < threshold
 
 
+def find_exact_copies(paths: List[str]) -> List[List[str]]:
+    """
+    Find exact copies by MD5 hash (byte-identical files); MD5 is for dedup,
+    not security. The first element of each group is treated as the original.
+    """
+    import hashlib
+
+    hashes: dict = {}
+    for path in paths:
+        try:
+            h = hashlib.md5()
+            with open(path, "rb") as f:
+                for chunk in iter(lambda: f.read(65536), b""):
+                    h.update(chunk)
+            digest = h.hexdigest()
+            hashes.setdefault(digest, []).append(path)
+        except OSError:
+            continue
+
+    return [group for group in hashes.values() if len(group) > 1]
+
+
 def find_duplicates(paths: List[str], threshold: int = 8) -> List[List[str]]:
     """
     Findet Gruppen aehnlicher Bilder via perceptual hashing.
@@ -155,7 +177,16 @@ def analyze_folder(
         except Exception:
             continue
 
-    dup_groups = find_duplicates(paths, dup_threshold)
+    exact_copy_paths: set = set()
+    exact_groups = find_exact_copies(paths)
+    for group in exact_groups:
+        original = os.path.basename(group[0])
+        for copy_path in group[1:]:
+            results[copy_path].append(f"exakte Kopie von {original}")
+            exact_copy_paths.add(copy_path)
+
+    dup_paths = [p for p in paths if p not in exact_copy_paths]
+    dup_groups = find_duplicates(dup_paths, dup_threshold)
     for group in dup_groups:
         original = os.path.basename(group[0])
         for dup_path in group[1:]: