feat: detect exact copies via MD5 hash, separate from perceptual duplicates
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
33
analyzer.py
33
analyzer.py
@@ -33,6 +33,28 @@ def is_underexposed(path: str, threshold: float = 30.0) -> bool:
|
|||||||
return _mean_brightness(path) < threshold
|
return _mean_brightness(path) < threshold
|
||||||
|
|
||||||
|
|
||||||
|
def find_exact_copies(paths: List[str]) -> List[List[str]]:
    """
    Group files whose contents produce the same MD5 digest.

    Each file is read in binary mode in 64 KiB chunks, so large files are
    never loaded into memory at once. Files that cannot be opened or read
    are skipped silently (best effort).

    A path that appears more than once in ``paths`` is only considered the
    first time; otherwise a file would be reported as a copy of itself.

    Args:
        paths: File paths to compare.

    Returns:
        One list per digest shared by at least two distinct paths. Within a
        group the paths keep their order from ``paths``; the first element
        is treated as the original and the remaining ones as its copies.
        Groups are ordered by the position of their first element.
    """
    import hashlib

    hashes: dict = {}
    seen: set = set()
    for path in paths:
        # Hashing the same path twice would always "match" itself.
        if path in seen:
            continue
        seen.add(path)

        try:
            h = hashlib.md5()
            with open(path, "rb") as f:
                for chunk in iter(lambda: f.read(65536), b""):
                    h.update(chunk)
        except (OSError, ValueError):
            # OSError: missing, unreadable or otherwise inaccessible file.
            # ValueError: e.g. a path containing an embedded NUL byte.
            continue

        hashes.setdefault(h.hexdigest(), []).append(path)

    # Only digests shared by two or more files describe copies.
    return [group for group in hashes.values() if len(group) > 1]
|
||||||
|
|
||||||
|
|
||||||
def find_duplicates(paths: List[str], threshold: int = 8) -> List[List[str]]:
|
def find_duplicates(paths: List[str], threshold: int = 8) -> List[List[str]]:
|
||||||
"""
|
"""
|
||||||
Findet Gruppen aehnlicher Bilder via perceptual hashing.
|
Findet Gruppen aehnlicher Bilder via perceptual hashing.
|
||||||
@@ -155,7 +177,16 @@ def analyze_folder(
|
|||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
dup_groups = find_duplicates(paths, dup_threshold)
|
exact_copy_paths: set = set()
|
||||||
|
exact_groups = find_exact_copies(paths)
|
||||||
|
for group in exact_groups:
|
||||||
|
original = os.path.basename(group[0])
|
||||||
|
for copy_path in group[1:]:
|
||||||
|
results[copy_path].append(f"exakte Kopie von {original}")
|
||||||
|
exact_copy_paths.add(copy_path)
|
||||||
|
|
||||||
|
dup_paths = [p for p in paths if p not in exact_copy_paths]
|
||||||
|
dup_groups = find_duplicates(dup_paths, dup_threshold)
|
||||||
for group in dup_groups:
|
for group in dup_groups:
|
||||||
original = os.path.basename(group[0])
|
original = os.path.basename(group[0])
|
||||||
for dup_path in group[1:]:
|
for dup_path in group[1:]:
|
||||||
|
|||||||
Reference in New Issue
Block a user