#!/usr/bin/env python3
"""Find duplicate files under the user's home directory by SHA-256 checksum.

Files smaller than MIN_SIZE_BYTES are ignored, as are well-known cache and
package directories.  Candidates are first bucketed by exact byte size so
that only files whose size collides with another file's are actually hashed.
"""

import hashlib
import mimetypes
import os
from collections import defaultdict
from pathlib import Path

# Configuration
MIN_SIZE_MB = 1
MIN_SIZE_BYTES = MIN_SIZE_MB * 1024 * 1024

# Directories to skip.  Entries may be:
#   - a single directory name:          '.git'
#   - a glob-ish name containing '*':   'tmp.*' (any component containing 'tmp.')
#   - a relative multi-component path:  '.config/llama-swap'
SKIP_DIRS = {
    '.cache', 'node_modules', '.git', '.local', 'tmp', 'tmp.*',
    '.npm', '.cargo', '.ollama', '.config/llama-swap',
    '.local/share/Steam', '.local/share/containers',
    '.local/lib/docker', '.thumbnails',
}


def should_skip(path) -> bool:
    """Return True if *path* falls inside any SKIP_DIRS entry.

    Handles all three entry forms: exact component names, '*' wildcard
    names (substring match on a single component), and multi-component
    relative paths (matched against the full POSIX path, since they can
    never equal a single component).
    """
    parts = path.parts
    posix = path.as_posix()
    for skip in SKIP_DIRS:
        if '*' in skip:
            pattern = skip.replace('*', '')
            if any(pattern in p for p in parts):
                return True
        elif '/' in skip:
            # Multi-component entries like '.config/llama-swap' cannot match
            # a single component; compare against the whole path instead.
            if f'/{skip}/' in posix or posix.endswith(f'/{skip}'):
                return True
        elif skip in parts:
            return True
    return False


def sha256_file(filepath, block_size: int = 65536) -> str:
    """Return the SHA-256 hex digest of *filepath*, read in fixed-size chunks."""
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        # Stream in blocks so arbitrarily large files use constant memory.
        for block in iter(lambda: f.read(block_size), b''):
            hasher.update(block)
    return hasher.hexdigest()


def get_file_type(filepath) -> str:
    """Classify *filepath* into a coarse category from its guessed MIME type.

    Returns one of: 'image', 'video', 'audio', 'pdf', 'archive', 'text',
    or 'other' when the type is unknown or unclassified.
    """
    mime, _ = mimetypes.guess_type(filepath)
    if mime:
        if mime.startswith('image/'):
            return 'image'
        if mime.startswith('video/'):
            return 'video'
        if mime.startswith('audio/'):
            return 'audio'
        if 'pdf' in mime:
            return 'pdf'
        if 'zip' in mime or 'tar' in mime or 'compressed' in mime:
            return 'archive'
        if mime.startswith('text/'):
            return 'text'
    return 'other'


def _collect_by_size(home):
    """Walk *home* and bucket eligible file paths by exact byte size.

    Returns (by_size, total_files, skipped) where by_size maps
    size -> [Path, ...] for files at or above MIN_SIZE_BYTES.
    """
    by_size = defaultdict(list)
    total_files = 0
    skipped = 0
    for root, dirs, files in os.walk(home):
        # Prune in-place so os.walk never descends into skipped trees;
        # uses the same rules as the per-file check for consistency.
        dirs[:] = [d for d in dirs if not should_skip(Path(root) / d)]
        for filename in files:
            filepath = Path(root) / filename
            if should_skip(filepath):
                skipped += 1
                continue
            try:
                size = filepath.stat().st_size
            except OSError:  # PermissionError is a subclass of OSError
                skipped += 1
                continue
            if size >= MIN_SIZE_BYTES:
                by_size[size].append(filepath)
                total_files += 1
    return by_size, total_files, skipped


def _hash_candidates(by_size):
    """Hash only size-colliding files.

    Returns (duplicates, errors) where duplicates maps
    (checksum, size) -> [Path, ...] and errors counts unreadable files.
    """
    duplicates = defaultdict(list)
    errors = 0
    for size, candidates in by_size.items():
        if len(candidates) < 2:
            # A unique size cannot be a duplicate — skip the expensive hash.
            continue
        for filepath in candidates:
            try:
                duplicates[(sha256_file(filepath), size)].append(filepath)
            except OSError:
                errors += 1
    return duplicates, errors


def main():
    """Scan the home directory and print a per-type duplicate report."""
    home = Path.home()

    print(f"🔍 Scanning {home} for duplicates (>{MIN_SIZE_MB}MB)...")
    print(f"   Skipping: {', '.join(sorted(SKIP_DIRS))}\n")

    by_size, total_files, skipped = _collect_by_size(home)
    duplicates, errors = _hash_candidates(by_size)
    skipped += errors

    # Group actual duplicates (2+ paths per checksum) by coarse file type.
    dupes_by_type = defaultdict(list)
    total_dupes = 0
    total_wasted = 0
    for (checksum, size), paths in duplicates.items():
        if len(paths) > 1:
            file_type = get_file_type(paths[0])
            dupes_by_type[file_type].append({
                'checksum': checksum,
                'files': paths,
                # Size carried from the scan phase — no re-stat, so a file
                # deleted mid-run cannot crash the report.
                'size': size,
            })
            total_dupes += len(paths)
            # All copies beyond the first are wasted space.
            total_wasted += size * (len(paths) - 1)

    # Report
    if not dupes_by_type:
        print("✅ No duplicates found!")
        return

    print("📊 Results:")
    print(f"   Files scanned: {total_files}")
    print(f"   Skipped: {skipped}")
    print(f"   Duplicate files: {total_dupes}")
    print(f"   Wasted space: {total_wasted / (1024**3):.2f} GB\n")

    for ftype, groups in sorted(dupes_by_type.items()):
        print(f"📁 {ftype.upper()} ({len(groups)} groups):")
        for group in groups[:5]:  # Limit to 5 groups per type
            print(f"   {len(group['files'])} copies, {group['size'] / (1024**2):.1f} MB each")
            print(f"   → {group['files'][0]}")
            for f in group['files'][1:]:
                print(f"     {f}")
        if len(groups) > 5:
            print(f"   ... and {len(groups) - 5} more groups")
        print()


if __name__ == '__main__':
    main()