#!/usr/bin/env python3
"""Find duplicate files by checksum."""
import hashlib
import os
from pathlib import Path
from collections import defaultdict
import mimetypes

# Configuration
MIN_SIZE_MB = 1
MIN_SIZE_BYTES = MIN_SIZE_MB * 1024 * 1024
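# Only files at or above this size are hashed; raising MIN_SIZE_MB speeds
# up scans at the cost of missing smaller duplicates.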

# Directories to skip
SKIP_DIRS = {
    '.cache', 'node_modules', '.git', '.local', 'tmp', 'tmp.*',
    '.npm', '.cargo', '.ollama', '.config/llama-swap',
    '.local/share/Steam', '.local/share/containers',
    '.local/lib/docker', '.thumbnails'
}
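# Entries take three forms: bare names match a single path component,
# entries containing '/' match a multi-component sub-path, and entries
# containing '*' are treated as crude substring patterns (see should_skip).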

def should_skip(path):
    """Check if path should be skipped."""
    parts = path.parts
    for skip in SKIP_DIRS:
        if '*' in skip:
            # Crude pattern: 'tmp.*' matches any component containing 'tmp.'
            pattern = skip.replace('*', '')
            if any(pattern in p for p in parts):
                return True
        elif '/' in skip:
            # Multi-component entries like '.local/share/Steam' never appear
            # as a single path component, so match them against the whole path.
            if skip in path.as_posix():
                return True
        elif skip in parts:
            return True
    return False
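
# should_skip() is the precise per-file check; main() additionally prunes
# matching directory names during the walk so os.walk never descends into them.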

def sha256_file(filepath, block_size=65536):
    """Calculate SHA256 checksum of a file."""
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        # Stream in 64 KiB blocks so large files never have to fit in memory.
        for block in iter(lambda: f.read(block_size), b''):
            hasher.update(block)
    return hasher.hexdigest()

def get_file_type(filepath):
    """Guess file type from extension."""
    mime, _ = mimetypes.guess_type(filepath)
    if mime:
        if mime.startswith('image/'):
            return 'image'
        elif mime.startswith('video/'):
            return 'video'
        elif mime.startswith('audio/'):
            return 'audio'
        elif 'pdf' in mime:
            return 'pdf'
        elif 'zip' in mime or 'tar' in mime or 'compressed' in mime:
            return 'archive'
        elif mime.startswith('text/'):
            return 'text'
    return 'other'
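
# Note: mimetypes guesses from the file extension alone, so files with
# missing or unusual extensions are reported under 'other'.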

def main():
    home = Path.home()
    duplicates = defaultdict(list)
    total_files = 0
    skipped = 0
    print(f"🔍 Scanning {home} for duplicates (>{MIN_SIZE_MB}MB)...")
    print(f"   Skipping: {', '.join(sorted(SKIP_DIRS))}\n")
    for root, dirs, files in os.walk(home):
        # Prune skip directories in place so os.walk never descends into them.
        # The substring test only covers bare names; '/' and '*' entries are
        # caught per-file by should_skip() below.
        dirs[:] = [d for d in dirs if not any(s in d for s in SKIP_DIRS)]
        for filename in files:
            filepath = Path(root) / filename
            if should_skip(filepath):
                skipped += 1
                continue
            try:
                if filepath.stat().st_size >= MIN_SIZE_BYTES:
                    checksum = sha256_file(filepath)
                    duplicates[checksum].append(filepath)
                    total_files += 1
            except OSError:
                # PermissionError is a subclass of OSError, so one clause
                # covers unreadable and vanished files alike.
                skipped += 1
                continue

    # Find actual duplicates
    dupes_by_type = defaultdict(list)
    total_dupes = 0
    total_wasted = 0
    for checksum, files in duplicates.items():
        if len(files) > 1:
            file_size = files[0].stat().st_size
            file_type = get_file_type(files[0])
            dupes_by_type[file_type].append({
                'checksum': checksum,
                'files': files,
                'size': file_size
            })
            total_dupes += len(files)
            total_wasted += file_size * (len(files) - 1)

    # Report
    if not dupes_by_type:
        print("✅ No duplicates found!")
        return
    print("📊 Results:")
    print(f"   Files hashed: {total_files}")
    print(f"   Skipped: {skipped}")
    print(f"   Files in duplicate sets: {total_dupes}")
    print(f"   Wasted space: {total_wasted / (1024**3):.2f} GB\n")
    for ftype, groups in sorted(dupes_by_type.items()):
        print(f"📁 {ftype.upper()} ({len(groups)} groups):")
        for group in groups[:5]:  # Limit to 5 per type
            print(f"   {len(group['files'])} copies, {group['size'] / (1024**2):.1f} MB each")
            print(f"   {group['files'][0]}")
            for f in group['files'][1:]:
                print(f"     {f}")
        if len(groups) > 5:
            print(f"   ... and {len(groups) - 5} more groups")
        print()

if __name__ == '__main__':
    main()
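
# Typical invocation (the script takes no arguments and always scans the
# invoking user's home directory):
#   python3 dupes.py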