#!/usr/bin/env python3
"""Find duplicate files by checksum."""
import hashlib
import os
from pathlib import Path
from collections import defaultdict
import mimetypes

# Configuration
MIN_SIZE_MB = 1
MIN_SIZE_BYTES = MIN_SIZE_MB * 1024 * 1024
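# Only files at or above this size are hashed; raising MIN_SIZE_MB speeds
# up scans at the cost of missing smaller duplicates.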

# Directories to skip
SKIP_DIRS = {
    '.cache', 'node_modules', '.git', '.local', 'tmp', 'tmp.*',
    '.npm', '.cargo', '.ollama', '.config/llama-swap',
    '.local/share/Steam', '.local/share/containers',
    '.local/lib/docker', '.thumbnails'
}
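# Entries take three forms: bare names match a single path component,
# entries containing '/' match a multi-component sub-path, and entries
# containing '*' are treated as crude substring patterns (see should_skip).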

def should_skip(path):
    """Check if path should be skipped."""
    parts = path.parts
    for skip in SKIP_DIRS:
        if '*' in skip:
            # Crude pattern: 'tmp.*' matches any component containing 'tmp.'
            pattern = skip.replace('*', '')
            if any(pattern in p for p in parts):
                return True
        elif '/' in skip:
            # Multi-component entries like '.local/share/Steam' never appear
            # as a single path component, so match them against the whole path.
            if skip in path.as_posix():
                return True
        elif skip in parts:
            return True
    return False
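
# should_skip() is the precise per-file check; main() additionally prunes
# matching directory names during the walk so os.walk never descends into them.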

def sha256_file(filepath, block_size=65536):
    """Calculate SHA256 checksum of a file."""
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        # Stream in 64 KiB blocks so large files never have to fit in memory.
        for block in iter(lambda: f.read(block_size), b''):
            hasher.update(block)
    return hasher.hexdigest()

def get_file_type(filepath):
    """Guess file type from extension."""
    mime, _ = mimetypes.guess_type(filepath)
    if mime:
        if mime.startswith('image/'):
            return 'image'
        elif mime.startswith('video/'):
            return 'video'
        elif mime.startswith('audio/'):
            return 'audio'
        elif 'pdf' in mime:
            return 'pdf'
        elif 'zip' in mime or 'tar' in mime or 'compressed' in mime:
            return 'archive'
        elif mime.startswith('text/'):
            return 'text'
    return 'other'
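
# Note: mimetypes guesses from the file extension alone, so files with
# missing or unusual extensions are reported under 'other'.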

def main():
    home = Path.home()
    duplicates = defaultdict(list)
    total_files = 0
    skipped = 0
    print(f"🔍 Scanning {home} for duplicates (>{MIN_SIZE_MB}MB)...")
    print(f"   Skipping: {', '.join(sorted(SKIP_DIRS))}\n")
    for root, dirs, files in os.walk(home):
        # Prune skip directories in place so os.walk never descends into them.
        # The substring test only covers bare names; '/' and '*' entries are
        # caught per-file by should_skip() below.
        dirs[:] = [d for d in dirs if not any(s in d for s in SKIP_DIRS)]
        for filename in files:
            filepath = Path(root) / filename
            if should_skip(filepath):
                skipped += 1
                continue
            try:
                if filepath.stat().st_size >= MIN_SIZE_BYTES:
                    checksum = sha256_file(filepath)
                    duplicates[checksum].append(filepath)
                    total_files += 1
            except OSError:
                # PermissionError is a subclass of OSError, so one clause
                # covers unreadable and vanished files alike.
                skipped += 1
                continue

    # Find actual duplicates
    dupes_by_type = defaultdict(list)
    total_dupes = 0
    total_wasted = 0
    for checksum, files in duplicates.items():
        if len(files) > 1:
            file_size = files[0].stat().st_size
            file_type = get_file_type(files[0])
            dupes_by_type[file_type].append({
                'checksum': checksum,
                'files': files,
                'size': file_size
            })
            total_dupes += len(files)
            total_wasted += file_size * (len(files) - 1)

    # Report
    if not dupes_by_type:
        print("✅ No duplicates found!")
        return
    print("📊 Results:")
    print(f"   Files hashed: {total_files}")
    print(f"   Skipped: {skipped}")
    print(f"   Files in duplicate sets: {total_dupes}")
    print(f"   Wasted space: {total_wasted / (1024**3):.2f} GB\n")
    for ftype, groups in sorted(dupes_by_type.items()):
        print(f"📁 {ftype.upper()} ({len(groups)} groups):")
        for group in groups[:5]:  # Limit to 5 per type
            print(f"   {len(group['files'])} copies, {group['size'] / (1024**2):.1f} MB each")
            print(f"   {group['files'][0]}")
            for f in group['files'][1:]:
                print(f"     {f}")
        if len(groups) > 5:
            print(f"   ... and {len(groups) - 5} more groups")
        print()

if __name__ == '__main__':
    main()
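
# Typical invocation (the script takes no arguments and always scans the
# invoking user's home directory):
#   python3 dupes.py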