chore: add duplicate-finder utilities and report
This commit is contained in:
129
scripts/dupes.py
Executable file
129
scripts/dupes.py
Executable file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Find duplicate files by checksum."""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
import mimetypes
|
||||
|
||||
# Configuration
# Files smaller than this many mebibytes are ignored by the scan.
MIN_SIZE_MB = 1
MIN_SIZE_BYTES = MIN_SIZE_MB * 1024 * 1024

# Directories to skip
# NOTE(review): entries containing '/' name a multi-component path and
# 'tmp.*' contains a wildcard; both only take effect if the matcher compares
# component sequences / expands globs. A plain per-component membership test
# will never match the '/' entries.
SKIP_DIRS = {
    '.cache', 'node_modules', '.git', '.local', 'tmp', 'tmp.*',
    '.npm', '.cargo', '.ollama', '.config/llama-swap',
    '.local/share/Steam', '.local/share/containers',
    '.local/lib/docker', '.thumbnails',
}
|
||||
|
||||
def should_skip(path, skip_dirs=None):
    """Return True if *path* matches any skip entry.

    Each entry is a ``/``-separated sequence of glob patterns. An entry
    matches when its patterns match a run of consecutive components of
    ``path.parts``, compared case-sensitively. A single-name entry such as
    ``'.git'`` therefore matches only a component named exactly ``.git``,
    ``'tmp.*'`` matches components starting with ``tmp.``, and
    ``'.config/llama-swap'`` matches a ``llama-swap`` component directly
    below a ``.config`` component.

    Every component of *path* is tested, including the last one, so a file
    whose own name matches an entry is skipped as well.

    Args:
        path: A ``pathlib`` path (anything exposing ``.parts``).
        skip_dirs: Iterable of skip entries. Defaults to the module-level
            ``SKIP_DIRS``.

    Returns:
        True if some entry matches, otherwise False.
    """
    # Imported locally so the function carries its own dependency.
    from fnmatch import fnmatchcase

    if skip_dirs is None:
        skip_dirs = SKIP_DIRS

    parts = path.parts
    for entry in skip_dirs:
        patterns = [segment for segment in entry.split('/') if segment]
        if not patterns:
            continue
        width = len(patterns)
        # Slide a window of `width` components along the path.
        for start in range(len(parts) - width + 1):
            window = parts[start:start + width]
            if all(fnmatchcase(name, pattern)
                   for name, pattern in zip(window, patterns)):
                return True
    return False
|
||||
|
||||
def sha256_file(filepath, block_size=65536):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is opened in binary mode and fed to the hasher in chunks of
    *block_size* bytes, so it is not loaded into memory at once.

    Args:
        filepath: Path of the file to hash (anything ``open`` accepts).
        block_size: Number of bytes to read per chunk. Must not be 0.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If *block_size* is 0.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b'' immediately, which would end the loop before any
    # data is hashed and silently yield the digest of an empty input.
    if block_size == 0:
        raise ValueError("block_size must not be 0")

    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        while True:
            block = f.read(block_size)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
|
||||
|
||||
def _is_archive_subtype(subtype):
    """Return True if a MIME subtype names an archive or compressed format.

    The subtype is split on ``-``, ``+`` and ``.`` and each token is tested
    on its own, so ``x-tar`` and ``epub+zip`` match while a type that merely
    contains the letters (``vnd.stardivision.writer``) does not. A token
    counts if it is ``compressed`` or, after dropping trailing digits, ends
    in ``zip`` or ``tar`` (``gzip``, ``bzip2``, ``gtar``, ``ustar``).
    """
    tokens = subtype.replace('+', '-').replace('.', '-').split('-')
    return any(
        token == 'compressed'
        or token.rstrip('0123456789').endswith(('zip', 'tar'))
        for token in tokens
    )


def get_file_type(filepath):
    """Guess a coarse file category from the file name's extension.

    Args:
        filepath: File name or path passed to ``mimetypes.guess_type``.

    Returns:
        One of ``'image'``, ``'video'``, ``'audio'``, ``'pdf'``,
        ``'archive'``, ``'text'`` or ``'other'``.
    """
    mime, encoding = mimetypes.guess_type(filepath)

    # A non-None encoding (gzip, bzip2, xz, ...) means the file is a
    # compressed wrapper, whatever the inner type is, e.g. "notes.txt.gz".
    if encoding is not None:
        return 'archive'
    if not mime:
        return 'other'

    for prefix, category in (('image/', 'image'),
                             ('video/', 'video'),
                             ('audio/', 'audio')):
        if mime.startswith(prefix):
            return category

    if 'pdf' in mime:
        return 'pdf'
    if _is_archive_subtype(mime.partition('/')[2]):
        return 'archive'
    if mime.startswith('text/'):
        return 'text'
    return 'other'
|
||||
|
||||
def _scan_candidates(top):
    """Walk *top* and group files of at least MIN_SIZE_BYTES by size.

    Skip rules are applied to paths relative to *top*, so the components of
    *top* itself never cause a skip. Symlinked files are skipped; otherwise
    a link would be reported as a duplicate of its own target.

    Returns:
        A ``(by_size, skipped)`` tuple: a dict mapping a size in bytes to
        the list of paths with that size, and the number of files skipped
        (skip rule, symlink, or stat error).
    """
    by_size = defaultdict(list)
    skipped = 0

    for root, dirs, files in os.walk(top):
        rel_root = Path(root).relative_to(top)

        # Prune in place so os.walk never descends into skipped directories.
        dirs[:] = [d for d in dirs if not should_skip(rel_root / d)]

        for filename in files:
            if should_skip(rel_root / filename):
                skipped += 1
                continue

            filepath = Path(root) / filename
            try:
                if filepath.is_symlink():
                    skipped += 1
                    continue
                size = filepath.stat().st_size
            except OSError:
                skipped += 1
                continue

            if size >= MIN_SIZE_BYTES:
                by_size[size].append(filepath)

    return by_size, skipped


def _find_duplicate_groups(by_size):
    """Hash files that share a size and return groups with equal checksums.

    Files whose size is unique are never opened: they cannot have a
    duplicate.

    Returns:
        A ``(groups, unreadable)`` tuple: a list of dicts with keys
        ``'checksum'``, ``'files'`` and ``'size'``, and the number of files
        that could not be read while hashing.
    """
    groups = []
    unreadable = 0

    for size, paths in by_size.items():
        if len(paths) < 2:
            continue

        by_checksum = defaultdict(list)
        for filepath in paths:
            try:
                by_checksum[sha256_file(filepath)].append(filepath)
            except OSError:
                unreadable += 1

        for checksum, same in by_checksum.items():
            if len(same) > 1:
                groups.append({
                    'checksum': checksum,
                    'files': same,
                    'size': size,
                })

    return groups, unreadable


def main():
    """Scan the home directory for duplicate files and print a report.

    Files of at least MIN_SIZE_BYTES are grouped by size, then by SHA-256
    checksum. Groups with more than one file are reported per file type,
    at most five groups per type.
    """
    home = Path.home()

    print(f"🔍 Scanning {home} for duplicates (>{MIN_SIZE_MB}MB)...")
    print(f" Skipping: {', '.join(sorted(SKIP_DIRS))}\n")

    by_size, skipped = _scan_candidates(home)
    groups, unreadable = _find_duplicate_groups(by_size)

    # Files that failed while hashing count as skipped, not scanned.
    skipped += unreadable
    total_files = sum(len(paths) for paths in by_size.values()) - unreadable

    # Find actual duplicates
    dupes_by_type = defaultdict(list)
    total_dupes = 0
    total_wasted = 0

    for group in groups:
        copies = len(group['files'])
        dupes_by_type[get_file_type(group['files'][0])].append(group)
        total_dupes += copies
        # One copy is the original; every further copy is wasted space.
        total_wasted += group['size'] * (copies - 1)

    # Report
    if not dupes_by_type:
        print("✅ No duplicates found!")
        return

    print("📊 Results:")
    print(f" Files scanned: {total_files}")
    print(f" Skipped: {skipped}")
    print(f" Duplicate files: {total_dupes}")
    print(f" Wasted space: {total_wasted / (1024**3):.2f} GB\n")

    for ftype, groups in sorted(dupes_by_type.items()):
        print(f"📁 {ftype.upper()} ({len(groups)} groups):")
        for group in groups[:5]:  # Limit to 5 per type
            print(f" {len(group['files'])} copies, {group['size'] / (1024**2):.1f} MB each")
            print(f" → {group['files'][0]}")
            for f in group['files'][1:]:
                print(f" {f}")
        if len(groups) > 5:
            print(f" ... and {len(groups) - 5} more groups")
        print()
|
||||
|
||||
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user