#!/usr/bin/env python3
"""Find duplicate files by checksum."""
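
# Example invocation (the filename 'find_duplicates.py' is assumed for illustration,
# not taken from the repo). The script accepts no arguments and always scans the
# current user's home directory:
#
#   chmod +x find_duplicates.py
#   ./find_duplicates.py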

import hashlib
import os
from pathlib import Path
from collections import defaultdict
import mimetypes

# Configuration
MIN_SIZE_MB = 1
MIN_SIZE_BYTES = MIN_SIZE_MB * 1024 * 1024

# Directories to skip
SKIP_DIRS = {
    '.cache', 'node_modules', '.git', '.local', 'tmp', 'tmp.*',
    '.npm', '.cargo', '.ollama', '.config/llama-swap',
    '.local/share/Steam', '.local/share/containers',
    '.local/lib/docker', '.thumbnails'
}

def should_skip(path):
    """Check if path should be skipped."""
    parts = path.parts
    for skip in SKIP_DIRS:
        if '*' in skip:
            # Wildcard entries like 'tmp.*' match any path component containing the prefix
            pattern = skip.replace('*', '')
            if any(pattern in p for p in parts):
                return True
        elif '/' in skip:
            # Multi-segment entries like '.config/llama-swap' can never equal a single
            # path component, so match them against the full path instead
            if skip in path.as_posix():
                return True
        elif skip in parts:
            return True
    return False

def sha256_file(filepath, block_size=65536):
    """Calculate SHA256 checksum of a file."""
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            hasher.update(block)
    return hasher.hexdigest()

def get_file_type(filepath):
    """Guess file type from extension."""
    mime, _ = mimetypes.guess_type(filepath)
    if mime:
        if mime.startswith('image/'):
            return 'image'
        elif mime.startswith('video/'):
            return 'video'
        elif mime.startswith('audio/'):
            return 'audio'
        elif 'pdf' in mime:
            return 'pdf'
        elif 'zip' in mime or 'tar' in mime or 'compressed' in mime:
            return 'archive'
        elif mime.startswith('text/'):
            return 'text'
    return 'other'

def main():
    home = Path.home()
    duplicates = defaultdict(list)  # checksum -> list of paths with that checksum
    total_files = 0
    skipped = 0

    print(f"🔍 Scanning {home} for duplicates (>{MIN_SIZE_MB}MB)...")
    print(f" Skipping: {', '.join(sorted(SKIP_DIRS))}\n")

    for root, dirs, files in os.walk(home):
        # Prune directories in place so os.walk never descends into them.
        # This is a coarse substring match; should_skip() below does the precise check.
        dirs[:] = [d for d in dirs if not any(s in d for s in SKIP_DIRS)]

        for filename in files:
            filepath = Path(root) / filename

            if should_skip(filepath):
                skipped += 1
                continue

            try:
                if filepath.stat().st_size >= MIN_SIZE_BYTES:
                    checksum = sha256_file(filepath)
                    duplicates[checksum].append(filepath)
                    total_files += 1
            except OSError:
                # Unreadable or vanished file (PermissionError is a subclass of OSError)
                skipped += 1
                continue

    # Find actual duplicates
    dupes_by_type = defaultdict(list)
    total_dupes = 0
    total_wasted = 0

    for checksum, files in duplicates.items():
        if len(files) > 1:
            file_size = files[0].stat().st_size
            file_type = get_file_type(files[0])
            dupes_by_type[file_type].append({
                'checksum': checksum,
                'files': files,
                'size': file_size
            })
            total_dupes += len(files)
            total_wasted += file_size * (len(files) - 1)

    # Report
    if not dupes_by_type:
        print("✅ No duplicates found!")
        return

    print("📊 Results:")
    print(f" Files hashed: {total_files}")
    print(f" Skipped: {skipped}")
    print(f" Duplicate files: {total_dupes}")
    print(f" Wasted space: {total_wasted / (1024**3):.2f} GB\n")

    for ftype, groups in sorted(dupes_by_type.items()):
        print(f"📁 {ftype.upper()} ({len(groups)} groups):")
        for group in groups[:5]:  # Limit to 5 groups per type
            print(f" {len(group['files'])} copies, {group['size'] / (1024**2):.1f} MB each")
            print(f" → {group['files'][0]}")
            for f in group['files'][1:]:
                print(f" {f}")
        if len(groups) > 5:
            print(f" ... and {len(groups) - 5} more groups")
        print()

if __name__ == '__main__':
    main()