chore: add duplicate-finder utilities and report

William Valentin
2026-01-27 02:53:42 -08:00
parent 2e43612eb2
commit 4d5f1958cc
5 changed files with 360 additions and 0 deletions

find_duplicates.py Executable file

@@ -0,0 +1,52 @@
import os
import hashlib
from collections import defaultdict

# Directory names (or paths relative to the scan root) pruned from the walk.
EXCLUDED_DIRS = {".cache", "node_modules", ".git", ".local", "tmp", ".npm", ".cargo", ".ollama", ".config/llama-swap"}
MIN_FILE_SIZE = 1 * 1024 * 1024  # 1 MB


def calculate_checksum(filepath):
    """Return the SHA-256 hex digest of filepath, or None if it cannot be read."""
    sha256_hash = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            # Hash in 4 KiB blocks so large files never have to fit in memory.
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()
    except (OSError, IOError):
        return None


def find_duplicates(start_path):
    """Walk start_path and return {checksum: [paths]} for contents seen more than once."""
    checksums = defaultdict(list)
    processed_files = 0
    for root, dirs, files in os.walk(start_path):
        # Prune excluded directories in place; match both bare names and paths
        # relative to start_path so entries like ".config/llama-swap" take effect.
        dirs[:] = [
            d for d in dirs
            if d not in EXCLUDED_DIRS
            and os.path.relpath(os.path.join(root, d), start_path) not in EXCLUDED_DIRS
        ]
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                # Only files above the size threshold are hashed and counted.
                if os.path.getsize(filepath) > MIN_FILE_SIZE:
                    checksum = calculate_checksum(filepath)
                    if checksum:
                        checksums[checksum].append(filepath)
                    processed_files += 1
                    if processed_files % 100 == 0:
                        print(f"Processed {processed_files} files...")
            except (OSError, IOError):
                continue
    # Keep only checksums with more than one path, i.e. actual duplicates.
    return {k: v for k, v in checksums.items() if len(v) > 1}


if __name__ == "__main__":
    home_dir = os.path.expanduser("~")
    print("Scanning for duplicate files. This may take some time...")
    duplicates = find_duplicates(home_dir)

    # Group every duplicate path by file extension for the report.
    grouped_by_extension = defaultdict(list)
    for files in duplicates.values():
        for file in files:
            ext = os.path.splitext(file)[-1].lower()
            grouped_by_extension[ext].append(file)

    print("\nDuplicate Files Found:")
    for ext, files in grouped_by_extension.items():
        print(f"\nFile Type: {ext if ext else 'No Extension'}")
        for file in files:
            print(f"  {file}")