chore: add duplicate-finder utilities and report
This commit is contained in:
52
find_duplicates.py
Executable file
52
find_duplicates.py
Executable file
@@ -0,0 +1,52 @@
|
||||
import os
|
||||
import hashlib
|
||||
from collections import defaultdict
|
||||
|
||||
# Directory entries consulted by find_duplicates when pruning the walk.
EXCLUDED_DIRS = {
    ".cache",
    "node_modules",
    ".git",
    ".local",
    "tmp",
    ".npm",
    ".cargo",
    ".ollama",
    ".config/llama-swap",
}
|
||||
# Size threshold in bytes (1 MiB); find_duplicates only hashes files whose
# size is strictly greater than this.
MIN_FILE_SIZE = 1024 * 1024
|
||||
|
||||
def calculate_checksum(filepath, chunk_size=64 * 1024):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is opened in binary mode and fed to the hash in blocks of
    *chunk_size* bytes, so large files are never loaded into memory whole.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes requested per read. Must be positive.

    Returns:
        The hexadecimal SHA-256 digest as a string, or ``None`` when the
        file cannot be opened or read.

    Raises:
        ValueError: If *chunk_size* is not a positive number.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would silently yield the
        # digest of an empty file instead of the real content.
        raise ValueError("chunk_size must be positive")

    sha256_hash = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for byte_block in iter(lambda: f.read(chunk_size), b""):
                sha256_hash.update(byte_block)
    except OSError:
        # Unreadable files are reported as "no checksum" rather than
        # aborting the caller's scan. IOError is an alias of OSError.
        return None
    return sha256_hash.hexdigest()
|
||||
|
||||
def _is_excluded_dir(parent, name):
    """Return True if directory *name* inside *parent* matches EXCLUDED_DIRS."""
    if name in EXCLUDED_DIRS:
        return True
    # Entries such as ".config/llama-swap" name a path rather than a single
    # component, so a bare-name comparison can never match them. Compare
    # those entries against the tail of the full directory path instead.
    full_path = os.path.join(parent, name).replace(os.sep, "/")
    return any(
        "/" in entry
        and (full_path == entry or full_path.endswith("/" + entry))
        for entry in EXCLUDED_DIRS
    )


def find_duplicates(start_path):
    """Find files under *start_path* that share the same checksum.

    Walks the tree rooted at *start_path*, skipping directories listed in
    EXCLUDED_DIRS, and checksums every regular file whose size is strictly
    greater than MIN_FILE_SIZE using calculate_checksum. A progress line is
    printed after every 100 files that were checksummed successfully.

    Args:
        start_path: Directory at which the walk starts.

    Returns:
        A dict mapping each checksum to the list of file paths that produced
        it, containing only checksums shared by more than one file.
    """
    checksums = defaultdict(list)
    processed_files = 0
    for root, dirs, files in os.walk(start_path):
        # Prune in place so os.walk never descends into excluded directories.
        dirs[:] = [d for d in dirs if not _is_excluded_dir(root, d)]
        for filename in files:
            filepath = os.path.join(root, filename)
            # A symlink has its target's content by definition; hashing it
            # would report the link as a duplicate of the file it points to.
            if os.path.islink(filepath):
                continue
            try:
                size = os.path.getsize(filepath)
            except OSError:
                # File vanished or is unreadable; skip it and keep scanning.
                continue
            if size <= MIN_FILE_SIZE:
                continue
            checksum = calculate_checksum(filepath)
            if not checksum:
                # calculate_checksum signals an unreadable file with None.
                continue
            checksums[checksum].append(filepath)
            processed_files += 1
            if processed_files % 100 == 0:
                print(f"Processed {processed_files} files...")
    return {k: v for k, v in checksums.items() if len(v) > 1}
|
||||
|
||||
if __name__ == "__main__":
    # Scan the home directory of the user running the script.
    scan_root = os.path.expanduser("~")
    print("Scanning for duplicate files. This may take some time...")
    duplicate_groups = find_duplicates(scan_root)

    # Re-bucket every duplicated path by its lower-cased extension; the
    # buckets keep first-seen order, which is also the print order below.
    paths_by_ext = {}
    for group in duplicate_groups.values():
        for path in group:
            _, suffix = os.path.splitext(path)
            paths_by_ext.setdefault(suffix.lower(), []).append(path)

    print("\nDuplicate Files Found:")
    for suffix, paths in paths_by_ext.items():
        label = suffix if suffix else 'No Extension'
        print(f"\nFile Type: {label}")
        for path in paths:
            print(f" {path}")
|
||||
Reference in New Issue
Block a user