# Listing metadata from the hosting page: Python, executable file, 52 lines, 1.9 KiB.
import hashlib
import os
import sys
from collections import defaultdict


# Directories that find_duplicates() prunes from the walk.
# NOTE: os.walk yields bare directory names, so an entry containing "/"
# (".config/llama-swap") only has an effect if find_duplicates() matches
# it against the directory's path rather than its name.
EXCLUDED_DIRS = {
    ".cache",
    "node_modules",
    ".git",
    ".local",
    "tmp",
    ".npm",
    ".cargo",
    ".ollama",
    ".config/llama-swap",
}

# Size threshold in bytes (1 MiB); only files strictly larger than this are hashed.
MIN_FILE_SIZE = 1 * 1024 * 1024


def calculate_checksum(filepath, chunk_size=65536):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is opened in binary mode and read *chunk_size* bytes at a time,
    so the whole file is never held in memory at once.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The hexadecimal digest string, or ``None`` when the file cannot be
        opened or read (any ``OSError``).

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # A zero-sized read returns b"" immediately, which would silently yield
    # the digest of an empty file; reject it up front.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    sha256_hash = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            while True:
                byte_block = f.read(chunk_size)
                if not byte_block:
                    break
                sha256_hash.update(byte_block)
    except OSError:
        # Best effort: unreadable files are reported as "no checksum".
        return None
    return sha256_hash.hexdigest()


def _is_excluded_dir(parent, name, path_suffixes):
    """Return True if directory *name* inside *parent* must be skipped.

    A directory is excluded when its bare name is listed in EXCLUDED_DIRS,
    or when its path ends with one of *path_suffixes* (the multi-component
    entries of EXCLUDED_DIRS, each prefixed with "/").
    """
    if name in EXCLUDED_DIRS:
        return True
    # Normalise to forward slashes and add a leading "/" so a suffix only
    # matches on whole path components (".config/x" must not match
    # "my.config/x") and also matches when it is the entire relative path.
    candidate = "/" + os.path.join(parent, name).replace(os.sep, "/")
    return candidate.endswith(path_suffixes)


def find_duplicates(start_path):
    """Find files under *start_path* that share the same SHA-256 checksum.

    Walks the tree rooted at *start_path*, skipping directories listed in
    EXCLUDED_DIRS, and hashes every file strictly larger than MIN_FILE_SIZE
    with calculate_checksum(). A progress line is printed after every 100
    successfully hashed files.

    Args:
        start_path: Directory at which the walk starts.

    Returns:
        A dict mapping each checksum to the list of file paths that produced
        it, restricted to checksums shared by more than one file.
    """
    # Entries such as ".config/llama-swap" can never equal a bare directory
    # name, so they are matched as path suffixes instead. Computed once here
    # because it does not change during the walk.
    path_suffixes = tuple(
        "/" + entry.strip("/") for entry in EXCLUDED_DIRS if "/" in entry
    )

    checksums = defaultdict(list)
    processed_files = 0
    for root, dirs, files in os.walk(start_path):
        # Slice assignment edits the list in place, which is what makes
        # os.walk skip the pruned directories.
        dirs[:] = [
            d for d in dirs if not _is_excluded_dir(root, d, path_suffixes)
        ]
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                size = os.path.getsize(filepath)
            except OSError:
                # Vanished or unreadable entry: skip it and keep scanning.
                continue
            if size <= MIN_FILE_SIZE:
                continue

            checksum = calculate_checksum(filepath)
            if not checksum:
                continue

            checksums[checksum].append(filepath)
            processed_files += 1
            if processed_files % 100 == 0:
                print(f"Processed {processed_files} files...")

    return {
        digest: paths for digest, paths in checksums.items() if len(paths) > 1
    }


def group_by_extension(duplicates):
    """Regroup duplicate file paths by lower-cased file extension.

    Args:
        duplicates: Mapping of checksum to a list of file paths, as returned
            by find_duplicates().

    Returns:
        A mapping of extension (including the leading dot, or "" when the
        file has none) to the list of paths with that extension, in the
        order they were encountered.
    """
    grouped = defaultdict(list)
    for paths in duplicates.values():
        for path in paths:
            ext = os.path.splitext(path)[-1].lower()
            grouped[ext].append(path)
    return grouped


def main(argv=None):
    """Scan a directory tree for duplicate files and print them by type.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, when present, is the
            directory to scan; otherwise the user's home directory is used.
    """
    args = sys.argv[1:] if argv is None else argv
    start_path = args[0] if args else os.path.expanduser("~")

    print("Scanning for duplicate files. This may take some time...")
    duplicates = find_duplicates(start_path)

    # Avoid printing a "found" header above an empty listing.
    if not duplicates:
        print("\nNo duplicate files found.")
        return

    print("\nDuplicate Files Found:")
    for ext, paths in group_by_extension(duplicates).items():
        print(f"\nFile Type: {ext if ext else 'No Extension'}")
        for path in paths:
            print(f" {path}")


if __name__ == "__main__":
    main()