"""Scan the home directory for duplicate files (>1 MB) and group them by extension."""

import hashlib
import os
from collections import defaultdict

# Directory names to skip during the walk. Plain names match at any depth;
# entries containing a separator (e.g. ".config/llama-swap") are matched
# against the path relative to the scan root.
EXCLUDED_DIRS = {".cache", "node_modules", ".git", ".local", "tmp", ".npm",
                 ".cargo", ".ollama", ".config/llama-swap"}
MIN_FILE_SIZE = 1 * 1024 * 1024  # 1 MB


def calculate_checksum(filepath):
    """Return the SHA-256 hex digest of a file, or None if it cannot be read."""
    sha256_hash = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            # Read in 4 KB blocks so large files never need to fit in memory.
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()
    except OSError:  # IOError is an alias of OSError in Python 3
        return None


def find_duplicates(start_path):
    """Walk start_path and group files larger than MIN_FILE_SIZE by checksum."""
    checksums = defaultdict(list)
    processed_files = 0
    for root, dirs, files in os.walk(start_path):
        # Prune excluded directories in place so os.walk never descends into
        # them. The relpath check makes nested entries like
        # ".config/llama-swap" actually match, which a bare name test would miss.
        dirs[:] = [
            d for d in dirs
            if d not in EXCLUDED_DIRS
            and os.path.relpath(os.path.join(root, d), start_path) not in EXCLUDED_DIRS
        ]
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                if os.path.getsize(filepath) > MIN_FILE_SIZE:
                    checksum = calculate_checksum(filepath)
                    if checksum:
                        checksums[checksum].append(filepath)
                        processed_files += 1
                        if processed_files % 100 == 0:
                            print(f"Processed {processed_files} files...")
            except OSError:
                # File vanished or is unreadable; skip it and keep walking.
                continue
    # Only checksums seen more than once correspond to duplicates.
    return {k: v for k, v in checksums.items() if len(v) > 1}


if __name__ == "__main__":
    home_dir = os.path.expanduser("~")
    print("Scanning for duplicate files. This may take some time...")
    duplicates = find_duplicates(home_dir)

    # Regroup the duplicate sets by file extension for a tidier report.
    grouped_by_extension = defaultdict(list)
    for files in duplicates.values():
        for file in files:
            ext = os.path.splitext(file)[-1].lower()
            grouped_by_extension[ext].append(file)

    print("\nDuplicate Files Found:")
    for ext, files in grouped_by_extension.items():
        print(f"\nFile Type: {ext if ext else 'No Extension'}")
        for file in files:
            print(f"  {file}")