#!/usr/bin/env python3 """ RAG Search - Personal Index Builder Indexes ~/.claude/state files for semantic search. Chunks JSON files by key for optimal retrieval. """ import argparse import json import sys from datetime import datetime from pathlib import Path from typing import Generator # Add venv site-packages to path VENV_PATH = Path(__file__).parent.parent / "venv" / "lib" / "python3.13" / "site-packages" if str(VENV_PATH) not in sys.path: sys.path.insert(0, str(VENV_PATH)) import chromadb from sentence_transformers import SentenceTransformer # Constants STATE_DIR = Path.home() / ".claude" / "state" DATA_DIR = Path.home() / ".claude" / "data" / "rag-search" CHROMA_DIR = DATA_DIR / "chroma" MODEL_NAME = "all-MiniLM-L6-v2" COLLECTION_NAME = "personal" def chunk_json_file(file_path: Path) -> Generator[tuple[str, dict], None, None]: """ Chunk a JSON file into searchable segments. Strategy: - Arrays: Each item becomes a chunk - Objects with arrays: Each array item with parent context - Nested objects: Flatten with path prefix Yields: (chunk_text, metadata) tuples """ try: with open(file_path) as f: data = json.load(f) except (json.JSONDecodeError, IOError) as e: print(f" Warning: Could not parse {file_path}: {e}", file=sys.stderr) return rel_path = str(file_path.relative_to(STATE_DIR)) base_metadata = {"file": rel_path} def process_item(item: dict, context: str = "") -> Generator[tuple[str, dict], None, None]: """Process a single item from JSON structure.""" if isinstance(item, dict): # Check for common patterns in our state files # Memory items (decisions, preferences, facts, projects) if "content" in item: text_parts = [] if context: text_parts.append(f"[{context}]") text_parts.append(item.get("content", "")) if item.get("context"): text_parts.append(f"Context: {item['context']}") if item.get("rationale"): text_parts.append(f"Rationale: {item['rationale']}") metadata = {**base_metadata} if item.get("date"): metadata["date"] = item["date"] if item.get("id"): metadata["id"] = item["id"] if item.get("status"): metadata["status"] = item["status"] yield (" ".join(text_parts), metadata) return # General instructions (memory) if "instruction" in item: text_parts = [item["instruction"]] metadata = {**base_metadata} if item.get("added"): metadata["date"] = item["added"] if item.get("status"): metadata["status"] = item["status"] yield (" ".join(text_parts), metadata) return # Knowledge base entries if "fact" in item or "answer" in item: text = item.get("fact") or item.get("answer", "") if item.get("question"): text = f"Q: {item['question']} A: {text}" metadata = {**base_metadata} if item.get("category"): metadata["category"] = item["category"] yield (text, metadata) return # Component registry entries if "name" in item and "description" in item: text = f"{item['name']}: {item['description']}" if item.get("triggers"): text += f" Triggers: {', '.join(item['triggers'])}" metadata = {**base_metadata, "type": item.get("type", "unknown")} yield (text, metadata) return # Future considerations if "id" in item and "title" in item: text = f"{item.get('id', '')}: {item['title']}" if item.get("description"): text += f" - {item['description']}" if item.get("rationale"): text += f" Rationale: {item['rationale']}" metadata = {**base_metadata} if item.get("date_added"): metadata["date"] = item["date_added"] if item.get("status"): metadata["status"] = item["status"] yield (text, metadata) return # System instructions - processes if "process" in item or "name" in item: parts = [] if item.get("name"): parts.append(item["name"]) if item.get("description"): parts.append(item["description"]) if item.get("steps"): parts.append("Steps: " + " ".join(item["steps"])) if parts: yield (" - ".join(parts), {**base_metadata}) return # Fallback: stringify the whole object text = json.dumps(item, indent=None) if len(text) > 50: # Only index if substantial yield (text[:1000], {**base_metadata}) # Truncate very long items elif isinstance(item, str) and len(item) > 20: yield (item, {**base_metadata}) # Process top-level structure if isinstance(data, list): for item in data: yield from process_item(item) elif isinstance(data, dict): # Handle nested arrays within objects for key, value in data.items(): if isinstance(value, list): for item in value: yield from process_item(item, context=key) elif isinstance(value, dict): yield from process_item(value, context=key) elif isinstance(value, str) and len(value) > 20: yield (f"{key}: {value}", {**base_metadata}) def find_json_files() -> list[Path]: """Find all JSON files in the state directory.""" files = [] for pattern in ["*.json", "**/*.json"]: files.extend(STATE_DIR.glob(pattern)) return sorted(set(files)) def index_personal(quiet: bool = False, force: bool = False) -> dict: """ Index all personal state files. Args: quiet: Suppress progress output force: Force reindex even if already exists Returns: Summary statistics """ if not quiet: print(f"Indexing personal state from {STATE_DIR}") # Initialize model and client model = SentenceTransformer(MODEL_NAME) CHROMA_DIR.mkdir(parents=True, exist_ok=True) client = chromadb.PersistentClient(path=str(CHROMA_DIR)) # Delete and recreate collection for clean reindex try: client.delete_collection(COLLECTION_NAME) except Exception: pass collection = client.create_collection( name=COLLECTION_NAME, metadata={"description": "Personal state files from ~/.claude/state"} ) # Find and process files files = find_json_files() if not quiet: print(f"Found {len(files)} JSON files") total_chunks = 0 chunks = [] metadatas = [] ids = [] for file_path in files: if not quiet: print(f" Processing: {file_path.relative_to(STATE_DIR)}") for chunk_text, metadata in chunk_json_file(file_path): # Skip empty or very short chunks if not chunk_text or len(chunk_text.strip()) < 10: continue chunk_id = f"personal_{total_chunks}" chunks.append(chunk_text) metadatas.append(metadata) ids.append(chunk_id) total_chunks += 1 # Batch embed and add to collection if chunks: if not quiet: print(f"Embedding {len(chunks)} chunks...") embeddings = model.encode(chunks, show_progress_bar=not quiet).tolist() # Add in batches (ChromaDB has limits) batch_size = 100 for i in range(0, len(chunks), batch_size): end_idx = min(i + batch_size, len(chunks)) collection.add( documents=chunks[i:end_idx], embeddings=embeddings[i:end_idx], metadatas=metadatas[i:end_idx], ids=ids[i:end_idx] ) stats = { "collection": COLLECTION_NAME, "files_processed": len(files), "chunks_indexed": total_chunks, "indexed_at": datetime.now().isoformat() } if not quiet: print(f"\nIndexed {total_chunks} chunks from {len(files)} files") return stats def main(): parser = argparse.ArgumentParser( description="Index personal state files for RAG search" ) parser.add_argument( "--quiet", "-q", action="store_true", help="Suppress progress output" ) parser.add_argument( "--force", "-f", action="store_true", help="Force reindex even if already indexed" ) parser.add_argument( "--stats", action="store_true", help="Output stats as JSON" ) args = parser.parse_args() stats = index_personal(quiet=args.quiet, force=args.force) if args.stats: print(json.dumps(stats, indent=2)) if __name__ == "__main__": main()