Implement rag-search skill for semantic search
Add a new skill for semantic search across personal state files and
external documentation, using ChromaDB and sentence-transformers.

Components:
- search.py: Main search interface (--index, --top-k flags)
- index_personal.py: Index ~/.claude/state files
- index_docs.py: Index external docs (git repos)
- add_doc_source.py: Manage doc sources
- test_rag.py: Test suite (5/5 passing)

Features:
- Two indexes: personal (116 chunks) and docs (k0s: 846 chunks)
- all-MiniLM-L6-v2 embeddings (384 dimensions)
- ChromaDB persistent storage
- JSON output with ranked results and metadata

Documentation:
- Added to component-registry.json with triggers
- Added /rag command alias
- Updated skills/README.md
- Resolved fc-013 (vector database for agent memory)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
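Based on the flags and output format listed above, a typical search invocation would plausibly look like this (the positional query argument and exact result fields are assumptions; search.py itself is not part of this diff):

    python skills/rag-search/scripts/search.py --index personal --top-k 5 "vector database"
    # expected: JSON with ranked results, each carrying chunk text and metadata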
skills/rag-search/scripts/index_personal.py (286 lines, new executable file)
@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
RAG Search - Personal Index Builder

Indexes ~/.claude/state files for semantic search.
Chunks JSON files by key for optimal retrieval.
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Generator

# Add venv site-packages to path
VENV_PATH = Path(__file__).parent.parent / "venv" / "lib" / "python3.13" / "site-packages"
if str(VENV_PATH) not in sys.path:
    sys.path.insert(0, str(VENV_PATH))
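# NOTE: the site-packages path above is pinned to python3.13; it must be
# updated if the venv is rebuilt with a different interpreter version.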

import chromadb
from sentence_transformers import SentenceTransformer

# Constants
STATE_DIR = Path.home() / ".claude" / "state"
DATA_DIR = Path.home() / ".claude" / "data" / "rag-search"
CHROMA_DIR = DATA_DIR / "chroma"
MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "personal"


def chunk_json_file(file_path: Path) -> Generator[tuple[str, dict], None, None]:
    """
    Chunk a JSON file into searchable segments.

    Strategy:
    - Arrays: Each item becomes a chunk
    - Objects with arrays: Each array item with parent context
    - Nested objects: Flatten with path prefix

    Yields:
        (chunk_text, metadata) tuples
    """
    try:
        with open(file_path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f"  Warning: Could not parse {file_path}: {e}", file=sys.stderr)
        return

    rel_path = str(file_path.relative_to(STATE_DIR))
    base_metadata = {"file": rel_path}

    def process_item(item: dict, context: str = "") -> Generator[tuple[str, dict], None, None]:
        """Process a single item from JSON structure."""
        if isinstance(item, dict):
            # Check for common patterns in our state files

            # Memory items (decisions, preferences, facts, projects)
            if "content" in item:
                text_parts = []
                if context:
                    text_parts.append(f"[{context}]")
                text_parts.append(item.get("content", ""))
                if item.get("context"):
                    text_parts.append(f"Context: {item['context']}")
                if item.get("rationale"):
                    text_parts.append(f"Rationale: {item['rationale']}")

                metadata = {**base_metadata}
                if item.get("date"):
                    metadata["date"] = item["date"]
                if item.get("id"):
                    metadata["id"] = item["id"]
                if item.get("status"):
                    metadata["status"] = item["status"]

                yield (" ".join(text_parts), metadata)
                return

            # General instructions (memory)
            if "instruction" in item:
                text_parts = [item["instruction"]]
                metadata = {**base_metadata}
                if item.get("added"):
                    metadata["date"] = item["added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (" ".join(text_parts), metadata)
                return

            # Knowledge base entries
            if "fact" in item or "answer" in item:
                text = item.get("fact") or item.get("answer", "")
                if item.get("question"):
                    text = f"Q: {item['question']} A: {text}"
                metadata = {**base_metadata}
                if item.get("category"):
                    metadata["category"] = item["category"]
                yield (text, metadata)
                return

            # Component registry entries
            if "name" in item and "description" in item:
                text = f"{item['name']}: {item['description']}"
                if item.get("triggers"):
                    text += f" Triggers: {', '.join(item['triggers'])}"
                metadata = {**base_metadata, "type": item.get("type", "unknown")}
                yield (text, metadata)
                return

            # Future considerations
            if "id" in item and "title" in item:
                text = f"{item.get('id', '')}: {item['title']}"
                if item.get("description"):
                    text += f" - {item['description']}"
                if item.get("rationale"):
                    text += f" Rationale: {item['rationale']}"
                metadata = {**base_metadata}
                if item.get("date_added"):
                    metadata["date"] = item["date_added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (text, metadata)
                return

            # System instructions - processes
            if "process" in item or "name" in item:
                parts = []
                if item.get("name"):
                    parts.append(item["name"])
                if item.get("description"):
                    parts.append(item["description"])
                if item.get("steps"):
                    parts.append("Steps: " + " ".join(item["steps"]))
                if parts:
                    yield (" - ".join(parts), {**base_metadata})
                return

            # Fallback: stringify the whole object
            text = json.dumps(item, indent=None)
            if len(text) > 50:  # Only index if substantial
                yield (text[:1000], {**base_metadata})  # Truncate very long items

        elif isinstance(item, str) and len(item) > 20:
            yield (item, {**base_metadata})

    # Process top-level structure
    if isinstance(data, list):
        for item in data:
            yield from process_item(item)
    elif isinstance(data, dict):
        # Handle nested arrays within objects
        for key, value in data.items():
            if isinstance(value, list):
                for item in value:
                    yield from process_item(item, context=key)
            elif isinstance(value, dict):
                yield from process_item(value, context=key)
            elif isinstance(value, str) and len(value) > 20:
                yield (f"{key}: {value}", {**base_metadata})


def find_json_files() -> list[Path]:
    """Find all JSON files in the state directory."""
    files = []
    # "**/*.json" already matches the top level in pathlib, so the two
    # patterns overlap; sorted(set(...)) below removes the duplicates.
    for pattern in ["*.json", "**/*.json"]:
        files.extend(STATE_DIR.glob(pattern))
    return sorted(set(files))


def index_personal(quiet: bool = False, force: bool = False) -> dict:
    """
    Index all personal state files.

    Args:
        quiet: Suppress progress output
        force: Force reindex even if already exists (currently unused; the
            collection is always rebuilt from scratch)

    Returns:
        Summary statistics
    """
    if not quiet:
        print(f"Indexing personal state from {STATE_DIR}")

    # Initialize model and client
    model = SentenceTransformer(MODEL_NAME)
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))

    # Delete and recreate collection for clean reindex
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass

    collection = client.create_collection(
        name=COLLECTION_NAME,
        metadata={"description": "Personal state files from ~/.claude/state"}
    )

    # Find and process files
    files = find_json_files()
    if not quiet:
        print(f"Found {len(files)} JSON files")

    total_chunks = 0
    chunks = []
    metadatas = []
    ids = []

    for file_path in files:
        if not quiet:
            print(f"  Processing: {file_path.relative_to(STATE_DIR)}")

        for chunk_text, metadata in chunk_json_file(file_path):
            # Skip empty or very short chunks
            if not chunk_text or len(chunk_text.strip()) < 10:
                continue

            chunk_id = f"personal_{total_chunks}"
            chunks.append(chunk_text)
            metadatas.append(metadata)
            ids.append(chunk_id)
            total_chunks += 1

    # Batch embed and add to collection
    if chunks:
        if not quiet:
            print(f"Embedding {len(chunks)} chunks...")

        embeddings = model.encode(chunks, show_progress_bar=not quiet).tolist()

        # Add in batches (ChromaDB has limits)
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            end_idx = min(i + batch_size, len(chunks))
            collection.add(
                documents=chunks[i:end_idx],
                embeddings=embeddings[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx]
            )

    stats = {
        "collection": COLLECTION_NAME,
        "files_processed": len(files),
        "chunks_indexed": total_chunks,
        "indexed_at": datetime.now().isoformat()
    }
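    # Example stats (illustrative values; the chunk count matches the commit message):
    #   {"collection": "personal", "files_processed": 12,
    #    "chunks_indexed": 116, "indexed_at": "2025-11-30T12:00:00"}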

    if not quiet:
        print(f"\nIndexed {total_chunks} chunks from {len(files)} files")

    return stats


def main():
    parser = argparse.ArgumentParser(
        description="Index personal state files for RAG search"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Suppress progress output"
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Force reindex even if already indexed"
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Output stats as JSON"
    )

    args = parser.parse_args()
    stats = index_personal(quiet=args.quiet, force=args.force)

    if args.stats:
        print(json.dumps(stats, indent=2))


if __name__ == "__main__":
    main()
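For context, a minimal sketch of how a query against the resulting collection might look. search.py is not part of this diff, so this is an assumption built only from the constants above and the public ChromaDB and sentence-transformers APIs:

    from pathlib import Path

    import chromadb
    from sentence_transformers import SentenceTransformer

    # Mirror the constants defined in index_personal.py.
    chroma_dir = Path.home() / ".claude" / "data" / "rag-search" / "chroma"
    model = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=str(chroma_dir))
    collection = client.get_collection("personal")

    # Embed the query with the same model used at index time, then fetch
    # the top matches (n_results plays the role of the --top-k flag).
    hits = collection.query(
        query_embeddings=model.encode(["vector database for agent memory"]).tolist(),
        n_results=5,
    )
    for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
        print(meta.get("file"), doc[:80])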