Add new skill for semantic search across personal state files and external documentation using ChromaDB and sentence-transformers.

Components:
- search.py: Main search interface (--index, --top-k flags)
- index_personal.py: Index ~/.claude/state files
- index_docs.py: Index external docs (git repos)
- add_doc_source.py: Manage doc sources
- test_rag.py: Test suite (5/5 passing)

Features:
- Two indexes: personal (116 chunks) and docs (k0s: 846 chunks)
- all-MiniLM-L6-v2 embeddings (384 dimensions)
- ChromaDB persistent storage
- JSON output with ranked results and metadata

Documentation:
- Added to component-registry.json with triggers
- Added /rag command alias
- Updated skills/README.md
- Resolved fc-013 (vector database for agent memory)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
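
For context on how these indexes are consumed: each one is a plain ChromaDB collection, so ranked results can be pulled with a few lines of Python. The sketch below is illustrative only, reusing the paths, collection name, and embedding model from index_personal.py (shown below); the real entry point is search.py, whose interface is not part of this file.

```python
from pathlib import Path

import chromadb
from sentence_transformers import SentenceTransformer

# Paths and names taken from index_personal.py below
chroma_dir = Path.home() / ".claude" / "data" / "rag-search" / "chroma"
model = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=str(chroma_dir))
collection = client.get_collection("personal")

# Embed the query with the same model used at index time, then rank by similarity
query_embedding = model.encode(["which vector database do we use?"]).tolist()
results = collection.query(query_embeddings=query_embedding, n_results=5)

for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta.get("file"), "->", doc[:80])
```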
index_personal.py (287 lines, 9.2 KiB, Python, executable file)
#!/usr/bin/env python3
"""
RAG Search - Personal Index Builder

Indexes ~/.claude/state files for semantic search.
Chunks JSON files by key for optimal retrieval.
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Generator

# Add venv site-packages to path
VENV_PATH = Path(__file__).parent.parent / "venv" / "lib" / "python3.13" / "site-packages"
if str(VENV_PATH) not in sys.path:
    sys.path.insert(0, str(VENV_PATH))

import chromadb
from sentence_transformers import SentenceTransformer

# Constants
STATE_DIR = Path.home() / ".claude" / "state"
DATA_DIR = Path.home() / ".claude" / "data" / "rag-search"
CHROMA_DIR = DATA_DIR / "chroma"
MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "personal"


def chunk_json_file(file_path: Path) -> Generator[tuple[str, dict], None, None]:
    """
    Chunk a JSON file into searchable segments.

    Strategy:
    - Arrays: Each item becomes a chunk
    - Objects with arrays: Each array item with parent context
    - Nested objects: Flatten with path prefix

    Yields:
        (chunk_text, metadata) tuples
    """
    try:
        with open(file_path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f" Warning: Could not parse {file_path}: {e}", file=sys.stderr)
        return

    rel_path = str(file_path.relative_to(STATE_DIR))
    base_metadata = {"file": rel_path}

    def process_item(item: dict, context: str = "") -> Generator[tuple[str, dict], None, None]:
        """Process a single item from JSON structure."""
        if isinstance(item, dict):
            # Check for common patterns in our state files

            # Memory items (decisions, preferences, facts, projects)
            if "content" in item:
                text_parts = []
                if context:
                    text_parts.append(f"[{context}]")
                text_parts.append(item.get("content", ""))
                if item.get("context"):
                    text_parts.append(f"Context: {item['context']}")
                if item.get("rationale"):
                    text_parts.append(f"Rationale: {item['rationale']}")

                metadata = {**base_metadata}
                if item.get("date"):
                    metadata["date"] = item["date"]
                if item.get("id"):
                    metadata["id"] = item["id"]
                if item.get("status"):
                    metadata["status"] = item["status"]

                yield (" ".join(text_parts), metadata)
                return

            # General instructions (memory)
            if "instruction" in item:
                text_parts = [item["instruction"]]
                metadata = {**base_metadata}
                if item.get("added"):
                    metadata["date"] = item["added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (" ".join(text_parts), metadata)
                return

            # Knowledge base entries
            if "fact" in item or "answer" in item:
                text = item.get("fact") or item.get("answer", "")
                if item.get("question"):
                    text = f"Q: {item['question']} A: {text}"
                metadata = {**base_metadata}
                if item.get("category"):
                    metadata["category"] = item["category"]
                yield (text, metadata)
                return

            # Component registry entries
            if "name" in item and "description" in item:
                text = f"{item['name']}: {item['description']}"
                if item.get("triggers"):
                    text += f" Triggers: {', '.join(item['triggers'])}"
                metadata = {**base_metadata, "type": item.get("type", "unknown")}
                yield (text, metadata)
                return

            # Future considerations
            if "id" in item and "title" in item:
                text = f"{item.get('id', '')}: {item['title']}"
                if item.get("description"):
                    text += f" - {item['description']}"
                if item.get("rationale"):
                    text += f" Rationale: {item['rationale']}"
                metadata = {**base_metadata}
                if item.get("date_added"):
                    metadata["date"] = item["date_added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (text, metadata)
                return

            # System instructions - processes
            if "process" in item or "name" in item:
                parts = []
                if item.get("name"):
                    parts.append(item["name"])
                if item.get("description"):
                    parts.append(item["description"])
                if item.get("steps"):
                    parts.append("Steps: " + " ".join(item["steps"]))
                if parts:
                    yield (" - ".join(parts), {**base_metadata})
                return

            # Fallback: stringify the whole object
            text = json.dumps(item, indent=None)
            if len(text) > 50:  # Only index if substantial
                yield (text[:1000], {**base_metadata})  # Truncate very long items

        elif isinstance(item, str) and len(item) > 20:
            yield (item, {**base_metadata})

    # Process top-level structure
    if isinstance(data, list):
        for item in data:
            yield from process_item(item)
    elif isinstance(data, dict):
        # Handle nested arrays within objects
        for key, value in data.items():
            if isinstance(value, list):
                for item in value:
                    yield from process_item(item, context=key)
            elif isinstance(value, dict):
                yield from process_item(value, context=key)
            elif isinstance(value, str) and len(value) > 20:
                yield (f"{key}: {value}", {**base_metadata})


def find_json_files() -> list[Path]:
    """Find all JSON files in the state directory."""
    files = []
    for pattern in ["*.json", "**/*.json"]:
        files.extend(STATE_DIR.glob(pattern))
    return sorted(set(files))


def index_personal(quiet: bool = False, force: bool = False) -> dict:
    """
    Index all personal state files.

    Args:
        quiet: Suppress progress output
        force: Force reindex even if already exists

    Returns:
        Summary statistics
    """
    if not quiet:
        print(f"Indexing personal state from {STATE_DIR}")

    # Initialize model and client
    model = SentenceTransformer(MODEL_NAME)
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))

    # Delete and recreate collection for clean reindex
    # (NOTE: the collection is always rebuilt from scratch, so `force` currently has no extra effect)
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass

    collection = client.create_collection(
        name=COLLECTION_NAME,
        metadata={"description": "Personal state files from ~/.claude/state"}
    )
    # Find and process files
    files = find_json_files()
    if not quiet:
        print(f"Found {len(files)} JSON files")

    total_chunks = 0
    chunks = []
    metadatas = []
    ids = []

    for file_path in files:
        if not quiet:
            print(f" Processing: {file_path.relative_to(STATE_DIR)}")

        for chunk_text, metadata in chunk_json_file(file_path):
            # Skip empty or very short chunks
            if not chunk_text or len(chunk_text.strip()) < 10:
                continue

            chunk_id = f"personal_{total_chunks}"
            chunks.append(chunk_text)
            metadatas.append(metadata)
            ids.append(chunk_id)
            total_chunks += 1

    # Batch embed and add to collection
    if chunks:
        if not quiet:
            print(f"Embedding {len(chunks)} chunks...")

        embeddings = model.encode(chunks, show_progress_bar=not quiet).tolist()

        # Add in batches (ChromaDB has limits)
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            end_idx = min(i + batch_size, len(chunks))
            collection.add(
                documents=chunks[i:end_idx],
                embeddings=embeddings[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx]
            )

    stats = {
        "collection": COLLECTION_NAME,
        "files_processed": len(files),
        "chunks_indexed": total_chunks,
        "indexed_at": datetime.now().isoformat()
    }

    if not quiet:
        print(f"\nIndexed {total_chunks} chunks from {len(files)} files")

    return stats


def main():
    parser = argparse.ArgumentParser(
        description="Index personal state files for RAG search"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Suppress progress output"
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Force reindex even if already indexed"
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Output stats as JSON"
    )

    args = parser.parse_args()
    stats = index_personal(quiet=args.quiet, force=args.force)

    if args.stats:
        print(json.dumps(stats, indent=2))


if __name__ == "__main__":
    main()
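
After the indexer has run, the resulting collection can be sanity-checked directly. A minimal sketch reusing the constants above; `count()` and `peek()` are standard ChromaDB collection methods:

```python
from pathlib import Path

import chromadb

chroma_dir = Path.home() / ".claude" / "data" / "rag-search" / "chroma"
client = chromadb.PersistentClient(path=str(chroma_dir))
collection = client.get_collection("personal")

print(collection.count())               # total indexed chunks, e.g. 116 per the commit message
print(collection.peek(3)["documents"])  # a few stored chunk texts for spot-checking
```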