claude-code/skills/rag-search/scripts/index_personal.py
OpenCode Test · commit 7ca8caeecb: Implement rag-search skill for semantic search
Add new skill for semantic search across personal state files and
external documentation using ChromaDB and sentence-transformers.

Components:
- search.py: Main search interface (--index, --top-k flags)
- index_personal.py: Index ~/.claude/state files
- index_docs.py: Index external docs (git repos)
- add_doc_source.py: Manage doc sources
- test_rag.py: Test suite (5/5 passing)

Features:
- Two indexes: personal (116 chunks) and docs (k0s: 846 chunks)
- all-MiniLM-L6-v2 embeddings (384 dimensions)
- ChromaDB persistent storage
- JSON output with ranked results and metadata (see the sketch below)
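
A sketch of the intended search call, for reference (the positional query
argument and the result shape are assumptions; only the --index/--top-k
flags and JSON output are confirmed above):

    scripts/search.py "vector database decision" --index personal --top-k 3
    # -> JSON array of ranked chunks, each with its source-file metadata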

Documentation:
- Added to component-registry.json with triggers
- Added /rag command alias
- Updated skills/README.md
- Resolved fc-013 (vector database for agent memory)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed: 2026-01-04 23:41:38 -08:00

287 lines · 9.2 KiB · Python · Executable File

#!/usr/bin/env python3
"""
RAG Search - Personal Index Builder
Indexes ~/.claude/state files for semantic search.
Chunks JSON files by key for optimal retrieval.
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Generator
# Add venv site-packages to path
VENV_PATH = Path(__file__).parent.parent / "venv" / "lib" / "python3.13" / "site-packages"
if str(VENV_PATH) not in sys.path:
    sys.path.insert(0, str(VENV_PATH))

import chromadb
from sentence_transformers import SentenceTransformer

# Constants
STATE_DIR = Path.home() / ".claude" / "state"
DATA_DIR = Path.home() / ".claude" / "data" / "rag-search"
CHROMA_DIR = DATA_DIR / "chroma"
MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "personal"


def chunk_json_file(file_path: Path) -> Generator[tuple[str, dict], None, None]:
    """
    Chunk a JSON file into searchable segments.

    Strategy:
    - Arrays: Each item becomes a chunk
    - Objects with arrays: Each array item with parent context
    - Nested objects: Flatten with path prefix

    Yields:
        (chunk_text, metadata) tuples
    """
    try:
        with open(file_path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f" Warning: Could not parse {file_path}: {e}", file=sys.stderr)
        return

    rel_path = str(file_path.relative_to(STATE_DIR))
    base_metadata = {"file": rel_path}

    def process_item(item: dict | str, context: str = "") -> Generator[tuple[str, dict], None, None]:
        """Process a single item from JSON structure."""
        if isinstance(item, dict):
            # Check for common patterns in our state files

            # Memory items (decisions, preferences, facts, projects)
            if "content" in item:
                text_parts = []
                if context:
                    text_parts.append(f"[{context}]")
                text_parts.append(item.get("content", ""))
                if item.get("context"):
                    text_parts.append(f"Context: {item['context']}")
                if item.get("rationale"):
                    text_parts.append(f"Rationale: {item['rationale']}")
                metadata = {**base_metadata}
                if item.get("date"):
                    metadata["date"] = item["date"]
                if item.get("id"):
                    metadata["id"] = item["id"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (" ".join(text_parts), metadata)
                return

            # General instructions (memory)
            if "instruction" in item:
                text_parts = [item["instruction"]]
                metadata = {**base_metadata}
                if item.get("added"):
                    metadata["date"] = item["added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (" ".join(text_parts), metadata)
                return

            # Knowledge base entries
            if "fact" in item or "answer" in item:
                text = item.get("fact") or item.get("answer", "")
                if item.get("question"):
                    text = f"Q: {item['question']} A: {text}"
                metadata = {**base_metadata}
                if item.get("category"):
                    metadata["category"] = item["category"]
                yield (text, metadata)
                return

            # Component registry entries
            if "name" in item and "description" in item:
                text = f"{item['name']}: {item['description']}"
                if item.get("triggers"):
                    text += f" Triggers: {', '.join(item['triggers'])}"
                metadata = {**base_metadata, "type": item.get("type", "unknown")}
                yield (text, metadata)
                return

            # Future considerations
            if "id" in item and "title" in item:
                text = f"{item.get('id', '')}: {item['title']}"
                if item.get("description"):
                    text += f" - {item['description']}"
                if item.get("rationale"):
                    text += f" Rationale: {item['rationale']}"
                metadata = {**base_metadata}
                if item.get("date_added"):
                    metadata["date"] = item["date_added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (text, metadata)
                return

            # System instructions - processes
            if "process" in item or "name" in item:
                parts = []
                if item.get("name"):
                    parts.append(item["name"])
                if item.get("description"):
                    parts.append(item["description"])
                if item.get("steps"):
                    parts.append("Steps: " + " ".join(item["steps"]))
                if parts:
                    yield (" - ".join(parts), {**base_metadata})
                return

            # Fallback: stringify the whole object
            text = json.dumps(item, indent=None)
            if len(text) > 50:  # Only index if substantial
                yield (text[:1000], {**base_metadata})  # Truncate very long items
        elif isinstance(item, str) and len(item) > 20:
            yield (item, {**base_metadata})
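
    # Note: the branches in process_item are first-match-wins; an item with
    # both "content" and "name" keys is indexed as a memory item, never as a
    # component registry entry, because each branch returns after yielding.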
    # Process top-level structure
    if isinstance(data, list):
        for item in data:
            yield from process_item(item)
    elif isinstance(data, dict):
        # Handle nested arrays within objects
        for key, value in data.items():
            if isinstance(value, list):
                for item in value:
                    yield from process_item(item, context=key)
            elif isinstance(value, dict):
                yield from process_item(value, context=key)
            elif isinstance(value, str) and len(value) > 20:
                yield (f"{key}: {value}", {**base_metadata})


def find_json_files() -> list[Path]:
    """Find all JSON files in the state directory."""
    files = []
    # Note: "**/*.json" already matches top-level files too; the set() below
    # dedupes the overlap with the plain "*.json" pattern.
    for pattern in ["*.json", "**/*.json"]:
        files.extend(STATE_DIR.glob(pattern))
    return sorted(set(files))


def index_personal(quiet: bool = False, force: bool = False) -> dict:
    """
    Index all personal state files.

    Args:
        quiet: Suppress progress output
        force: Accepted for interface compatibility; currently unused, as the
            collection is always deleted and rebuilt below

    Returns:
        Summary statistics
    """
    if not quiet:
        print(f"Indexing personal state from {STATE_DIR}")

    # Initialize model and client
    model = SentenceTransformer(MODEL_NAME)
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))

    # Delete and recreate collection for clean reindex
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = client.create_collection(
        name=COLLECTION_NAME,
        metadata={"description": "Personal state files from ~/.claude/state"}
    )

    # Find and process files
    files = find_json_files()
    if not quiet:
        print(f"Found {len(files)} JSON files")

    total_chunks = 0
    chunks = []
    metadatas = []
    ids = []
    for file_path in files:
        if not quiet:
            print(f" Processing: {file_path.relative_to(STATE_DIR)}")
        for chunk_text, metadata in chunk_json_file(file_path):
            # Skip empty or very short chunks
            if not chunk_text or len(chunk_text.strip()) < 10:
                continue
            chunk_id = f"personal_{total_chunks}"
            chunks.append(chunk_text)
            metadatas.append(metadata)
            ids.append(chunk_id)
            total_chunks += 1

    # Batch embed and add to collection
    if chunks:
        if not quiet:
            print(f"Embedding {len(chunks)} chunks...")
        embeddings = model.encode(chunks, show_progress_bar=not quiet).tolist()

        # Add in batches (ChromaDB has limits)
        batch_size = 100
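        # For example, the 116 personal chunks cited in the commit message
        # would be added in two calls: items [0:100] and [100:116].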
        for i in range(0, len(chunks), batch_size):
            end_idx = min(i + batch_size, len(chunks))
            collection.add(
                documents=chunks[i:end_idx],
                embeddings=embeddings[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx]
            )

    stats = {
        "collection": COLLECTION_NAME,
        "files_processed": len(files),
        "chunks_indexed": total_chunks,
        "indexed_at": datetime.now().isoformat()
    }
    if not quiet:
        print(f"\nIndexed {total_chunks} chunks from {len(files)} files")
    return stats
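

# Illustration only, not called anywhere in this script: a minimal sketch of
# how search.py might query the collection built above, assuming the same
# model and storage paths. The function name and result shape here are
# hypothetical; the chromadb and sentence-transformers calls are real APIs.
def _example_query(query: str, top_k: int = 5) -> list[dict]:
    model = SentenceTransformer(MODEL_NAME)
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    collection = client.get_collection(COLLECTION_NAME)
    result = collection.query(
        query_embeddings=model.encode([query]).tolist(),
        n_results=top_k,
    )
    # ChromaDB returns parallel lists (one inner list per query);
    # zip them into ranked records.
    return [
        {"text": doc, "metadata": meta, "distance": dist}
        for doc, meta, dist in zip(
            result["documents"][0], result["metadatas"][0], result["distances"][0]
        )
    ]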


def main():
    parser = argparse.ArgumentParser(
        description="Index personal state files for RAG search"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Suppress progress output"
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Force reindex even if already indexed"
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Output stats as JSON"
    )
    args = parser.parse_args()

    stats = index_personal(quiet=args.quiet, force=args.force)
    if args.stats:
        print(json.dumps(stats, indent=2))


if __name__ == "__main__":
    main()
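
# Example run (the file count below is hypothetical; the 116-chunk figure
# comes from the commit message, and the JSON keys match the stats dict
# built in index_personal):
#
#   ./index_personal.py --stats
#   ...
#   Indexed 116 chunks from 12 files
#   {
#     "collection": "personal",
#     "files_processed": 12,
#     "chunks_indexed": 116,
#     "indexed_at": "2026-01-04T23:41:38"
#   }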