Implement rag-search skill for semantic search
Add a new skill for semantic search across personal state files and
external documentation, using ChromaDB and sentence-transformers.

Components:
- search.py: Main search interface (--index, --top-k flags)
- index_personal.py: Index ~/.claude/state files
- index_docs.py: Index external docs (git repos)
- add_doc_source.py: Manage doc sources
- test_rag.py: Test suite (5/5 passing)

Features:
- Two indexes: personal (116 chunks) and docs (k0s: 846 chunks)
- all-MiniLM-L6-v2 embeddings (384 dimensions)
- ChromaDB persistent storage
- JSON output with ranked results and metadata

Documentation:
- Added to component-registry.json with triggers
- Added /rag command alias
- Updated skills/README.md
- Resolved fc-013 (vector database for agent memory)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
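Based on the flags and output format listed above, a typical search invocation would plausibly look like this (the positional query argument and exact result fields are assumptions; search.py itself is not part of this diff):

    python skills/rag-search/scripts/search.py --index personal --top-k 5 "vector database"
    # expected: JSON with ranked results, each carrying chunk text and metadata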
skills/rag-search/scripts/index_personal.py (286 lines, new executable file)
@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
RAG Search - Personal Index Builder

Indexes ~/.claude/state files for semantic search.
Chunks JSON files by key for optimal retrieval.
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Generator

# Add venv site-packages to path
VENV_PATH = Path(__file__).parent.parent / "venv" / "lib" / "python3.13" / "site-packages"
if str(VENV_PATH) not in sys.path:
    sys.path.insert(0, str(VENV_PATH))
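# NOTE: the site-packages path above is pinned to python3.13; it must be
# updated if the venv is rebuilt with a different interpreter version.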

import chromadb
from sentence_transformers import SentenceTransformer

# Constants
STATE_DIR = Path.home() / ".claude" / "state"
DATA_DIR = Path.home() / ".claude" / "data" / "rag-search"
CHROMA_DIR = DATA_DIR / "chroma"
MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "personal"


def chunk_json_file(file_path: Path) -> Generator[tuple[str, dict], None, None]:
    """
    Chunk a JSON file into searchable segments.

    Strategy:
    - Arrays: Each item becomes a chunk
    - Objects with arrays: Each array item with parent context
    - Nested objects: Flatten with path prefix

    Yields:
        (chunk_text, metadata) tuples
    """
    try:
        with open(file_path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f"  Warning: Could not parse {file_path}: {e}", file=sys.stderr)
        return

    rel_path = str(file_path.relative_to(STATE_DIR))
    base_metadata = {"file": rel_path}

    def process_item(item: dict, context: str = "") -> Generator[tuple[str, dict], None, None]:
        """Process a single item from JSON structure."""
        if isinstance(item, dict):
            # Check for common patterns in our state files

            # Memory items (decisions, preferences, facts, projects)
            if "content" in item:
                text_parts = []
                if context:
                    text_parts.append(f"[{context}]")
                text_parts.append(item.get("content", ""))
                if item.get("context"):
                    text_parts.append(f"Context: {item['context']}")
                if item.get("rationale"):
                    text_parts.append(f"Rationale: {item['rationale']}")

                metadata = {**base_metadata}
                if item.get("date"):
                    metadata["date"] = item["date"]
                if item.get("id"):
                    metadata["id"] = item["id"]
                if item.get("status"):
                    metadata["status"] = item["status"]

                yield (" ".join(text_parts), metadata)
                return

            # General instructions (memory)
            if "instruction" in item:
                text_parts = [item["instruction"]]
                metadata = {**base_metadata}
                if item.get("added"):
                    metadata["date"] = item["added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (" ".join(text_parts), metadata)
                return

            # Knowledge base entries
            if "fact" in item or "answer" in item:
                text = item.get("fact") or item.get("answer", "")
                if item.get("question"):
                    text = f"Q: {item['question']} A: {text}"
                metadata = {**base_metadata}
                if item.get("category"):
                    metadata["category"] = item["category"]
                yield (text, metadata)
                return

            # Component registry entries
            if "name" in item and "description" in item:
                text = f"{item['name']}: {item['description']}"
                if item.get("triggers"):
                    text += f" Triggers: {', '.join(item['triggers'])}"
                metadata = {**base_metadata, "type": item.get("type", "unknown")}
                yield (text, metadata)
                return

            # Future considerations
            if "id" in item and "title" in item:
                text = f"{item.get('id', '')}: {item['title']}"
                if item.get("description"):
                    text += f" - {item['description']}"
                if item.get("rationale"):
                    text += f" Rationale: {item['rationale']}"
                metadata = {**base_metadata}
                if item.get("date_added"):
                    metadata["date"] = item["date_added"]
                if item.get("status"):
                    metadata["status"] = item["status"]
                yield (text, metadata)
                return

            # System instructions - processes
            if "process" in item or "name" in item:
                parts = []
                if item.get("name"):
                    parts.append(item["name"])
                if item.get("description"):
                    parts.append(item["description"])
                if item.get("steps"):
                    parts.append("Steps: " + " ".join(item["steps"]))
                if parts:
                    yield (" - ".join(parts), {**base_metadata})
                return

            # Fallback: stringify the whole object
            text = json.dumps(item, indent=None)
            if len(text) > 50:  # Only index if substantial
                yield (text[:1000], {**base_metadata})  # Truncate very long items

        elif isinstance(item, str) and len(item) > 20:
            yield (item, {**base_metadata})

    # Process top-level structure
    if isinstance(data, list):
        for item in data:
            yield from process_item(item)
    elif isinstance(data, dict):
        # Handle nested arrays within objects
        for key, value in data.items():
            if isinstance(value, list):
                for item in value:
                    yield from process_item(item, context=key)
            elif isinstance(value, dict):
                yield from process_item(value, context=key)
            elif isinstance(value, str) and len(value) > 20:
                yield (f"{key}: {value}", {**base_metadata})


def find_json_files() -> list[Path]:
    """Find all JSON files in the state directory."""
    files = []
    # "**/*.json" already matches the top level in pathlib, so the two
    # patterns overlap; sorted(set(...)) below removes the duplicates.
    for pattern in ["*.json", "**/*.json"]:
        files.extend(STATE_DIR.glob(pattern))
    return sorted(set(files))


def index_personal(quiet: bool = False, force: bool = False) -> dict:
    """
    Index all personal state files.

    Args:
        quiet: Suppress progress output
        force: Force reindex even if already exists (currently unused; the
            collection is always rebuilt from scratch)

    Returns:
        Summary statistics
    """
    if not quiet:
        print(f"Indexing personal state from {STATE_DIR}")

    # Initialize model and client
    model = SentenceTransformer(MODEL_NAME)
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))

    # Delete and recreate collection for clean reindex
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass

    collection = client.create_collection(
        name=COLLECTION_NAME,
        metadata={"description": "Personal state files from ~/.claude/state"}
    )

    # Find and process files
    files = find_json_files()
    if not quiet:
        print(f"Found {len(files)} JSON files")

    total_chunks = 0
    chunks = []
    metadatas = []
    ids = []

    for file_path in files:
        if not quiet:
            print(f"  Processing: {file_path.relative_to(STATE_DIR)}")

        for chunk_text, metadata in chunk_json_file(file_path):
            # Skip empty or very short chunks
            if not chunk_text or len(chunk_text.strip()) < 10:
                continue

            chunk_id = f"personal_{total_chunks}"
            chunks.append(chunk_text)
            metadatas.append(metadata)
            ids.append(chunk_id)
            total_chunks += 1

    # Batch embed and add to collection
    if chunks:
        if not quiet:
            print(f"Embedding {len(chunks)} chunks...")

        embeddings = model.encode(chunks, show_progress_bar=not quiet).tolist()

        # Add in batches (ChromaDB has limits)
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            end_idx = min(i + batch_size, len(chunks))
            collection.add(
                documents=chunks[i:end_idx],
                embeddings=embeddings[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx]
            )

    stats = {
        "collection": COLLECTION_NAME,
        "files_processed": len(files),
        "chunks_indexed": total_chunks,
        "indexed_at": datetime.now().isoformat()
    }
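    # Example stats (illustrative values; the chunk count matches the commit message):
    #   {"collection": "personal", "files_processed": 12,
    #    "chunks_indexed": 116, "indexed_at": "2025-11-30T12:00:00"}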

    if not quiet:
        print(f"\nIndexed {total_chunks} chunks from {len(files)} files")

    return stats


def main():
    parser = argparse.ArgumentParser(
        description="Index personal state files for RAG search"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Suppress progress output"
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Force reindex even if already indexed"
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Output stats as JSON"
    )

    args = parser.parse_args()
    stats = index_personal(quiet=args.quiet, force=args.force)

    if args.stats:
        print(json.dumps(stats, indent=2))


if __name__ == "__main__":
    main()
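For context, a minimal sketch of how a query against the resulting collection might look. search.py is not part of this diff, so this is an assumption built only from the constants above and the public ChromaDB and sentence-transformers APIs:

    from pathlib import Path

    import chromadb
    from sentence_transformers import SentenceTransformer

    # Mirror the constants defined in index_personal.py.
    chroma_dir = Path.home() / ".claude" / "data" / "rag-search" / "chroma"
    model = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=str(chroma_dir))
    collection = client.get_collection("personal")

    # Embed the query with the same model used at index time, then fetch
    # the top matches (n_results plays the role of the --top-k flag).
    hits = collection.query(
        query_embeddings=model.encode(["vector database for agent memory"]).tolist(),
        n_results=5,
    )
    for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
        print(meta.get("file"), doc[:80])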