feat(swarm): add Obsidian vault reindex endpoint + update handoff
- obsidian-reindex-server.py: HTTP endpoint on port 18810 for triggering incremental Obsidian vault reindex from n8n
- Updated n8n Implementation Handoff: Obsidian Semantic Index section, new reindex workflow, updated verification commands
This commit is contained in:
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Obsidian Vault Reindex Endpoint
|
||||
Lightweight HTTP server that triggers an incremental Obsidian vault reindex.
|
||||
|
||||
Listens on 0.0.0.0:18810 (configurable via PORT env var).
|
||||
Called by n8n webhooks or systemd timers.
|
||||
|
||||
Endpoints:
|
||||
POST /reindex -> trigger incremental reindex, returns JSON stats
|
||||
GET /reindex/status -> check last index state
|
||||
GET /healthz -> returns ok
|
||||
"""
|
||||
|
||||
import http.server
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
# TCP port the server binds to; the PORT environment variable overrides it.
PORT = int(os.environ.get("PORT", 18810))

# Base directory shared by the script and state-file paths below.
_HERMES_DIR = Path.home() / ".hermes"

# Script executed for each reindex request (kept as a plain string because
# it is passed straight to subprocess as the command).
REINDEX_SCRIPT = str(
    _HERMES_DIR
    / "skills"
    / "note-taking"
    / "rag-search"
    / "scripts"
    / "reindex_obsidian.sh"
)

# JSON file read by the status endpoint to report the last index state.
STATE_FILE = _HERMES_DIR / "data" / "rag-search" / "obsidian_index_state.json"

# Guards against two reindex runs overlapping inside this process.
_reindex_lock = threading.Lock()
|
||||
|
||||
|
||||
def run_reindex(timeout: int = 600) -> dict:
    """Run the incremental reindex script and return its result.

    Executes ``REINDEX_SCRIPT`` as a subprocess and parses its stdout as a
    JSON object. Only one run is allowed at a time within this process: a
    call made while another run holds the lock returns immediately rather
    than waiting.

    Args:
        timeout: Maximum number of seconds to wait for the script before
            giving up (default 600, i.e. 10 minutes).

    Returns:
        The JSON object printed by the script on success. On failure, a dict
        with an ``"error"`` key describing what went wrong (already running,
        non-zero exit, unparsable output, timeout, or any other exception).
        This function never raises for those conditions.
    """
    # Non-blocking acquire: callers get an immediate "locked" answer instead
    # of piling up behind a long-running reindex.
    if not _reindex_lock.acquire(blocking=False):
        return {"error": "reindex already in progress", "status": "locked"}

    try:
        result = subprocess.run(
            [REINDEX_SCRIPT],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if result.returncode != 0:
            return {
                "error": "reindex failed",
                "exit_code": result.returncode,
                # Truncated so a noisy script cannot bloat the HTTP response.
                "stderr": result.stderr.strip()[:500],
            }
        try:
            stats = json.loads(result.stdout)
        except json.JSONDecodeError:
            stats = None
        # Callers rely on a dict (they test for the "error" key), so valid
        # JSON that is not an object is reported as invalid output too.
        if not isinstance(stats, dict):
            return {
                "error": "invalid json output",
                "stdout": result.stdout.strip()[:500],
            }
        return stats
    except subprocess.TimeoutExpired:
        return {"error": f"reindex timed out ({timeout}s)"}
    except Exception as e:
        # Deliberately broad: this is the endpoint boundary, and failures
        # such as a missing or non-executable script must come back as a
        # JSON error rather than crash the request handler.
        return {"error": str(e)}
    finally:
        _reindex_lock.release()
|
||||
|
||||
|
||||
def get_status() -> dict:
    """Read the last index state file.

    Returns:
        The parsed JSON content of ``STATE_FILE``. If the file does not
        exist, ``{"indexed": False, "message": "no state file"}``. If it
        cannot be read or decoded, a dict with an ``"error"`` key holding
        the exception message.
    """
    # Read first and handle the failure, rather than checking exists() and
    # then reading: the file may be replaced or removed between the two.
    try:
        raw = STATE_FILE.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return {"indexed": False, "message": "no state file"}
    except (OSError, UnicodeDecodeError) as e:
        return {"error": str(e)}

    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        return {"error": str(e)}
|
||||
|
||||
|
||||
class ReindexHandler(http.server.BaseHTTPRequestHandler):
    """Request handler for the reindex HTTP endpoint.

    Routes:
        GET  /healthz         -> ``{"status": "ok"}``
        GET  /reindex/status  -> result of ``get_status()``
        POST /reindex         -> result of ``run_reindex()``

    Any other path gets a 404 JSON error. Trailing slashes and query strings
    are ignored when matching routes. All responses are JSON.
    """

    def _route(self) -> str:
        """Return the request path without query string or trailing slash."""
        # self.path is the raw request target, so "/healthz?probe=1" must be
        # cut at "?" before it can be compared against a route.
        return self.path.split("?", 1)[0].rstrip("/")

    def do_GET(self):
        """Serve the health check and index status routes."""
        route = self._route()
        if route == "/healthz":
            self._json_response({"status": "ok"})
        elif route == "/reindex/status":
            self._json_response(get_status())
        else:
            self._json_response({"error": "not found"}, status=404)

    def do_POST(self):
        """Trigger a reindex and report its outcome.

        Responds 200 on success, 409 if a reindex is already running, and
        500 for any other error result.
        """
        if self._route() != "/reindex":
            self._json_response({"error": "not found"}, status=404)
            return

        # Runs synchronously: the response is only sent once the reindex
        # script has finished (or failed), so the caller gets the result.
        result = run_reindex()
        is_dict = isinstance(result, dict)
        if is_dict and "error" in result and result.get("status") == "locked":
            # Another run is in flight; a conflict, not a server fault.
            status = 409
        elif is_dict and "error" in result:
            status = 500
        else:
            status = 200
        self._json_response(result, status=status)

    def _json_response(self, data, status=200):
        """Send ``data`` as a pretty-printed JSON body with the given status."""
        body = json.dumps(data, indent=2).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Suppress the base class's per-request log output."""
        # Minimal logging
        pass
|
||||
|
||||
|
||||
def main():
    """Start the reindex HTTP server and serve until interrupted.

    Binds to all interfaces on ``PORT`` and handles each request in its own
    thread, so health and status checks stay responsive while a reindex is
    running and an overlapping POST gets the "already in progress" answer
    instead of waiting behind the first one.
    """
    # Bound to 0.0.0.0 (not 127.0.0.1) so the n8n container can reach the
    # endpoint through the Docker bridge gateway.
    # NOTE(review): the endpoint has no authentication; confirm the host
    # firewall limits who can reach this port.
    server = http.server.ThreadingHTTPServer(("0.0.0.0", PORT), ReindexHandler)
    print(f"obsidian-reindex-server listening on 0.0.0.0:{PORT}", flush=True)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Release the listening socket however serve_forever() exits.
        server.server_close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+82
-30
@@ -19,11 +19,13 @@ Last verified on 2026-05-13 (evening):
|
||||
- Container: `n8n-agent` running and healthy.
|
||||
- Health endpoint: `GET /healthz` returns `{"status":"ok"}`.
|
||||
- Workflow export: `n8n export:workflow --all` succeeds.
|
||||
- Active workflows: 12.
|
||||
- Active workflows: 13.
|
||||
- Inactive workflows: 1 (Nightly Obsidian Vault Sync replaced by Evening Digest).
|
||||
- Archived workflows: 2 unrecoverable duplicate IMAP workflows archived after SQLite recovery.
|
||||
- Docker health endpoint: `GET :18809/health` returns container state for 7 services.
|
||||
- Systemd user service `docker-health-endpoint.service` active and enabled.
|
||||
- Obsidian reindex endpoint: `POST :18810/reindex` triggers incremental vault reindex.
|
||||
- Systemd user service `obsidian-reindex-endpoint.service` active and enabled.
|
||||
|
||||
## Implemented and active
|
||||
|
||||
@@ -207,6 +209,74 @@ Last verified on 2026-05-13 (evening):
|
||||
- `run_health_check`
|
||||
- `process_voice_memo`
|
||||
|
||||
### Obsidian Vault Reindex
|
||||
|
||||
- Workflow ID: `85ntyyphDJ4Ms2b4`
|
||||
- Status: active
|
||||
- Trigger: every 6 hours
|
||||
- Current behavior:
|
||||
- n8n schedule trigger calls `POST http://172.19.0.1:18810/reindex`.
|
||||
- Host-side `obsidian-reindex-server.py` on port `18810` runs the incremental Obsidian vault indexer.
|
||||
- Systemd user service `obsidian-reindex-endpoint.service`.
|
||||
|
||||
## Obsidian Semantic Index
|
||||
|
||||
Implemented 2026-05-13.
|
||||
|
||||
### Architecture
|
||||
|
||||
- **Vector store**: Hermes rag-search ChromaDB embedded at `~/.hermes/data/rag-search/chroma/` in the `obsidian` collection.
|
||||
- **Embeddings**: Ollama `nomic-embed-text` on port `18807` (768-dim vectors).
|
||||
- **Indexer**: `~/.hermes/skills/note-taking/rag-search/scripts/index_obsidian.py`
|
||||
- **Chunking**: Markdown files are split by heading sections; long sections get sliding-window chunks (max 2000 chars, 200 char overlap). YAML frontmatter is extracted and stored as metadata.
|
||||
- **Search**: `~/.hermes/skills/note-taking/rag-search/scripts/search.py --index obsidian "query"`
|
||||
- **Cross-collection search**: `search.py "query"` now searches all three collections (`personal`, `docs`, `obsidian`) using the appropriate embedding backend per collection.
|
||||
|
||||
### Index stats (2026-05-13)
|
||||
|
||||
- 36 markdown files indexed
|
||||
- 231 chunks
|
||||
- Embedding model: `nomic-embed-text` via Ollama
|
||||
- Full index time: ~5 minutes (Ollama CPU inference at ~1.2s/text, batch=10)
|
||||
- Incremental reindex (no changes): ~1.4 seconds
|
||||
|
||||
### Incremental updates
|
||||
|
||||
- File content SHA-256 hashes tracked in `~/.hermes/data/rag-search/obsidian_index_state.json`.
|
||||
- Only changed files are re-indexed on subsequent runs.
|
||||
- Deleted files have their chunks removed from ChromaDB.
|
||||
|
||||
### Automated reindex
|
||||
|
||||
- n8n workflow `Obsidian Vault Reindex` (`85ntyyphDJ4Ms2b4`) triggers every 6 hours.
|
||||
- Calls `POST http://172.19.0.1:18810/reindex` (host-side endpoint).
|
||||
- Host endpoint: `~/lab/swarm/scripts/obsidian-reindex-server.py` on port `18810`.
|
||||
- Systemd service: `obsidian-reindex-endpoint.service` (enabled).
|
||||
- Manual trigger: `curl -X POST http://127.0.0.1:18810/reindex`
|
||||
|
||||
### Verification commands
|
||||
|
||||
```bash
|
||||
# Check index state
|
||||
curl -fsS http://127.0.0.1:18810/reindex/status | python3 -m json.tool
|
||||
|
||||
# Trigger manual reindex
|
||||
curl -X POST http://127.0.0.1:18810/reindex | python3 -m json.tool
|
||||
|
||||
# Search the Obsidian index
|
||||
~/.hermes/skills/note-taking/rag-search/venv/bin/python \
|
||||
~/.hermes/skills/note-taking/rag-search/scripts/search.py --index obsidian "health monitoring"
|
||||
|
||||
# Check ChromaDB data
|
||||
du -sh ~/.hermes/data/rag-search/chroma/
|
||||
|
||||
# Check systemd service
|
||||
systemctl --user status obsidian-reindex-endpoint.service
|
||||
|
||||
# Verify from inside n8n container
|
||||
docker exec n8n-agent wget -qO- http://172.19.0.1:18810/healthz
|
||||
```
|
||||
|
||||
## Not yet implemented
|
||||
|
||||
### Weekly review
|
||||
@@ -226,27 +296,6 @@ Recommended implementation:
|
||||
3. Use Atlas/Hermes or cloud model for final synthesis.
|
||||
4. Write `Notes/YYYY-MM-DD Weekly Review.md`.
|
||||
|
||||
### Obsidian Semantic Index
|
||||
|
||||
Desired scope:
|
||||
|
||||
- Watch vault changes.
|
||||
- Chunk changed notes.
|
||||
- Embed with Ollama on `18807` using `nomic-embed-text`.
|
||||
- Store vectors locally.
|
||||
- Enable semantic search / RAG for Atlas.
|
||||
|
||||
Recommended implementation options:
|
||||
|
||||
1. Prefer Hermes `rag-search`/local ChromaDB if already available and stable.
|
||||
2. If n8n owns the trigger, have n8n call a local indexing webhook/script rather than implementing vector DB logic entirely in n8n.
|
||||
3. Use file-change polling if native file watch is unreliable in Docker/virtiofs.
|
||||
|
||||
Open questions:
|
||||
|
||||
- Which vector store should be canonical: Hermes rag-search ChromaDB, a separate Chroma instance, SQLite vector extension, or another local store?
|
||||
- Should n8n trigger indexing, or should Atlas/Hermes own indexing and n8n only notify?
|
||||
|
||||
### Personal data routing
|
||||
|
||||
Desired scope:
|
||||
@@ -277,11 +326,7 @@ Recommended implementation:
|
||||
4. ~~Extend Evening Digest.~~ Done 2026-05-13: workflow `PlZywwqL8MRNEAN6`, daily 21:00 PT.
|
||||
5. ~~Add Discord delivery to n8n Failure Digest.~~ Done 2026-05-13.
|
||||
6. ~~Fix stale container URLs in IMAP workflow.~~ Done 2026-05-13.
|
||||
|
||||
7. Implement Obsidian Semantic Index.
|
||||
- Decide canonical vector store first.
|
||||
- Use Ollama embeddings on `18807`.
|
||||
- Add incremental update path.
|
||||
7. ~~Implement Obsidian Semantic Index.~~ Done 2026-05-13: ChromaDB `obsidian` collection, Ollama nomic-embed-text, automated reindex every 6h.
|
||||
|
||||
8. Upgrade Web-to-Notes Capture.
|
||||
- Add PDF and YouTube transcript support.
|
||||
@@ -292,8 +337,8 @@ Recommended implementation:
|
||||
- Add optional Kokoro audio summary.
|
||||
|
||||
10. Define webhook action bus catalog.
|
||||
- Document stable endpoints and schemas.
|
||||
- Add `process_url`, `summarize_pdf`, `add_reminder`, `sync_vault`, `run_health_check`.
|
||||
- Document stable endpoints and schemas.
|
||||
- Add `process_url`, `summarize_pdf`, `add_reminder`, `sync_vault`, `run_health_check`.
|
||||
|
||||
## Verification commands
|
||||
|
||||
@@ -311,8 +356,13 @@ docker exec n8n-agent n8n export:workflow --all --output=/tmp/workflows-verify.j
|
||||
# Docker health endpoint (host-side systemd service)
|
||||
curl -fsS --max-time 3 http://127.0.0.1:18809/health | python3 -m json.tool
|
||||
|
||||
# Obsidian reindex endpoint
|
||||
curl -fsS http://127.0.0.1:18810/healthz
|
||||
curl -fsS http://127.0.0.1:18810/reindex/status | python3 -m json.tool
|
||||
|
||||
# Verify from inside n8n container
|
||||
docker exec n8n-agent wget -qO- http://172.19.0.1:18809/health
|
||||
docker exec n8n-agent wget -qO- http://172.19.0.1:18810/healthz
|
||||
```
|
||||
|
||||
### n8n Public API access
|
||||
@@ -358,4 +408,6 @@ docker logs n8n-agent --tail 120
|
||||
- `N8N_API_KEY` in `~/lab/swarm/.env` is stale and returns 401. Get the working key from n8n credential `UPAHgUJVRqZQceL4` (see Verification commands).
|
||||
- Do not commit DB backups, workflow execution history, secrets, or runtime state.
|
||||
- The Google OAuth credential (`wpcf2epDDCT57Y5x`) cannot refresh (`invalid_client`). Gmail workflows use IMAP fallback instead.
|
||||
- The Docker health endpoint (`18809`) must bind to `0.0.0.0` (not `127.0.0.1`) so the n8n container can reach it via the Docker bridge gateway.
|
||||
- The Docker health endpoint (`18809`) and reindex endpoint (`18810`) must bind to `0.0.0.0` (not `127.0.0.1`) so the n8n container can reach them via the Docker bridge gateway.
|
||||
- The `obsidian` ChromaDB collection uses Ollama `nomic-embed-text` embeddings, while `personal` and `docs` use Sentence Transformers `all-MiniLM-L6-v2`. Similarity scores produced by the two embedding backends are not directly comparable.
|
||||
- Ollama embedding on CPU is ~1.2s per text. Full vault reindex takes ~5 minutes for 231 chunks. Incremental (no changes) takes ~1.4 seconds.
|
||||
|
||||
Reference in New Issue
Block a user