diff --git a/scripts/obsidian-reindex-server.py b/scripts/obsidian-reindex-server.py
new file mode 100644
index 0000000..29d10a8
--- /dev/null
+++ b/scripts/obsidian-reindex-server.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+"""
+Obsidian Vault Reindex Endpoint
+Lightweight HTTP server that triggers an incremental Obsidian vault reindex.
+
+Listens on 0.0.0.0:18810 (configurable via BIND_HOST and PORT env vars).
+Called by n8n webhooks or systemd timers.
+
+Endpoints:
+  POST /reindex         -> trigger incremental reindex, returns JSON stats
+                           (409 if a reindex is already in progress)
+  GET  /reindex/status  -> check last index state
+  GET  /healthz         -> returns ok
+"""
+
+import http.server
+import json
+import os
+import subprocess
+import sys
+import threading
+from pathlib import Path
+
+# Bind address; 0.0.0.0 lets the n8n container reach the server via the
+# Docker bridge gateway.
+BIND_HOST = os.environ.get("BIND_HOST", "0.0.0.0")
+PORT = int(os.environ.get("PORT", 18810))
+
+REINDEX_SCRIPT = str(
+    Path.home()
+    / ".hermes/skills/note-taking/rag-search/scripts/reindex_obsidian.sh"
+)
+STATE_FILE = (
+    Path.home() / ".hermes/data/rag-search/obsidian_index_state.json"
+)
+
+# Upper bound for one reindex run, in seconds (10 min max for full reindex).
+REINDEX_TIMEOUT_S = 600
+
+# Cap on how much of an (unused) POST body is read and discarded.
+MAX_DISCARDED_BODY = 64 * 1024
+
+# Lock to prevent concurrent reindexing
+_reindex_lock = threading.Lock()
+
+
+def run_reindex() -> dict:
+    """Run the incremental reindex script and return its stats.
+
+    Only one run is allowed at a time. If another request already holds the
+    lock, return immediately with ``"status": "locked"`` rather than waiting.
+
+    Returns:
+        Whatever JSON the script printed on stdout when it succeeds;
+        otherwise a dict with an ``"error"`` key (lock held, non-zero exit,
+        unparseable output, timeout, or failure to start the script).
+    """
+    if not _reindex_lock.acquire(blocking=False):
+        return {"error": "reindex already in progress", "status": "locked"}
+
+    try:
+        result = subprocess.run(
+            [REINDEX_SCRIPT],
+            capture_output=True,
+            text=True,
+            timeout=REINDEX_TIMEOUT_S,
+        )
+    except subprocess.TimeoutExpired:
+        return {"error": f"reindex timed out ({REINDEX_TIMEOUT_S}s)"}
+    except Exception as e:
+        # Boundary catch (e.g. script missing or not executable): always
+        # report the failure as JSON instead of crashing the handler.
+        return {"error": str(e)}
+    finally:
+        _reindex_lock.release()
+
+    if result.returncode != 0:
+        return {
+            "error": "reindex failed",
+            "exit_code": result.returncode,
+            "stderr": result.stderr.strip()[:500],
+        }
+    try:
+        return json.loads(result.stdout)
+    except json.JSONDecodeError:
+        return {
+            "error": "invalid json output",
+            "stdout": result.stdout.strip()[:500],
+        }
+
+
+def get_status() -> dict:
+    """Read the last index state file.
+
+    Returns:
+        The parsed state file, ``{"indexed": False, ...}`` when the file does
+        not exist, or a dict with an ``"error"`` key when it cannot be read
+        or parsed.
+    """
+    try:
+        raw = STATE_FILE.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        return {"indexed": False, "message": "no state file"}
+    except (OSError, UnicodeDecodeError) as e:
+        return {"error": str(e)}
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError as e:
+        return {"error": str(e)}
+
+
+def _http_status_for(result) -> int:
+    """Map a run_reindex() result to the HTTP status of the response."""
+    if not isinstance(result, dict) or "error" not in result:
+        return 200
+    if result.get("status") == "locked":
+        return 409  # Conflict: a reindex is already in progress
+    return 500
+
+
+class ReindexHandler(http.server.BaseHTTPRequestHandler):
+    """Serve the health, status and reindex-trigger endpoints as JSON."""
+
+    def do_GET(self):
+        path = self.path.rstrip("/")
+        if path == "/healthz":
+            self._json_response({"status": "ok"})
+        elif path == "/reindex/status":
+            self._json_response(get_status())
+        else:
+            self._json_response({"error": "not found"}, status=404)
+
+    def do_POST(self):
+        # The body is unused, but closing a connection with unread request
+        # data can reset it before the client has read the response.
+        self._discard_request_body()
+        path = self.path.rstrip("/")
+        if path == "/reindex":
+            # Runs synchronously in this request's thread: the response is
+            # sent only once the reindex has finished (or failed).
+            result = run_reindex()
+            self._json_response(result, status=_http_status_for(result))
+        else:
+            self._json_response({"error": "not found"}, status=404)
+
+    def _discard_request_body(self):
+        """Read and drop the body announced by Content-Length, if any."""
+        try:
+            length = int(self.headers.get("Content-Length") or 0)
+        except ValueError:
+            return
+        if length > 0:
+            self.rfile.read(min(length, MAX_DISCARDED_BODY))
+
+    def _json_response(self, data, status=200):
+        """Send *data* as an indented JSON body with the given status."""
+        body = json.dumps(data, indent=2).encode()
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):
+        # Per-request access logging is deliberately silenced.
+        pass
+
+
+def main():
+    """Serve requests until interrupted, then close the listening socket."""
+    address = (BIND_HOST, PORT)
+    # One thread per request: /healthz and /reindex/status stay responsive
+    # during a long reindex, and overlapping POSTs reach the lock check.
+    with http.server.ThreadingHTTPServer(address, ReindexHandler) as server:
+        print(
+            f"obsidian-reindex-server listening on {BIND_HOST}:{PORT}",
+            flush=True,
+        )
+        try:
+            server.serve_forever()
+        except KeyboardInterrupt:
+            pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/swarm-common/obsidian-vault/will/will-shared-zap/Infrastructure/Automation/n8n Implementation Handoff.md b/swarm-common/obsidian-vault/will/will-shared-zap/Infrastructure/Automation/n8n Implementation Handoff.md index 9dad762..b831355 100644 --- a/swarm-common/obsidian-vault/will/will-shared-zap/Infrastructure/Automation/n8n Implementation Handoff.md +++ b/swarm-common/obsidian-vault/will/will-shared-zap/Infrastructure/Automation/n8n Implementation Handoff.md @@ -19,11 +19,13 @@ Last verified on 2026-05-13 (evening): - Container: `n8n-agent` running and healthy. - Health endpoint: `GET /healthz` returns `{"status":"ok"}`. - Workflow export: `n8n export:workflow --all` succeeds. -- Active workflows: 12. +- Active workflows: 13. - Inactive workflows: 1 (Nightly Obsidian Vault Sync replaced by Evening Digest). - Archived workflows: 2 unrecoverable duplicate IMAP workflows archived after SQLite recovery. - Docker health endpoint: `GET :18809/health` returns container state for 7 services. - Systemd user service `docker-health-endpoint.service` active and enabled. +- Obsidian reindex endpoint: `POST :18810/reindex` triggers incremental vault reindex. +- Systemd user service `obsidian-reindex-endpoint.service` active and enabled. ## Implemented and active @@ -207,6 +209,74 @@ Last verified on 2026-05-13 (evening): - `run_health_check` - `process_voice_memo` +### Obsidian Vault Reindex + +- Workflow ID: `85ntyyphDJ4Ms2b4` +- Status: active +- Trigger: every 6 hours +- Current behavior: + - n8n schedule trigger calls `POST http://172.19.0.1:18810/reindex`.
+ - Host-side `obsidian-reindex-server.py` on port `18810` runs the incremental Obsidian vault indexer. + - Systemd user service `obsidian-reindex-endpoint.service`. + +## Obsidian Semantic Index + +Implemented 2026-05-13. + +### Architecture + +- **Vector store**: Hermes rag-search ChromaDB embedded at `~/.hermes/data/rag-search/chroma/` in the `obsidian` collection. +- **Embeddings**: Ollama `nomic-embed-text` on port `18807` (768-dim vectors). +- **Indexer**: `~/.hermes/skills/note-taking/rag-search/scripts/index_obsidian.py` +- **Chunking**: Markdown files are split by heading sections; long sections get sliding-window chunks (max 2000 chars, 200 char overlap). YAML frontmatter is extracted and stored as metadata. +- **Search**: `~/.hermes/skills/note-taking/rag-search/scripts/search.py --index obsidian "query"` +- **Cross-collection search**: `search.py "query"` now searches all three collections (`personal`, `docs`, `obsidian`) using the appropriate embedding backend per collection. + +### Index stats (2026-05-13) + +- 36 markdown files indexed +- 231 chunks +- Embedding model: `nomic-embed-text` via Ollama +- Full index time: ~5 minutes (Ollama CPU inference at ~1.2s/text, batch=10) +- Incremental reindex (no changes): ~1.4 seconds + +### Incremental updates + +- File content SHA-256 hashes tracked in `~/.hermes/data/rag-search/obsidian_index_state.json`. +- Only changed files are re-indexed on subsequent runs. +- Deleted files have their chunks removed from ChromaDB. + +### Automated reindex + +- n8n workflow `Obsidian Vault Reindex` (`85ntyyphDJ4Ms2b4`) triggers every 6 hours. +- Calls `POST http://172.19.0.1:18810/reindex` (host-side endpoint). +- Host endpoint: `~/lab/swarm/scripts/obsidian-reindex-server.py` on port `18810`. +- Systemd service: `obsidian-reindex-endpoint.service` (enabled). 
+- Manual trigger: `curl -X POST http://127.0.0.1:18810/reindex` + +### Verification commands + +```bash +# Check index state +curl -fsS http://127.0.0.1:18810/reindex/status | python3 -m json.tool + +# Trigger manual reindex +curl -X POST http://127.0.0.1:18810/reindex | python3 -m json.tool + +# Search the Obsidian index +~/.hermes/skills/note-taking/rag-search/venv/bin/python \ + ~/.hermes/skills/note-taking/rag-search/scripts/search.py --index obsidian "health monitoring" + +# Check ChromaDB data +du -sh ~/.hermes/data/rag-search/chroma/ + +# Check systemd service +systemctl --user status obsidian-reindex-endpoint.service + +# Verify from inside n8n container +docker exec n8n-agent wget -qO- http://172.19.0.1:18810/healthz +``` + ## Not yet implemented ### Weekly review @@ -226,27 +296,6 @@ Recommended implementation: 3. Use Atlas/Hermes or cloud model for final synthesis. 4. Write `Notes/YYYY-MM-DD Weekly Review.md`. -### Obsidian Semantic Index - -Desired scope: - -- Watch vault changes. -- Chunk changed notes. -- Embed with Ollama on `18807` using `nomic-embed-text`. -- Store vectors locally. -- Enable semantic search / RAG for Atlas. - -Recommended implementation options: - -1. Prefer Hermes `rag-search`/local ChromaDB if already available and stable. -2. If n8n owns the trigger, have n8n call a local indexing webhook/script rather than implementing vector DB logic entirely in n8n. -3. Use file-change polling if native file watch is unreliable in Docker/virtiofs. - -Open questions: - -- Which vector store should be canonical: Hermes rag-search ChromaDB, a separate Chroma instance, SQLite vector extension, or another local store? -- Should n8n trigger indexing, or should Atlas/Hermes own indexing and n8n only notify? - ### Personal data routing Desired scope: @@ -277,11 +326,7 @@ Recommended implementation: 4. ~~Extend Evening Digest.~~ Done 2026-05-13: workflow `PlZywwqL8MRNEAN6`, daily 21:00 PT. 5. 
~~Add Discord delivery to n8n Failure Digest.~~ Done 2026-05-13. 6. ~~Fix stale container URLs in IMAP workflow.~~ Done 2026-05-13. - -7. Implement Obsidian Semantic Index. - - Decide canonical vector store first. - - Use Ollama embeddings on `18807`. - - Add incremental update path. +7. ~~Implement Obsidian Semantic Index.~~ Done 2026-05-13: ChromaDB `obsidian` collection, Ollama nomic-embed-text, automated reindex every 6h. 8. Upgrade Web-to-Notes Capture. - Add PDF and YouTube transcript support. @@ -292,8 +337,8 @@ Recommended implementation: - Add optional Kokoro audio summary. 10. Define webhook action bus catalog. - - Document stable endpoints and schemas. - - Add `process_url`, `summarize_pdf`, `add_reminder`, `sync_vault`, `run_health_check`. + - Document stable endpoints and schemas. + - Add `process_url`, `summarize_pdf`, `add_reminder`, `sync_vault`, `run_health_check`. ## Verification commands @@ -311,8 +356,13 @@ docker exec n8n-agent n8n export:workflow --all --output=/tmp/workflows-verify.j # Docker health endpoint (host-side systemd service) curl -fsS --max-time 3 http://127.0.0.1:18809/health | python3 -m json.tool +# Obsidian reindex endpoint +curl -fsS http://127.0.0.1:18810/healthz +curl -fsS http://127.0.0.1:18810/reindex/status | python3 -m json.tool + # Verify from inside n8n container docker exec n8n-agent wget -qO- http://172.19.0.1:18809/health +docker exec n8n-agent wget -qO- http://172.19.0.1:18810/healthz ``` ### n8n Public API access @@ -358,4 +408,6 @@ docker logs n8n-agent --tail 120 - `N8N_API_KEY` in `~/lab/swarm/.env` is stale and returns 401. Get the working key from n8n credential `UPAHgUJVRqZQceL4` (see Verification commands). - Do not commit DB backups, workflow execution history, secrets, or runtime state. - The Google OAuth credential (`wpcf2epDDCT57Y5x`) cannot refresh (`invalid_client`). Gmail workflows use IMAP fallback instead. 
-- The Docker health endpoint (`18809`) must bind to `0.0.0.0` (not `127.0.0.1`) so the n8n container can reach it via the Docker bridge gateway. +- The Docker health endpoint (`18809`) and reindex endpoint (`18810`) must bind to `0.0.0.0` (not `127.0.0.1`) so the n8n container can reach them via the Docker bridge gateway. +- The `obsidian` ChromaDB collection uses Ollama `nomic-embed-text` embeddings, while `personal` and `docs` use Sentence Transformers `all-MiniLM-L6-v2`. Similarity scores from the two embedding backends are therefore not directly comparable. +- Ollama embedding on CPU is ~1.2s per text. Full vault reindex takes ~5 minutes for 231 chunks. An incremental reindex with no changes takes ~1.4 seconds.