feat(swarm): add Obsidian vault reindex endpoint + update handoff
- obsidian-reindex-server.py: HTTP endpoint on port 18810 for triggering incremental Obsidian vault reindex from n8n - Updated n8n Implementation Handoff: Obsidian Semantic Index section, new reindex workflow, updated verification commands
This commit is contained in:
@@ -0,0 +1,124 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Obsidian Vault Reindex Endpoint
|
||||||
|
Lightweight HTTP server that triggers an incremental Obsidian vault reindex.
|
||||||
|
|
||||||
|
Listens on 0.0.0.0:18810 (configurable via PORT env var).
|
||||||
|
Called by n8n webhooks or systemd timers.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /reindex -> trigger incremental reindex, returns JSON stats
|
||||||
|
GET /reindex/status -> check last index state
|
||||||
|
GET /healthz -> returns ok
|
||||||
|
"""
|
||||||
|
|
||||||
|
import http.server
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# TCP port the server binds to. The PORT environment variable, when set,
# overrides the built-in default of 18810.
PORT = int(os.getenv("PORT", 18810))

# Path of the reindex shell script, kept as a plain string because it is
# handed to subprocess as the command to execute.
REINDEX_SCRIPT = os.fspath(
    Path.home().joinpath(
        ".hermes/skills/note-taking/rag-search/scripts/reindex_obsidian.sh"
    )
)

# JSON state file that get_status() reads and returns.
STATE_FILE = Path.home().joinpath(
    ".hermes/data/rag-search/obsidian_index_state.json"
)

# Held for the duration of a reindex run so that two runs never overlap.
_reindex_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def run_reindex() -> dict:
    """Run the incremental reindex script and report the outcome.

    Only one run may be in flight at a time: if the module-level lock is
    already held, the call returns immediately instead of waiting.

    Returns:
        Always a dict.

        * Success: the JSON object the script printed on stdout. If the
          script printed valid JSON that is not an object (a list, number,
          string or ``null``), it is wrapped as ``{"result": <value>}`` so
          callers can rely on dict semantics.
        * Failure: a dict with an ``"error"`` key. Depending on the cause it
          also carries ``"status": "locked"``, ``"exit_code"``/``"stderr"``,
          or ``"stdout"`` (both output fields truncated to 500 characters).
    """
    if not _reindex_lock.acquire(blocking=False):
        return {"error": "reindex already in progress", "status": "locked"}

    # Keep the guarded region down to the one call that can raise; the lock
    # is released as soon as the script has finished (or failed to start).
    try:
        result = subprocess.run(
            [REINDEX_SCRIPT],
            capture_output=True,
            text=True,
            timeout=600,  # 10 min max for full reindex
        )
    except subprocess.TimeoutExpired:
        return {"error": "reindex timed out (600s)"}
    except Exception as e:
        # Deliberately broad: this feeds an HTTP response, so a missing or
        # non-executable script (OSError) or undecodable output is reported
        # to the client rather than crashing the request.
        return {"error": str(e)}
    finally:
        _reindex_lock.release()

    if result.returncode != 0:
        return {
            "error": "reindex failed",
            "exit_code": result.returncode,
            "stderr": result.stderr.strip()[:500],
        }

    try:
        parsed = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {
            "error": "invalid json output",
            "stdout": result.stdout.strip()[:500],
        }

    # json.loads accepts any JSON value; honour the declared dict return
    # type so that `"error" in result` is always a key lookup for callers.
    if not isinstance(parsed, dict):
        return {"result": parsed}
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def get_status() -> dict:
    """Read and return the contents of the index state file.

    Returns:
        The parsed JSON content of ``STATE_FILE``. If the file does not
        exist, ``{"indexed": False, "message": "no state file"}``. If it
        cannot be read, is not valid UTF-8, or is not valid JSON,
        ``{"error": <message>}``.
    """
    try:
        # Read explicitly as UTF-8 (the JSON interchange encoding) instead of
        # whatever the process locale happens to be.
        return json.loads(STATE_FILE.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # Handled here rather than with a prior exists() check, so a file
        # that vanishes between check and read gets the same answer.
        return {"indexed": False, "message": "no state file"}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a corrupt state file yields an error payload, not an exception.
        return {"error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
class ReindexHandler(http.server.BaseHTTPRequestHandler):
    """Request handler for the reindex, status and health routes.

    Routes (a trailing slash and any query string are ignored):
        GET  /healthz         -> ``{"status": "ok"}``
        GET  /reindex/status  -> whatever ``get_status()`` returns
        POST /reindex         -> whatever ``run_reindex()`` returns

    Every other path is answered with a JSON 404. All responses are JSON.
    """

    def _route(self):
        """Return the request path without query string or trailing slash."""
        return self.path.split("?", 1)[0].rstrip("/")

    def do_GET(self):
        """Serve the read-only routes: health check and index status."""
        path = self._route()
        if path == "/healthz":
            self._json_response({"status": "ok"})
        elif path == "/reindex/status":
            self._json_response(get_status())
        else:
            self._json_response({"error": "not found"}, status=404)

    def do_POST(self):
        """Run a reindex and reply with its result.

        Status codes: 200 when the result has no ``"error"`` key, 409 when a
        reindex is already in progress, 500 for any other error.
        """
        if self._route() != "/reindex":
            self._json_response({"error": "not found"}, status=404)
            return

        # run_reindex() is synchronous: the response is written only after
        # the script has exited (or been refused because one is running).
        result = run_reindex()
        if "error" not in result:
            status = 200
        elif isinstance(result, dict) and result.get("status") == "locked":
            # A run is already in flight: a conflict with current state that
            # the caller can retry later, not a server fault.
            status = 409
        else:
            status = 500
        self._json_response(result, status=status)

    def _json_response(self, data, status=200):
        """Serialize ``data`` as indented JSON and send it with ``status``."""
        body = json.dumps(data, indent=2).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Discard the base class's per-request log output."""
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Start the HTTP server on 0.0.0.0:PORT and serve until interrupted.

    A threading server is used because a reindex request stays open for as
    long as the script runs (up to its 600 s timeout). With a single-threaded
    server, health and status requests would be stuck behind it, and a second
    reindex request could never reach the "already in progress" check.
    """
    server = http.server.ThreadingHTTPServer(("0.0.0.0", PORT), ReindexHandler)
    print(f"obsidian-reindex-server listening on 0.0.0.0:{PORT}", flush=True)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Close the listening socket on every exit path, not just Ctrl-C.
        server.server_close()


if __name__ == "__main__":
    main()
|
||||||
+82
-30
@@ -19,11 +19,13 @@ Last verified on 2026-05-13 (evening):
|
|||||||
- Container: `n8n-agent` running and healthy.
|
- Container: `n8n-agent` running and healthy.
|
||||||
- Health endpoint: `GET /healthz` returns `{"status":"ok"}`.
|
- Health endpoint: `GET /healthz` returns `{"status":"ok"}`.
|
||||||
- Workflow export: `n8n export:workflow --all` succeeds.
|
- Workflow export: `n8n export:workflow --all` succeeds.
|
||||||
- Active workflows: 12.
|
- Active workflows: 13.
|
||||||
- Inactive workflows: 1 (Nightly Obsidian Vault Sync replaced by Evening Digest).
|
- Inactive workflows: 1 (Nightly Obsidian Vault Sync replaced by Evening Digest).
|
||||||
- Archived workflows: 2 unrecoverable duplicate IMAP workflows archived after SQLite recovery.
|
- Archived workflows: 2 unrecoverable duplicate IMAP workflows archived after SQLite recovery.
|
||||||
- Docker health endpoint: `GET :18809/health` returns container state for 7 services.
|
- Docker health endpoint: `GET :18809/health` returns container state for 7 services.
|
||||||
- Systemd user service `docker-health-endpoint.service` active and enabled.
|
- Systemd user service `docker-health-endpoint.service` active and enabled.
|
||||||
|
- Obsidian reindex endpoint: `POST :18810/reindex` triggers incremental vault reindex.
|
||||||
|
- Systemd user service `obsidian-reindex-endpoint.service` active and enabled.
|
||||||
|
|
||||||
## Implemented and active
|
## Implemented and active
|
||||||
|
|
||||||
@@ -207,6 +209,74 @@ Last verified on 2026-05-13 (evening):
|
|||||||
- `run_health_check`
|
- `run_health_check`
|
||||||
- `process_voice_memo`
|
- `process_voice_memo`
|
||||||
|
|
||||||
|
### Obsidian Vault Reindex
|
||||||
|
|
||||||
|
- Workflow ID: `85ntyyphDJ4Ms2b4`
|
||||||
|
- Status: active
|
||||||
|
- Trigger: every 6 hours
|
||||||
|
- Current behavior:
|
||||||
|
- n8n schedule trigger calls `POST http://172.19.0.1:18810/reindex`.
|
||||||
|
- Host-side `obsidian-reindex-server.py` on port `18810` runs the incremental Obsidian vault indexer.
|
||||||
|
- Systemd user service `obsidian-reindex-endpoint.service`.
|
||||||
|
|
||||||
|
## Obsidian Semantic Index
|
||||||
|
|
||||||
|
Implemented 2026-05-13.
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
- **Vector store**: Hermes rag-search ChromaDB embedded at `~/.hermes/data/rag-search/chroma/` in the `obsidian` collection.
|
||||||
|
- **Embeddings**: Ollama `nomic-embed-text` on port `18807` (768-dim vectors).
|
||||||
|
- **Indexer**: `~/.hermes/skills/note-taking/rag-search/scripts/index_obsidian.py`
|
||||||
|
- **Chunking**: Markdown files are split by heading sections; long sections get sliding-window chunks (max 2000 chars, 200 char overlap). YAML frontmatter is extracted and stored as metadata.
|
||||||
|
- **Search**: `~/.hermes/skills/note-taking/rag-search/scripts/search.py --index obsidian "query"`
|
||||||
|
- **Cross-collection search**: `search.py "query"` now searches all three collections (`personal`, `docs`, `obsidian`) using the appropriate embedding backend per collection.
|
||||||
|
|
||||||
|
### Index stats (2026-05-13)
|
||||||
|
|
||||||
|
- 36 markdown files indexed
|
||||||
|
- 231 chunks
|
||||||
|
- Embedding model: `nomic-embed-text` via Ollama
|
||||||
|
- Full index time: ~5 minutes (Ollama CPU inference at ~1.2s/text, batch=10)
|
||||||
|
- Incremental reindex (no changes): ~1.4 seconds
|
||||||
|
|
||||||
|
### Incremental updates
|
||||||
|
|
||||||
|
- File content SHA-256 hashes tracked in `~/.hermes/data/rag-search/obsidian_index_state.json`.
|
||||||
|
- Only changed files are re-indexed on subsequent runs.
|
||||||
|
- Deleted files have their chunks removed from ChromaDB.
|
||||||
|
|
||||||
|
### Automated reindex
|
||||||
|
|
||||||
|
- n8n workflow `Obsidian Vault Reindex` (`85ntyyphDJ4Ms2b4`) triggers every 6 hours.
|
||||||
|
- Calls `POST http://172.19.0.1:18810/reindex` (host-side endpoint).
|
||||||
|
- Host endpoint: `~/lab/swarm/scripts/obsidian-reindex-server.py` on port `18810`.
|
||||||
|
- Systemd service: `obsidian-reindex-endpoint.service` (enabled).
|
||||||
|
- Manual trigger: `curl -X POST http://127.0.0.1:18810/reindex`
|
||||||
|
|
||||||
|
### Verification commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check index state
|
||||||
|
curl -fsS http://127.0.0.1:18810/reindex/status | python3 -m json.tool
|
||||||
|
|
||||||
|
# Trigger manual reindex
|
||||||
|
curl -X POST http://127.0.0.1:18810/reindex | python3 -m json.tool
|
||||||
|
|
||||||
|
# Search the Obsidian index
|
||||||
|
~/.hermes/skills/note-taking/rag-search/venv/bin/python \
|
||||||
|
~/.hermes/skills/note-taking/rag-search/scripts/search.py --index obsidian "health monitoring"
|
||||||
|
|
||||||
|
# Check ChromaDB data
|
||||||
|
du -sh ~/.hermes/data/rag-search/chroma/
|
||||||
|
|
||||||
|
# Check systemd service
|
||||||
|
systemctl --user status obsidian-reindex-endpoint.service
|
||||||
|
|
||||||
|
# Verify from inside n8n container
|
||||||
|
docker exec n8n-agent wget -qO- http://172.19.0.1:18810/healthz
|
||||||
|
```
|
||||||
|
|
||||||
## Not yet implemented
|
## Not yet implemented
|
||||||
|
|
||||||
### Weekly review
|
### Weekly review
|
||||||
@@ -226,27 +296,6 @@ Recommended implementation:
|
|||||||
3. Use Atlas/Hermes or cloud model for final synthesis.
|
3. Use Atlas/Hermes or cloud model for final synthesis.
|
||||||
4. Write `Notes/YYYY-MM-DD Weekly Review.md`.
|
4. Write `Notes/YYYY-MM-DD Weekly Review.md`.
|
||||||
|
|
||||||
### Obsidian Semantic Index
|
|
||||||
|
|
||||||
Desired scope:
|
|
||||||
|
|
||||||
- Watch vault changes.
|
|
||||||
- Chunk changed notes.
|
|
||||||
- Embed with Ollama on `18807` using `nomic-embed-text`.
|
|
||||||
- Store vectors locally.
|
|
||||||
- Enable semantic search / RAG for Atlas.
|
|
||||||
|
|
||||||
Recommended implementation options:
|
|
||||||
|
|
||||||
1. Prefer Hermes `rag-search`/local ChromaDB if already available and stable.
|
|
||||||
2. If n8n owns the trigger, have n8n call a local indexing webhook/script rather than implementing vector DB logic entirely in n8n.
|
|
||||||
3. Use file-change polling if native file watch is unreliable in Docker/virtiofs.
|
|
||||||
|
|
||||||
Open questions:
|
|
||||||
|
|
||||||
- Which vector store should be canonical: Hermes rag-search ChromaDB, a separate Chroma instance, SQLite vector extension, or another local store?
|
|
||||||
- Should n8n trigger indexing, or should Atlas/Hermes own indexing and n8n only notify?
|
|
||||||
|
|
||||||
### Personal data routing
|
### Personal data routing
|
||||||
|
|
||||||
Desired scope:
|
Desired scope:
|
||||||
@@ -277,11 +326,7 @@ Recommended implementation:
|
|||||||
4. ~~Extend Evening Digest.~~ Done 2026-05-13: workflow `PlZywwqL8MRNEAN6`, daily 21:00 PT.
|
4. ~~Extend Evening Digest.~~ Done 2026-05-13: workflow `PlZywwqL8MRNEAN6`, daily 21:00 PT.
|
||||||
5. ~~Add Discord delivery to n8n Failure Digest.~~ Done 2026-05-13.
|
5. ~~Add Discord delivery to n8n Failure Digest.~~ Done 2026-05-13.
|
||||||
6. ~~Fix stale container URLs in IMAP workflow.~~ Done 2026-05-13.
|
6. ~~Fix stale container URLs in IMAP workflow.~~ Done 2026-05-13.
|
||||||
|
7. ~~Implement Obsidian Semantic Index.~~ Done 2026-05-13: ChromaDB `obsidian` collection, Ollama nomic-embed-text, automated reindex every 6h.
|
||||||
7. Implement Obsidian Semantic Index.
|
|
||||||
- Decide canonical vector store first.
|
|
||||||
- Use Ollama embeddings on `18807`.
|
|
||||||
- Add incremental update path.
|
|
||||||
|
|
||||||
8. Upgrade Web-to-Notes Capture.
|
8. Upgrade Web-to-Notes Capture.
|
||||||
- Add PDF and YouTube transcript support.
|
- Add PDF and YouTube transcript support.
|
||||||
@@ -292,8 +337,8 @@ Recommended implementation:
|
|||||||
- Add optional Kokoro audio summary.
|
- Add optional Kokoro audio summary.
|
||||||
|
|
||||||
10. Define webhook action bus catalog.
|
10. Define webhook action bus catalog.
|
||||||
- Document stable endpoints and schemas.
|
- Document stable endpoints and schemas.
|
||||||
- Add `process_url`, `summarize_pdf`, `add_reminder`, `sync_vault`, `run_health_check`.
|
- Add `process_url`, `summarize_pdf`, `add_reminder`, `sync_vault`, `run_health_check`.
|
||||||
|
|
||||||
## Verification commands
|
## Verification commands
|
||||||
|
|
||||||
@@ -311,8 +356,13 @@ docker exec n8n-agent n8n export:workflow --all --output=/tmp/workflows-verify.j
|
|||||||
# Docker health endpoint (host-side systemd service)
|
# Docker health endpoint (host-side systemd service)
|
||||||
curl -fsS --max-time 3 http://127.0.0.1:18809/health | python3 -m json.tool
|
curl -fsS --max-time 3 http://127.0.0.1:18809/health | python3 -m json.tool
|
||||||
|
|
||||||
|
# Obsidian reindex endpoint
|
||||||
|
curl -fsS http://127.0.0.1:18810/healthz
|
||||||
|
curl -fsS http://127.0.0.1:18810/reindex/status | python3 -m json.tool
|
||||||
|
|
||||||
# Verify from inside n8n container
|
# Verify from inside n8n container
|
||||||
docker exec n8n-agent wget -qO- http://172.19.0.1:18809/health
|
docker exec n8n-agent wget -qO- http://172.19.0.1:18809/health
|
||||||
|
docker exec n8n-agent wget -qO- http://172.19.0.1:18810/healthz
|
||||||
```
|
```
|
||||||
|
|
||||||
### n8n Public API access
|
### n8n Public API access
|
||||||
@@ -358,4 +408,6 @@ docker logs n8n-agent --tail 120
|
|||||||
- `N8N_API_KEY` in `~/lab/swarm/.env` is stale and returns 401. Get the working key from n8n credential `UPAHgUJVRqZQceL4` (see Verification commands).
|
- `N8N_API_KEY` in `~/lab/swarm/.env` is stale and returns 401. Get the working key from n8n credential `UPAHgUJVRqZQceL4` (see Verification commands).
|
||||||
- Do not commit DB backups, workflow execution history, secrets, or runtime state.
|
- Do not commit DB backups, workflow execution history, secrets, or runtime state.
|
||||||
- The Google OAuth credential (`wpcf2epDDCT57Y5x`) cannot refresh (`invalid_client`). Gmail workflows use IMAP fallback instead.
|
- The Google OAuth credential (`wpcf2epDDCT57Y5x`) cannot refresh (`invalid_client`). Gmail workflows use IMAP fallback instead.
|
||||||
- The Docker health endpoint (`18809`) must bind to `0.0.0.0` (not `127.0.0.1`) so the n8n container can reach it via the Docker bridge gateway.
|
- The Docker health endpoint (`18809`) and reindex endpoint (`18810`) must bind to `0.0.0.0` (not `127.0.0.1`) so the n8n container can reach them via the Docker bridge gateway.
|
||||||
|
- The `obsidian` ChromaDB collection uses Ollama `nomic-embed-text` embeddings, while `personal` and `docs` use Sentence Transformers `all-MiniLM-L6-v2`. Because the embedding backends differ, similarity scores are not directly comparable between `obsidian` and the other two collections.
|
||||||
|
- Ollama embedding on CPU takes ~1.2s per text. A full vault reindex takes ~5 minutes for 231 chunks; an incremental reindex with no changes takes ~1.4 seconds.
|
||||||
|
|||||||
Reference in New Issue
Block a user