diff --git a/scripts/rag-embedding-health-server.py b/scripts/rag-embedding-health-server.py new file mode 100644 index 0000000..601ab73 --- /dev/null +++ b/scripts/rag-embedding-health-server.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +"""RAG/embedding health HTTP wrapper for n8n. + +Listens on 0.0.0.0:18814 so the n8n container can call it via +http://172.19.0.1:18814. + +Endpoints: + GET /healthz -> service liveness + POST /check -> run ~/.hermes/scripts/rag_embedding_health.py and return JSON +""" + +from __future__ import annotations + +import http.server +import json +import os +import subprocess +import time +from pathlib import Path + +PORT = int(os.environ.get("PORT", "18814")) +CHECK_SCRIPT = Path(os.environ.get("RAG_HEALTH_SCRIPT", "/home/will/.hermes/scripts/rag_embedding_health.py")) +TIMEOUT = int(os.environ.get("RAG_HEALTH_TIMEOUT", "180")) + + +class Handler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + if self.path.rstrip("/") == "/healthz": + self._json({"status": "ok", "service": "rag-embedding-health"}) + else: + self._json({"error": "not found"}, status=404) + + def do_POST(self): + if self.path.rstrip("/") != "/check": + self._json({"error": "not found"}, status=404) + return + + started = time.time() + if not CHECK_SCRIPT.exists(): + self._json( + { + "ok": False, + "status": "failed", + "exitCode": 127, + "output": f"RAG health script missing: {CHECK_SCRIPT}", + "durationMs": 0, + }, + status=200, + ) + return + + env = os.environ.copy() + env.setdefault("HERMES_HOME", "/home/will/.hermes") + env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807") + env.setdefault("N8N_URL", "http://127.0.0.1:18808") + env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810") + + try: + proc = subprocess.run( + [str(CHECK_SCRIPT)], + text=True, + capture_output=True, + timeout=TIMEOUT, + check=False, + env=env, + ) + output = (proc.stdout or proc.stderr or "").strip() + self._json( + { + "ok": proc.returncode == 0, + "status": "ok" if proc.returncode == 0 else "failed", + "exitCode": proc.returncode, + "output": output[:4000], + "durationMs": int((time.time() - started) * 1000), + }, + status=200, + ) + except subprocess.TimeoutExpired: + self._json( + { + "ok": False, + "status": "timeout", + "exitCode": 124, + "output": f"RAG/embedding health check timed out after {TIMEOUT}s", + "durationMs": int((time.time() - started) * 1000), + }, + status=200, + ) + except Exception as exc: + self._json( + { + "ok": False, + "status": "error", + "exitCode": 1, + "output": str(exc)[:4000], + "durationMs": int((time.time() - started) * 1000), + }, + status=200, + ) + + def _json(self, data, status=200): + body = json.dumps(data, indent=2).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + return + + +if __name__ == "__main__": + server = http.server.HTTPServer(("0.0.0.0", PORT), Handler) + print(f"rag-embedding-health listening on 0.0.0.0:{PORT}", flush=True) + server.serve_forever() diff --git a/swarm-common/n8n-workflows/rag-and-embedding-health-watchdog.json b/swarm-common/n8n-workflows/rag-and-embedding-health-watchdog.json new file mode 100644 index 0000000..4f3e362 --- /dev/null +++ b/swarm-common/n8n-workflows/rag-and-embedding-health-watchdog.json @@ -0,0 +1,345 @@ +{ + "updatedAt": "2026-05-14T18:49:58.205Z", + "createdAt": "2026-05-14T18:49:04.674Z", + "id": "SwKaPtYqUJrakpFu", + "name": "RAG and Embedding Health Watchdog", + "description": null, + "active": true, + "isArchived": false, + "nodes": [ + { + "parameters": {}, + "id": "bca0ccac-1102-4b45-a9e3-a52f06352376", + "name": "Manual Trigger", + "type": "n8n-nodes-base.manualTrigger", + "typeVersion": 1, + "position": [ + 0, + 100 + ] + }, + { + "parameters": { + "rule": { + "interval": [ + { + "field": "hours", + "hoursInterval": 6 + } + ] + } + }, + "id": "3f5e4d1e-7e90-43d1-ae01-97dde40fbf28", + "name": "Every 6 Hours", + "type": "n8n-nodes-base.scheduleTrigger", + "typeVersion": 1.2, + "position": [ + 0, + -80 + ] + }, + { + "parameters": { + "method": "POST", + "url": "http://172.19.0.1:18814/check", + "options": { + "timeout": 240000 + } + }, + "id": "52e14b9f-4ab4-4906-9ed7-0dbe10762c26", + "name": "Run RAG Health Check", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.2, + "position": [ + 260, + 20 + ] + }, + { + "parameters": { + "mode": "runOnceForAllItems", + "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst data = $input.first().json;\nconst now = new Date().toISOString();\nconst nl = String.fromCharCode(10);\nconst prev = staticData.ragEmbedding || { failedRuns: 0, alerted: false };\n\nif (data.ok) {\n const wasAlerted = prev.alerted;\n staticData.ragEmbedding = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: data.status, durationMs: data.durationMs };\n if (!wasAlerted) return [];\n return [{ json: { text: ['\u2705 RAG/Embedding health recovered', `- status=ok; duration=${data.durationMs}ms`, `checked=${now}`].join(nl), data } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = !prev.alerted || failedRuns % 4 === 0;\nstaticData.ragEmbedding = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: data.status, exitCode: data.exitCode, output: data.output };\nif (!shouldAlert) return [];\n\nconst output = (data.output || 'No output from checker').trim();\nconst lines = [\n '\ud83d\udea8 RAG/Embedding Health Watchdog',\n `- failedRuns=${failedRuns}; status=${data.status}; exit=${data.exitCode}; duration=${data.durationMs}ms`,\n output,\n 'fix=check systemctl --user status rag-embedding-health.service; then inspect Ollama 18807, ChromaDB, and Obsidian reindex 18810.',\n `checked=${now}`,\n];\nreturn [{ json: { text: lines.join(nl), data } }];" + }, + "id": "6b435e3e-2efc-43da-b565-d5ecb819af1f", + "name": "Alert on Failure or Recovery", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [ + 520, + 20 + ] + }, + { + "parameters": { + "authentication": "predefinedCredentialType", + "nodeCredentialType": "httpHeaderAuth", + "method": "POST", + "url": "https://discord.com/api/v10/channels/1494453542243532932/messages", + "sendBody": true, + "specifyBody": "json", + "jsonBody": "={{ { content: $json.text } }}", + "options": {} + }, + "id": "1ebabe7e-2dbc-4fa6-a63c-3d869314a5cf", + "name": "Send Discord Ops Alert", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.2, + "position": [ + 800, + 20 + ], + "credentials": { + "httpHeaderAuth": { + "id": "UgPqYcoCNNIgr55m", + "name": "Discord Bot Auth" + } + } + } + ], + "connections": { + "Manual Trigger": { + "main": [ + [ + { + "node": "Run RAG Health Check", + "type": "main", + "index": 0 + } + ] + ] + }, + "Every 6 Hours": { + "main": [ + [ + { + "node": "Run RAG Health Check", + "type": "main", + "index": 0 + } + ] + ] + }, + "Run RAG Health Check": { + "main": [ + [ + { + "node": "Alert on Failure or Recovery", + "type": "main", + "index": 0 + } + ] + ] + }, + "Alert on Failure or Recovery": { + "main": [ + [ + { + "node": "Send Discord Ops Alert", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "settings": { + "executionOrder": "v1", + "callerPolicy": "workflowsFromSameOwner", + "availableInMCP": false + }, + "staticData": { + "node:Every 6 Hours": { + "recurrenceRules": [] + }, + "global": { + "ragEmbedding": { + "failedRuns": 0, + "alerted": false, + "lastOk": "2026-05-14T18:50:22.108Z", + "lastStatus": "ok", + "durationMs": 13239 + } + } + }, + "meta": null, + "versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f", + "activeVersionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f", + "versionCounter": 9, + "triggerCount": 1, + "shared": [ + { + "updatedAt": "2026-05-14T18:49:04.685Z", + "createdAt": "2026-05-14T18:49:04.685Z", + "role": "workflow:owner", + "workflowId": "SwKaPtYqUJrakpFu", + "projectId": "WGdp8QunI1tHpjXa", + "project": { + "updatedAt": "2026-03-11T21:08:10.005Z", + "createdAt": "2026-03-11T21:05:11.541Z", + "id": "WGdp8QunI1tHpjXa", + "name": "will will ", + "type": "personal", + "icon": null, + "description": null, + "creatorId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5" + } + } + ], + "tags": [], + "activeVersion": { + "updatedAt": "2026-05-14T18:49:58.207Z", + "createdAt": "2026-05-14T18:49:58.207Z", + "versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f", + "workflowId": "SwKaPtYqUJrakpFu", + "nodes": [ + { + "parameters": {}, + "id": "bca0ccac-1102-4b45-a9e3-a52f06352376", + "name": "Manual Trigger", + "type": "n8n-nodes-base.manualTrigger", + "typeVersion": 1, + "position": [ + 0, + 100 + ] + }, + { + "parameters": { + "rule": { + "interval": [ + { + "field": "hours", + "hoursInterval": 6 + } + ] + } + }, + "id": "3f5e4d1e-7e90-43d1-ae01-97dde40fbf28", + "name": "Every 6 Hours", + "type": "n8n-nodes-base.scheduleTrigger", + "typeVersion": 1.2, + "position": [ + 0, + -80 + ] + }, + { + "parameters": { + "method": "POST", + "url": "http://172.19.0.1:18814/check", + "options": { + "timeout": 240000 + } + }, + "id": "52e14b9f-4ab4-4906-9ed7-0dbe10762c26", + "name": "Run RAG Health Check", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.2, + "position": [ + 260, + 20 + ] + }, + { + "parameters": { + "mode": "runOnceForAllItems", + "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst data = $input.first().json;\nconst now = new Date().toISOString();\nconst nl = String.fromCharCode(10);\nconst prev = staticData.ragEmbedding || { failedRuns: 0, alerted: false };\n\nif (data.ok) {\n const wasAlerted = prev.alerted;\n staticData.ragEmbedding = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: data.status, durationMs: data.durationMs };\n if (!wasAlerted) return [];\n return [{ json: { text: ['\u2705 RAG/Embedding health recovered', `- status=ok; duration=${data.durationMs}ms`, `checked=${now}`].join(nl), data } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = !prev.alerted || failedRuns % 4 === 0;\nstaticData.ragEmbedding = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: data.status, exitCode: data.exitCode, output: data.output };\nif (!shouldAlert) return [];\n\nconst output = (data.output || 'No output from checker').trim();\nconst lines = [\n '\ud83d\udea8 RAG/Embedding Health Watchdog',\n `- failedRuns=${failedRuns}; status=${data.status}; exit=${data.exitCode}; duration=${data.durationMs}ms`,\n output,\n 'fix=check systemctl --user status rag-embedding-health.service; then inspect Ollama 18807, ChromaDB, and Obsidian reindex 18810.',\n `checked=${now}`,\n];\nreturn [{ json: { text: lines.join(nl), data } }];" + }, + "id": "6b435e3e-2efc-43da-b565-d5ecb819af1f", + "name": "Alert on Failure or Recovery", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [ + 520, + 20 + ] + }, + { + "parameters": { + "authentication": "predefinedCredentialType", + "nodeCredentialType": "httpHeaderAuth", + "method": "POST", + "url": "https://discord.com/api/v10/channels/1494453542243532932/messages", + "sendBody": true, + "specifyBody": "json", + "jsonBody": "={{ { content: $json.text } }}", + "options": {} + }, + "id": "1ebabe7e-2dbc-4fa6-a63c-3d869314a5cf", + "name": "Send Discord Ops Alert", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.2, + "position": [ + 800, + 20 + ], + "credentials": { + "httpHeaderAuth": { + "id": "UgPqYcoCNNIgr55m", + "name": "Discord Bot Auth" + } + } + } + ], + "connections": { + "Manual Trigger": { + "main": [ + [ + { + "node": "Run RAG Health Check", + "type": "main", + "index": 0 + } + ] + ] + }, + "Every 6 Hours": { + "main": [ + [ + { + "node": "Run RAG Health Check", + "type": "main", + "index": 0 + } + ] + ] + }, + "Run RAG Health Check": { + "main": [ + [ + { + "node": "Alert on Failure or Recovery", + "type": "main", + "index": 0 + } + ] + ] + }, + "Alert on Failure or Recovery": { + "main": [ + [ + { + "node": "Send Discord Ops Alert", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "authors": "will will", + "name": null, + "description": null, + "autosaved": false, + "workflowPublishHistory": [ + { + "createdAt": "2026-05-14T18:49:58.274Z", + "id": 1516, + "workflowId": "SwKaPtYqUJrakpFu", + "versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f", + "event": "activated", + "userId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5" + } + ] + } +} diff --git a/swarm-common/rag-embedding-health.service b/swarm-common/rag-embedding-health.service new file mode 100644 index 0000000..aa72e76 --- /dev/null +++ b/swarm-common/rag-embedding-health.service @@ -0,0 +1,14 @@ +[Unit] +Description=RAG/Embedding Health HTTP Service (port 18814) +After=network.target + +[Service] +Type=simple +ExecStart=/usr/bin/python3 /home/will/lab/swarm/scripts/rag-embedding-health-server.py +Restart=on-failure +RestartSec=5 +Environment=PORT=18814 +Environment=RAG_HEALTH_TIMEOUT=180 + +[Install] +WantedBy=default.target