feat(n8n): migrate rag health watchdog

This commit is contained in:
William Valentin
2026-05-14 11:50:59 -07:00
parent 13087de8c4
commit c774030341
3 changed files with 475 additions and 0 deletions
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""RAG/embedding health HTTP wrapper for n8n.
Listens on 0.0.0.0:18814 so the n8n container can call it via
http://172.19.0.1:18814.
Endpoints:
GET /healthz -> service liveness
POST /check -> run ~/.hermes/scripts/rag_embedding_health.py and return JSON
"""
from __future__ import annotations
import http.server
import json
import os
import subprocess
import time
from pathlib import Path
# TCP port the wrapper listens on; override with the PORT environment variable.
PORT = int(os.getenv("PORT", "18814"))

# Health-check script executed for POST /check; override with RAG_HEALTH_SCRIPT.
_DEFAULT_CHECK_SCRIPT = "/home/will/.hermes/scripts/rag_embedding_health.py"
CHECK_SCRIPT = Path(os.getenv("RAG_HEALTH_SCRIPT", _DEFAULT_CHECK_SCRIPT))

# Time budget handed to subprocess.run(timeout=...); override with RAG_HEALTH_TIMEOUT.
TIMEOUT = int(os.getenv("RAG_HEALTH_TIMEOUT", "180"))
class Handler(http.server.BaseHTTPRequestHandler):
    """HTTP front-end that runs the RAG/embedding health script on request.

    Routes:
        GET  /healthz  -> static liveness payload.
        POST /check    -> run ``CHECK_SCRIPT`` and report the outcome as JSON.

    Every ``/check`` outcome (success, script failure, timeout, missing
    script, launch error) is answered with HTTP 200; the result is carried in
    the ``ok`` / ``status`` / ``exitCode`` / ``output`` / ``durationMs``
    fields of the JSON body. Unknown paths get a 404 JSON error.
    """

    # Cap on the number of characters of script output (or exception text)
    # echoed back in the ``output`` field.
    MAX_OUTPUT_CHARS = 4000

    def do_GET(self) -> None:
        """Answer ``GET /healthz`` with a liveness payload; 404 otherwise."""
        if self.path.rstrip("/") == "/healthz":
            self._json({"status": "ok", "service": "rag-embedding-health"})
        else:
            self._json({"error": "not found"}, status=404)

    def do_POST(self) -> None:
        """Answer ``POST /check`` with the health-check result; 404 otherwise."""
        if self.path.rstrip("/") != "/check":
            self._json({"error": "not found"}, status=404)
            return
        # Build the payload first, then respond exactly once. If the write
        # fails (e.g. the client went away) the error propagates instead of
        # being mistaken for a check failure and triggering a second response
        # on the same connection.
        self._json(self._run_check(), status=200)

    def _run_check(self) -> dict:
        """Execute ``CHECK_SCRIPT`` and return the result payload.

        Returns:
            A dict with keys ``ok``, ``status``, ``exitCode``, ``output`` and
            ``durationMs``. ``status`` is one of ``"ok"``, ``"failed"``,
            ``"timeout"`` or ``"error"``. This method does not raise for
            script-level problems; they are encoded in the payload.
        """
        if not CHECK_SCRIPT.exists():
            # 127 mirrors the shell's "command not found" exit status.
            return self._result(
                "failed", 127, f"RAG health script missing: {CHECK_SCRIPT}", 0
            )

        # Provide defaults for the child process without overriding anything
        # already set in this service's environment.
        env = os.environ.copy()
        env.setdefault("HERMES_HOME", "/home/will/.hermes")
        env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807")
        env.setdefault("N8N_URL", "http://127.0.0.1:18808")
        env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810")

        # Monotonic clock: the duration must not be skewed by wall-clock jumps.
        started = time.monotonic()
        try:
            proc = subprocess.run(
                [str(CHECK_SCRIPT)],
                text=True,
                # Undecodable bytes in the script output must not turn a real
                # result into a generic "error"; substitute them instead.
                errors="replace",
                capture_output=True,
                timeout=TIMEOUT,
                check=False,
                env=env,
            )
        except subprocess.TimeoutExpired:
            # 124 mirrors the exit status of coreutils `timeout`.
            return self._result(
                "timeout",
                124,
                f"RAG/embedding health check timed out after {TIMEOUT}s",
                self._elapsed_ms(started),
            )
        except Exception as exc:
            # Boundary handler: any launch problem (permissions, exec format,
            # ...) is reported in the payload so the caller always gets JSON.
            return self._result(
                "error",
                1,
                str(exc)[: self.MAX_OUTPUT_CHARS],
                self._elapsed_ms(started),
            )

        return self._result(
            "ok" if proc.returncode == 0 else "failed",
            proc.returncode,
            self._pick_output(proc.stdout, proc.stderr)[: self.MAX_OUTPUT_CHARS],
            self._elapsed_ms(started),
        )

    @staticmethod
    def _pick_output(stdout, stderr) -> str:
        """Return stripped stdout, or stripped stderr when stdout is blank.

        Each stream is stripped before the fallback decision so that
        whitespace-only stdout does not hide the diagnostics on stderr.
        """
        return (stdout or "").strip() or (stderr or "").strip()

    @staticmethod
    def _elapsed_ms(started: float) -> int:
        """Whole milliseconds elapsed since the ``time.monotonic()`` mark."""
        return int((time.monotonic() - started) * 1000)

    @staticmethod
    def _result(status: str, exit_code: int, output: str, duration_ms: int) -> dict:
        """Assemble the ``/check`` payload; ``ok`` is true only for status "ok"."""
        return {
            "ok": status == "ok",
            "status": status,
            "exitCode": exit_code,
            "output": output,
            "durationMs": duration_ms,
        }

    def _json(self, data, status=200):
        """Send ``data`` as an indented UTF-8 JSON response with ``status``."""
        body = json.dumps(data, indent=2).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""
        # Parameter names follow the BaseHTTPRequestHandler signature.
        return
if __name__ == "__main__":
    # Listen on every interface so the n8n container can reach this
    # host-side service (see the module docstring).
    bind_address = ("0.0.0.0", PORT)
    httpd = http.server.HTTPServer(bind_address, Handler)
    print(f"rag-embedding-health listening on 0.0.0.0:{PORT}", flush=True)
    # Blocks until the process is terminated.
    httpd.serve_forever()
@@ -0,0 +1,345 @@
{
"updatedAt": "2026-05-14T18:49:58.205Z",
"createdAt": "2026-05-14T18:49:04.674Z",
"id": "SwKaPtYqUJrakpFu",
"name": "RAG and Embedding Health Watchdog",
"description": null,
"active": true,
"isArchived": false,
"nodes": [
{
"parameters": {},
"id": "bca0ccac-1102-4b45-a9e3-a52f06352376",
"name": "Manual Trigger",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
0,
100
]
},
{
"parameters": {
"rule": {
"interval": [
{
"field": "hours",
"hoursInterval": 6
}
]
}
},
"id": "3f5e4d1e-7e90-43d1-ae01-97dde40fbf28",
"name": "Every 6 Hours",
"type": "n8n-nodes-base.scheduleTrigger",
"typeVersion": 1.2,
"position": [
0,
-80
]
},
{
"parameters": {
"method": "POST",
"url": "http://172.19.0.1:18814/check",
"options": {
"timeout": 240000
}
},
"id": "52e14b9f-4ab4-4906-9ed7-0dbe10762c26",
"name": "Run RAG Health Check",
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
260,
20
]
},
{
"parameters": {
"mode": "runOnceForAllItems",
"jsCode": "const staticData = $getWorkflowStaticData('global');\nconst data = $input.first().json;\nconst now = new Date().toISOString();\nconst nl = String.fromCharCode(10);\nconst prev = staticData.ragEmbedding || { failedRuns: 0, alerted: false };\n\nif (data.ok) {\n const wasAlerted = prev.alerted;\n staticData.ragEmbedding = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: data.status, durationMs: data.durationMs };\n if (!wasAlerted) return [];\n return [{ json: { text: ['\u2705 RAG/Embedding health recovered', `- status=ok; duration=${data.durationMs}ms`, `checked=${now}`].join(nl), data } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = !prev.alerted || failedRuns % 4 === 0;\nstaticData.ragEmbedding = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: data.status, exitCode: data.exitCode, output: data.output };\nif (!shouldAlert) return [];\n\nconst output = (data.output || 'No output from checker').trim();\nconst lines = [\n '\ud83d\udea8 RAG/Embedding Health Watchdog',\n `- failedRuns=${failedRuns}; status=${data.status}; exit=${data.exitCode}; duration=${data.durationMs}ms`,\n output,\n 'fix=check systemctl --user status rag-embedding-health.service; then inspect Ollama 18807, ChromaDB, and Obsidian reindex 18810.',\n `checked=${now}`,\n];\nreturn [{ json: { text: lines.join(nl), data } }];"
},
"id": "6b435e3e-2efc-43da-b565-d5ecb819af1f",
"name": "Alert on Failure or Recovery",
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
520,
20
]
},
{
"parameters": {
"authentication": "predefinedCredentialType",
"nodeCredentialType": "httpHeaderAuth",
"method": "POST",
"url": "https://discord.com/api/v10/channels/1494453542243532932/messages",
"sendBody": true,
"specifyBody": "json",
"jsonBody": "={{ { content: $json.text } }}",
"options": {}
},
"id": "1ebabe7e-2dbc-4fa6-a63c-3d869314a5cf",
"name": "Send Discord Ops Alert",
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
800,
20
],
"credentials": {
"httpHeaderAuth": {
"id": "UgPqYcoCNNIgr55m",
"name": "Discord Bot Auth"
}
}
}
],
"connections": {
"Manual Trigger": {
"main": [
[
{
"node": "Run RAG Health Check",
"type": "main",
"index": 0
}
]
]
},
"Every 6 Hours": {
"main": [
[
{
"node": "Run RAG Health Check",
"type": "main",
"index": 0
}
]
]
},
"Run RAG Health Check": {
"main": [
[
{
"node": "Alert on Failure or Recovery",
"type": "main",
"index": 0
}
]
]
},
"Alert on Failure or Recovery": {
"main": [
[
{
"node": "Send Discord Ops Alert",
"type": "main",
"index": 0
}
]
]
}
},
"settings": {
"executionOrder": "v1",
"callerPolicy": "workflowsFromSameOwner",
"availableInMCP": false
},
"staticData": {
"node:Every 6 Hours": {
"recurrenceRules": []
},
"global": {
"ragEmbedding": {
"failedRuns": 0,
"alerted": false,
"lastOk": "2026-05-14T18:50:22.108Z",
"lastStatus": "ok",
"durationMs": 13239
}
}
},
"meta": null,
"versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
"activeVersionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
"versionCounter": 9,
"triggerCount": 1,
"shared": [
{
"updatedAt": "2026-05-14T18:49:04.685Z",
"createdAt": "2026-05-14T18:49:04.685Z",
"role": "workflow:owner",
"workflowId": "SwKaPtYqUJrakpFu",
"projectId": "WGdp8QunI1tHpjXa",
"project": {
"updatedAt": "2026-03-11T21:08:10.005Z",
"createdAt": "2026-03-11T21:05:11.541Z",
"id": "WGdp8QunI1tHpjXa",
"name": "will will <will@wills-portal.com>",
"type": "personal",
"icon": null,
"description": null,
"creatorId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
}
}
],
"tags": [],
"activeVersion": {
"updatedAt": "2026-05-14T18:49:58.207Z",
"createdAt": "2026-05-14T18:49:58.207Z",
"versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
"workflowId": "SwKaPtYqUJrakpFu",
"nodes": [
{
"parameters": {},
"id": "bca0ccac-1102-4b45-a9e3-a52f06352376",
"name": "Manual Trigger",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
0,
100
]
},
{
"parameters": {
"rule": {
"interval": [
{
"field": "hours",
"hoursInterval": 6
}
]
}
},
"id": "3f5e4d1e-7e90-43d1-ae01-97dde40fbf28",
"name": "Every 6 Hours",
"type": "n8n-nodes-base.scheduleTrigger",
"typeVersion": 1.2,
"position": [
0,
-80
]
},
{
"parameters": {
"method": "POST",
"url": "http://172.19.0.1:18814/check",
"options": {
"timeout": 240000
}
},
"id": "52e14b9f-4ab4-4906-9ed7-0dbe10762c26",
"name": "Run RAG Health Check",
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
260,
20
]
},
{
"parameters": {
"mode": "runOnceForAllItems",
"jsCode": "const staticData = $getWorkflowStaticData('global');\nconst data = $input.first().json;\nconst now = new Date().toISOString();\nconst nl = String.fromCharCode(10);\nconst prev = staticData.ragEmbedding || { failedRuns: 0, alerted: false };\n\nif (data.ok) {\n const wasAlerted = prev.alerted;\n staticData.ragEmbedding = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: data.status, durationMs: data.durationMs };\n if (!wasAlerted) return [];\n return [{ json: { text: ['\u2705 RAG/Embedding health recovered', `- status=ok; duration=${data.durationMs}ms`, `checked=${now}`].join(nl), data } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = !prev.alerted || failedRuns % 4 === 0;\nstaticData.ragEmbedding = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: data.status, exitCode: data.exitCode, output: data.output };\nif (!shouldAlert) return [];\n\nconst output = (data.output || 'No output from checker').trim();\nconst lines = [\n '\ud83d\udea8 RAG/Embedding Health Watchdog',\n `- failedRuns=${failedRuns}; status=${data.status}; exit=${data.exitCode}; duration=${data.durationMs}ms`,\n output,\n 'fix=check systemctl --user status rag-embedding-health.service; then inspect Ollama 18807, ChromaDB, and Obsidian reindex 18810.',\n `checked=${now}`,\n];\nreturn [{ json: { text: lines.join(nl), data } }];"
},
"id": "6b435e3e-2efc-43da-b565-d5ecb819af1f",
"name": "Alert on Failure or Recovery",
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
520,
20
]
},
{
"parameters": {
"authentication": "predefinedCredentialType",
"nodeCredentialType": "httpHeaderAuth",
"method": "POST",
"url": "https://discord.com/api/v10/channels/1494453542243532932/messages",
"sendBody": true,
"specifyBody": "json",
"jsonBody": "={{ { content: $json.text } }}",
"options": {}
},
"id": "1ebabe7e-2dbc-4fa6-a63c-3d869314a5cf",
"name": "Send Discord Ops Alert",
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
800,
20
],
"credentials": {
"httpHeaderAuth": {
"id": "UgPqYcoCNNIgr55m",
"name": "Discord Bot Auth"
}
}
}
],
"connections": {
"Manual Trigger": {
"main": [
[
{
"node": "Run RAG Health Check",
"type": "main",
"index": 0
}
]
]
},
"Every 6 Hours": {
"main": [
[
{
"node": "Run RAG Health Check",
"type": "main",
"index": 0
}
]
]
},
"Run RAG Health Check": {
"main": [
[
{
"node": "Alert on Failure or Recovery",
"type": "main",
"index": 0
}
]
]
},
"Alert on Failure or Recovery": {
"main": [
[
{
"node": "Send Discord Ops Alert",
"type": "main",
"index": 0
}
]
]
}
},
"authors": "will will",
"name": null,
"description": null,
"autosaved": false,
"workflowPublishHistory": [
{
"createdAt": "2026-05-14T18:49:58.274Z",
"id": 1516,
"workflowId": "SwKaPtYqUJrakpFu",
"versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
"event": "activated",
"userId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
}
]
}
}
+14
View File
@@ -0,0 +1,14 @@
# Keeps the RAG/embedding health HTTP wrapper running as a long-lived service.
# NOTE(review): WantedBy=default.target suggests this is installed as a
# per-user unit (systemctl --user) — confirm against the deployment.
[Unit]
Description=RAG/Embedding Health HTTP Service (port 18814)
After=network.target
[Service]
Type=simple
ExecStart=/usr/bin/python3 /home/will/lab/swarm/scripts/rag-embedding-health-server.py
# Restart only after an unclean exit, waiting 5 seconds between attempts.
Restart=on-failure
RestartSec=5
# Presumably read by the server script at start-up: listening port and the
# per-check timeout (seconds) — verify against the script's environment lookups.
Environment=PORT=18814
Environment=RAG_HEALTH_TIMEOUT=180
[Install]
WantedBy=default.target