feat(n8n): migrate rag health watchdog
This commit is contained in:
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""RAG/embedding health HTTP wrapper for n8n.
|
||||
|
||||
Listens on 0.0.0.0:18814 so the n8n container can call it via
|
||||
http://172.19.0.1:18814.
|
||||
|
||||
Endpoints:
|
||||
GET /healthz -> service liveness
|
||||
POST /check -> run ~/.hermes/scripts/rag_embedding_health.py and return JSON
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import http.server
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# TCP port the server binds to; overridable through the PORT environment variable.
PORT = int(os.getenv("PORT", "18814"))

# Executable launched for POST /check; overridable through RAG_HEALTH_SCRIPT.
CHECK_SCRIPT = Path(
    os.getenv(
        "RAG_HEALTH_SCRIPT",
        "/home/will/.hermes/scripts/rag_embedding_health.py",
    )
)

# Seconds the check script may run before it is reported as timed out;
# overridable through RAG_HEALTH_TIMEOUT.
TIMEOUT = int(os.getenv("RAG_HEALTH_TIMEOUT", "180"))
|
||||
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
    """HTTP handler exposing the RAG/embedding health check as JSON.

    Routes:
        GET  /healthz -> static liveness payload.
        POST /check   -> run ``CHECK_SCRIPT`` and report its outcome.

    Every ``/check`` outcome (success, failure, timeout, launch error) is
    answered with HTTP 200; callers must inspect the ``ok`` / ``status``
    fields of the body. Any other path is answered with a 404 JSON error.
    A trailing slash on the path is ignored.
    """

    # Maximum number of characters of script output echoed back to the caller.
    _MAX_OUTPUT_CHARS = 4000

    def do_GET(self):
        """Serve the liveness probe; reply 404 for any other path."""
        if self.path.rstrip("/") == "/healthz":
            self._json({"status": "ok", "service": "rag-embedding-health"})
        else:
            self._json({"error": "not found"}, status=404)

    def do_POST(self):
        """Run the health check for ``/check``; reply 404 for any other path."""
        if self.path.rstrip("/") != "/check":
            self._json({"error": "not found"}, status=404)
            return

        # Build the payload first and send it exactly once: a socket error
        # while writing the response must not be mistaken for a check failure
        # and trigger a second write attempt.
        self._json(self._run_check(), status=200)

    def _run_check(self) -> dict:
        """Execute ``CHECK_SCRIPT`` and translate its outcome into a payload.

        Returns:
            A dict with the keys ``ok``, ``status``, ``exitCode``, ``output``
            and ``durationMs``. ``status`` is one of ``ok``, ``failed``
            (non-zero exit or missing script), ``timeout`` or ``error``.
        """
        # Monotonic clock: the measured duration is immune to wall-clock jumps.
        started = time.monotonic()

        def elapsed_ms() -> int:
            return int((time.monotonic() - started) * 1000)

        if not CHECK_SCRIPT.exists():
            # 127 mirrors the shell's "command not found" exit status.
            return self._result(
                "failed", 127, f"RAG health script missing: {CHECK_SCRIPT}", 0
            )

        # Inherit the service environment; only fill in values that are unset.
        env = os.environ.copy()
        env.setdefault("HERMES_HOME", "/home/will/.hermes")
        env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807")
        env.setdefault("N8N_URL", "http://127.0.0.1:18808")
        env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810")

        try:
            proc = subprocess.run(
                [str(CHECK_SCRIPT)],
                text=True,
                # Undecodable bytes in the script output must not turn a real
                # result into a decoding error.
                errors="replace",
                capture_output=True,
                timeout=TIMEOUT,
                check=False,
                env=env,
            )
        except subprocess.TimeoutExpired:
            # 124 mirrors the exit status used by coreutils `timeout`.
            return self._result(
                "timeout",
                124,
                f"RAG/embedding health check timed out after {TIMEOUT}s",
                elapsed_ms(),
            )
        except Exception as exc:
            # Boundary handler: any launch problem (not executable, bad
            # interpreter, ...) is reported in-band so the caller still
            # receives a well-formed JSON answer.
            return self._result(
                "error", 1, str(exc)[: self._MAX_OUTPUT_CHARS], elapsed_ms()
            )

        # Strip each stream before choosing, so whitespace-only stdout does
        # not hide a diagnostic written to stderr.
        output = (proc.stdout or "").strip() or (proc.stderr or "").strip()
        status = "ok" if proc.returncode == 0 else "failed"
        return self._result(
            status, proc.returncode, output[: self._MAX_OUTPUT_CHARS], elapsed_ms()
        )

    @staticmethod
    def _result(status: str, exit_code: int, output: str, duration_ms: int) -> dict:
        """Assemble the ``/check`` response body; ``ok`` is true only for status ``ok``."""
        return {
            "ok": status == "ok",
            "status": status,
            "exitCode": exit_code,
            "output": output,
            "durationMs": duration_ms,
        }

    def _json(self, data, status=200):
        """Serialize *data* as indented UTF-8 JSON and send it with *status*."""
        body = json.dumps(data, indent=2).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Discard per-request log lines (overrides the base class logger)."""
        return
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Bind on every interface and serve requests until the process is stopped.
    bind_address = ("0.0.0.0", PORT)
    httpd = http.server.HTTPServer(bind_address, Handler)
    print(f"rag-embedding-health listening on 0.0.0.0:{PORT}", flush=True)
    httpd.serve_forever()
|
||||
@@ -0,0 +1,345 @@
|
||||
{
|
||||
"updatedAt": "2026-05-14T18:49:58.205Z",
|
||||
"createdAt": "2026-05-14T18:49:04.674Z",
|
||||
"id": "SwKaPtYqUJrakpFu",
|
||||
"name": "RAG and Embedding Health Watchdog",
|
||||
"description": null,
|
||||
"active": true,
|
||||
"isArchived": false,
|
||||
"nodes": [
|
||||
{
|
||||
"parameters": {},
|
||||
"id": "bca0ccac-1102-4b45-a9e3-a52f06352376",
|
||||
"name": "Manual Trigger",
|
||||
"type": "n8n-nodes-base.manualTrigger",
|
||||
"typeVersion": 1,
|
||||
"position": [
|
||||
0,
|
||||
100
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"rule": {
|
||||
"interval": [
|
||||
{
|
||||
"field": "hours",
|
||||
"hoursInterval": 6
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"id": "3f5e4d1e-7e90-43d1-ae01-97dde40fbf28",
|
||||
"name": "Every 6 Hours",
|
||||
"type": "n8n-nodes-base.scheduleTrigger",
|
||||
"typeVersion": 1.2,
|
||||
"position": [
|
||||
0,
|
||||
-80
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"method": "POST",
|
||||
"url": "http://172.19.0.1:18814/check",
|
||||
"options": {
|
||||
"timeout": 240000
|
||||
}
|
||||
},
|
||||
"id": "52e14b9f-4ab4-4906-9ed7-0dbe10762c26",
|
||||
"name": "Run RAG Health Check",
|
||||
"type": "n8n-nodes-base.httpRequest",
|
||||
"typeVersion": 4.2,
|
||||
"position": [
|
||||
260,
|
||||
20
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"mode": "runOnceForAllItems",
|
||||
"jsCode": "const staticData = $getWorkflowStaticData('global');\nconst data = $input.first().json;\nconst now = new Date().toISOString();\nconst nl = String.fromCharCode(10);\nconst prev = staticData.ragEmbedding || { failedRuns: 0, alerted: false };\n\nif (data.ok) {\n const wasAlerted = prev.alerted;\n staticData.ragEmbedding = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: data.status, durationMs: data.durationMs };\n if (!wasAlerted) return [];\n return [{ json: { text: ['\u2705 RAG/Embedding health recovered', `- status=ok; duration=${data.durationMs}ms`, `checked=${now}`].join(nl), data } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = !prev.alerted || failedRuns % 4 === 0;\nstaticData.ragEmbedding = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: data.status, exitCode: data.exitCode, output: data.output };\nif (!shouldAlert) return [];\n\nconst output = (data.output || 'No output from checker').trim();\nconst lines = [\n '\ud83d\udea8 RAG/Embedding Health Watchdog',\n `- failedRuns=${failedRuns}; status=${data.status}; exit=${data.exitCode}; duration=${data.durationMs}ms`,\n output,\n 'fix=check systemctl --user status rag-embedding-health.service; then inspect Ollama 18807, ChromaDB, and Obsidian reindex 18810.',\n `checked=${now}`,\n];\nreturn [{ json: { text: lines.join(nl), data } }];"
|
||||
},
|
||||
"id": "6b435e3e-2efc-43da-b565-d5ecb819af1f",
|
||||
"name": "Alert on Failure or Recovery",
|
||||
"type": "n8n-nodes-base.code",
|
||||
"typeVersion": 2,
|
||||
"position": [
|
||||
520,
|
||||
20
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"authentication": "predefinedCredentialType",
|
||||
"nodeCredentialType": "httpHeaderAuth",
|
||||
"method": "POST",
|
||||
"url": "https://discord.com/api/v10/channels/1494453542243532932/messages",
|
||||
"sendBody": true,
|
||||
"specifyBody": "json",
|
||||
"jsonBody": "={{ { content: $json.text } }}",
|
||||
"options": {}
|
||||
},
|
||||
"id": "1ebabe7e-2dbc-4fa6-a63c-3d869314a5cf",
|
||||
"name": "Send Discord Ops Alert",
|
||||
"type": "n8n-nodes-base.httpRequest",
|
||||
"typeVersion": 4.2,
|
||||
"position": [
|
||||
800,
|
||||
20
|
||||
],
|
||||
"credentials": {
|
||||
"httpHeaderAuth": {
|
||||
"id": "UgPqYcoCNNIgr55m",
|
||||
"name": "Discord Bot Auth"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"connections": {
|
||||
"Manual Trigger": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Run RAG Health Check",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Every 6 Hours": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Run RAG Health Check",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Run RAG Health Check": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Alert on Failure or Recovery",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Alert on Failure or Recovery": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Send Discord Ops Alert",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
"settings": {
|
||||
"executionOrder": "v1",
|
||||
"callerPolicy": "workflowsFromSameOwner",
|
||||
"availableInMCP": false
|
||||
},
|
||||
"staticData": {
|
||||
"node:Every 6 Hours": {
|
||||
"recurrenceRules": []
|
||||
},
|
||||
"global": {
|
||||
"ragEmbedding": {
|
||||
"failedRuns": 0,
|
||||
"alerted": false,
|
||||
"lastOk": "2026-05-14T18:50:22.108Z",
|
||||
"lastStatus": "ok",
|
||||
"durationMs": 13239
|
||||
}
|
||||
}
|
||||
},
|
||||
"meta": null,
|
||||
"versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
|
||||
"activeVersionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
|
||||
"versionCounter": 9,
|
||||
"triggerCount": 1,
|
||||
"shared": [
|
||||
{
|
||||
"updatedAt": "2026-05-14T18:49:04.685Z",
|
||||
"createdAt": "2026-05-14T18:49:04.685Z",
|
||||
"role": "workflow:owner",
|
||||
"workflowId": "SwKaPtYqUJrakpFu",
|
||||
"projectId": "WGdp8QunI1tHpjXa",
|
||||
"project": {
|
||||
"updatedAt": "2026-03-11T21:08:10.005Z",
|
||||
"createdAt": "2026-03-11T21:05:11.541Z",
|
||||
"id": "WGdp8QunI1tHpjXa",
|
||||
"name": "will will <will@wills-portal.com>",
|
||||
"type": "personal",
|
||||
"icon": null,
|
||||
"description": null,
|
||||
"creatorId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
|
||||
}
|
||||
}
|
||||
],
|
||||
"tags": [],
|
||||
"activeVersion": {
|
||||
"updatedAt": "2026-05-14T18:49:58.207Z",
|
||||
"createdAt": "2026-05-14T18:49:58.207Z",
|
||||
"versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
|
||||
"workflowId": "SwKaPtYqUJrakpFu",
|
||||
"nodes": [
|
||||
{
|
||||
"parameters": {},
|
||||
"id": "bca0ccac-1102-4b45-a9e3-a52f06352376",
|
||||
"name": "Manual Trigger",
|
||||
"type": "n8n-nodes-base.manualTrigger",
|
||||
"typeVersion": 1,
|
||||
"position": [
|
||||
0,
|
||||
100
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"rule": {
|
||||
"interval": [
|
||||
{
|
||||
"field": "hours",
|
||||
"hoursInterval": 6
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"id": "3f5e4d1e-7e90-43d1-ae01-97dde40fbf28",
|
||||
"name": "Every 6 Hours",
|
||||
"type": "n8n-nodes-base.scheduleTrigger",
|
||||
"typeVersion": 1.2,
|
||||
"position": [
|
||||
0,
|
||||
-80
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"method": "POST",
|
||||
"url": "http://172.19.0.1:18814/check",
|
||||
"options": {
|
||||
"timeout": 240000
|
||||
}
|
||||
},
|
||||
"id": "52e14b9f-4ab4-4906-9ed7-0dbe10762c26",
|
||||
"name": "Run RAG Health Check",
|
||||
"type": "n8n-nodes-base.httpRequest",
|
||||
"typeVersion": 4.2,
|
||||
"position": [
|
||||
260,
|
||||
20
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"mode": "runOnceForAllItems",
|
||||
"jsCode": "const staticData = $getWorkflowStaticData('global');\nconst data = $input.first().json;\nconst now = new Date().toISOString();\nconst nl = String.fromCharCode(10);\nconst prev = staticData.ragEmbedding || { failedRuns: 0, alerted: false };\n\nif (data.ok) {\n const wasAlerted = prev.alerted;\n staticData.ragEmbedding = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: data.status, durationMs: data.durationMs };\n if (!wasAlerted) return [];\n return [{ json: { text: ['\u2705 RAG/Embedding health recovered', `- status=ok; duration=${data.durationMs}ms`, `checked=${now}`].join(nl), data } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = !prev.alerted || failedRuns % 4 === 0;\nstaticData.ragEmbedding = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: data.status, exitCode: data.exitCode, output: data.output };\nif (!shouldAlert) return [];\n\nconst output = (data.output || 'No output from checker').trim();\nconst lines = [\n '\ud83d\udea8 RAG/Embedding Health Watchdog',\n `- failedRuns=${failedRuns}; status=${data.status}; exit=${data.exitCode}; duration=${data.durationMs}ms`,\n output,\n 'fix=check systemctl --user status rag-embedding-health.service; then inspect Ollama 18807, ChromaDB, and Obsidian reindex 18810.',\n `checked=${now}`,\n];\nreturn [{ json: { text: lines.join(nl), data } }];"
|
||||
},
|
||||
"id": "6b435e3e-2efc-43da-b565-d5ecb819af1f",
|
||||
"name": "Alert on Failure or Recovery",
|
||||
"type": "n8n-nodes-base.code",
|
||||
"typeVersion": 2,
|
||||
"position": [
|
||||
520,
|
||||
20
|
||||
]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"authentication": "predefinedCredentialType",
|
||||
"nodeCredentialType": "httpHeaderAuth",
|
||||
"method": "POST",
|
||||
"url": "https://discord.com/api/v10/channels/1494453542243532932/messages",
|
||||
"sendBody": true,
|
||||
"specifyBody": "json",
|
||||
"jsonBody": "={{ { content: $json.text } }}",
|
||||
"options": {}
|
||||
},
|
||||
"id": "1ebabe7e-2dbc-4fa6-a63c-3d869314a5cf",
|
||||
"name": "Send Discord Ops Alert",
|
||||
"type": "n8n-nodes-base.httpRequest",
|
||||
"typeVersion": 4.2,
|
||||
"position": [
|
||||
800,
|
||||
20
|
||||
],
|
||||
"credentials": {
|
||||
"httpHeaderAuth": {
|
||||
"id": "UgPqYcoCNNIgr55m",
|
||||
"name": "Discord Bot Auth"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"connections": {
|
||||
"Manual Trigger": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Run RAG Health Check",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Every 6 Hours": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Run RAG Health Check",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Run RAG Health Check": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Alert on Failure or Recovery",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Alert on Failure or Recovery": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Send Discord Ops Alert",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
"authors": "will will",
|
||||
"name": null,
|
||||
"description": null,
|
||||
"autosaved": false,
|
||||
"workflowPublishHistory": [
|
||||
{
|
||||
"createdAt": "2026-05-14T18:49:58.274Z",
|
||||
"id": 1516,
|
||||
"workflowId": "SwKaPtYqUJrakpFu",
|
||||
"versionId": "b6be4349-5960-40cd-b857-bd6c9c6c717f",
|
||||
"event": "activated",
|
||||
"userId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
[Unit]
|
||||
Description=RAG/Embedding Health HTTP Service (port 18814)
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/bin/python3 /home/will/lab/swarm/scripts/rag-embedding-health-server.py
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
Environment=PORT=18814
|
||||
Environment=RAG_HEALTH_TIMEOUT=180
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
Reference in New Issue
Block a user