From b3eefc4d14ff630267814432002ed28157805efd Mon Sep 17 00:00:00 2001 From: William Valentin Date: Wed, 13 May 2026 14:33:48 -0700 Subject: [PATCH] feat: add systemd service file and updated n8n watchdog workflow - docker-health-endpoint.service (systemd user unit) - swarm-health-watchdog.json with Docker health enrichment - Calls http://172.19.0.1:18809/health for container state - Includes docker status/health/restarts in alert messages - Adds docker field to service check results Task: t_461f71fe --- swarm-common/docker-health-endpoint.service | 13 +++++++++++++ .../n8n-workflows/swarm-health-watchdog.json | 1 + 2 files changed, 14 insertions(+) create mode 100644 swarm-common/docker-health-endpoint.service create mode 100644 swarm-common/n8n-workflows/swarm-health-watchdog.json diff --git a/swarm-common/docker-health-endpoint.service b/swarm-common/docker-health-endpoint.service new file mode 100644 index 0000000..c8630d3 --- /dev/null +++ b/swarm-common/docker-health-endpoint.service @@ -0,0 +1,13 @@ +[Unit] +Description=Docker Health Endpoint (port 18809) +After=local-fs.target docker.service + +[Service] +Type=simple +ExecStart=/usr/bin/python3 /home/will/lab/swarm/scripts/docker-health-server.py +Restart=on-failure +RestartSec=3 +Environment=PORT=18809 + +[Install] +WantedBy=default.target diff --git a/swarm-common/n8n-workflows/swarm-health-watchdog.json b/swarm-common/n8n-workflows/swarm-health-watchdog.json new file mode 100644 index 0000000..bcf6708 --- /dev/null +++ b/swarm-common/n8n-workflows/swarm-health-watchdog.json @@ -0,0 +1 @@ +{"updatedAt":"2026-05-13T21:33:29.860Z","createdAt":"2026-05-12T17:48:01.214Z","id":"lDKocSFXBQWQrDd3","name":"Swarm Health Watchdog","description":"Every 15 minutes, checks core swarm endpoints from inside n8n. Alerts after two consecutive failures and reports recoveries to Telegram and Discord.","active":true,"isArchived":false,"nodes":[{"parameters":{},"type":"n8n-nodes-base.manualTrigger","typeVersion":1,"position":[-620,-100],"id":"3759f3cd-fa90-49b6-ad08-322d21f3d727","name":"Manual Trigger"},{"parameters":{"rule":{"interval":[{"field":"minutes","minutesInterval":15}]}},"type":"n8n-nodes-base.scheduleTrigger","typeVersion":1.3,"position":[-620,100],"id":"9d209ddb-8da7-48ad-850c-ec0e452760ca","name":"Every 15 Minutes"},{"parameters":{"mode":"runOnceForAllItems","jsCode":"const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n timeoutMs: 5000,\n failureThreshold: 2,\n reminderEveryFailedRuns: 6,\n};\nconst services = [\n { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n { key: 'whisper', name: 'Whisper', port: 18811, url: 'http://172.19.0.1:18811/', ok: [200, 404], docker: 'whisper-server' },\n];\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n const dhRes = await fetch('http://172.19.0.1:18809/health', { signal: AbortSignal.timeout(3000) });\n if (dhRes.ok) {\n const dhData = await dhRes.json();\n for (const c of (dhData.containers || [])) {\n dockerHealth[c.name] = c;\n }\n }\n} catch (_) {\n // Docker health endpoint unavailable - continue without it\n}\n\nasync function check(svc) {\n const controller = new AbortController();\n const timer = setTimeout(() => controller.abort(), CONFIG.timeoutMs);\n const started = Date.now();\n try {\n const res = await fetch(svc.url, { method: 'GET', signal: controller.signal });\n const ms = Date.now() - started;\n const body = await res.text().catch(() => '');\n return {\n ...svc,\n healthy: svc.ok.includes(res.status),\n status: res.status,\n ms,\n detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } catch (error) {\n return {\n ...svc,\n healthy: false,\n status: 'error',\n ms: Date.now() - started,\n detail: error.message,\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } finally {\n clearTimeout(timer);\n }\n}\n\nfunction suggestedFix(r) {\n const dh = r.docker;\n const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n if (r.healthy) {\n if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n } else {\n const failedRuns = (prev.failedRuns || 0) + 1;\n const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n }\n}\n\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n lines.push('\\u{1F6A8} Swarm Health Watchdog');\n for (const a of alerts) {\n const dh = a.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n }\n}\nif (recoveries.length) {\n lines.push('\\u2705 Swarm service recovered');\n for (const r of recoveries) {\n const dh = r.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];"},"type":"n8n-nodes-base.code","typeVersion":2,"position":[-340,0],"id":"b3f76d53-204b-45bb-9a48-8cf20262319d","name":"Check Swarm Services"},{"parameters":{"chatId":"8367012007","text":"={{$json.text}}","additionalFields":{"parse_mode":"Markdown"}},"type":"n8n-nodes-base.telegram","typeVersion":1.2,"position":[-80,-80],"id":"32d7ad9f-80bb-4acf-b546-89f04db32a6a","name":"Send Telegram Alert","credentials":{"telegramApi":{"id":"aox4dyIWVSRdcH5z","name":"Telegram Bot (OpenClaw)"}}},{"parameters":{"authentication":"predefinedCredentialType","nodeCredentialType":"httpHeaderAuth","method":"POST","url":"https://discord.com/api/v10/channels/425781661268049931/messages","sendBody":true,"specifyBody":"json","jsonBody":"={{ { content: $json.text } }}","options":{}},"type":"n8n-nodes-base.httpRequest","typeVersion":4.2,"position":[-80,100],"id":"7eb589f5-6e50-4e1e-8a37-391f06785ad87","name":"Send Discord Alert","credentials":{"httpHeaderAuth":{"id":"UgPqYcoCNNIgr55m","name":"Discord Bot Auth"}}}],"connections":{"Manual Trigger":{"main":[[{"node":"Check Swarm Services","type":"main","index":0}]]},"Every 15 Minutes":{"main":[[{"node":"Check Swarm Services","type":"main","index":0}]]},"Check Swarm Services":{"main":[[{"node":"Send Telegram Alert","type":"main","index":0},{"node":"Send Discord Alert","type":"main","index":0}]]}},"settings":{"executionOrder":"v1","timezone":"America/Los_Angeles","saveDataErrorExecution":"all","saveDataSuccessExecution":"none","callerPolicy":"workflowsFromSameOwner","availableInMCP":false},"staticData":{"node:Every 15 Minutes":{"recurrenceRules":[]}},"meta":null,"pinData":null,"versionId":"0be0d265-2373-49f8-9066-a9c8aabb7861","activeVersionId":"0be0d265-2373-49f8-9066-a9c8aabb7861","versionCounter":28,"triggerCount":1,"shared":[{"updatedAt":"2026-05-12T17:39:10.124Z","createdAt":"2026-05-12T17:39:10.124Z","role":"workflow:owner","workflowId":"lDKocSFXBQWQrDd3","projectId":"WGdp8QunI1tHpjXa","project":{"updatedAt":"2026-03-11T21:08:10.005Z","createdAt":"2026-03-11T21:05:11.541Z","id":"WGdp8QunI1tHpjXa","name":"will will ","type":"personal","icon":null,"description":null,"creatorId":"5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"}}],"tags":[],"activeVersion":{"updatedAt":"2026-05-13T21:33:29.861Z","createdAt":"2026-05-13T21:33:29.861Z","versionId":"0be0d265-2373-49f8-9066-a9c8aabb7861","workflowId":"lDKocSFXBQWQrDd3","nodes":[{"parameters":{},"type":"n8n-nodes-base.manualTrigger","typeVersion":1,"position":[-620,-100],"id":"3759f3cd-fa90-49b6-ad08-322d21f3d727","name":"Manual Trigger"},{"parameters":{"rule":{"interval":[{"field":"minutes","minutesInterval":15}]}},"type":"n8n-nodes-base.scheduleTrigger","typeVersion":1.3,"position":[-620,100],"id":"9d209ddb-8da7-48ad-850c-ec0e452760ca","name":"Every 15 Minutes"},{"parameters":{"mode":"runOnceForAllItems","jsCode":"const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n timeoutMs: 5000,\n failureThreshold: 2,\n reminderEveryFailedRuns: 6,\n};\nconst services = [\n { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n { key: 'whisper', name: 'Whisper', port: 18811, url: 'http://172.19.0.1:18811/', ok: [200, 404], docker: 'whisper-server' },\n];\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n const dhRes = await fetch('http://172.19.0.1:18809/health', { signal: AbortSignal.timeout(3000) });\n if (dhRes.ok) {\n const dhData = await dhRes.json();\n for (const c of (dhData.containers || [])) {\n dockerHealth[c.name] = c;\n }\n }\n} catch (_) {\n // Docker health endpoint unavailable - continue without it\n}\n\nasync function check(svc) {\n const controller = new AbortController();\n const timer = setTimeout(() => controller.abort(), CONFIG.timeoutMs);\n const started = Date.now();\n try {\n const res = await fetch(svc.url, { method: 'GET', signal: controller.signal });\n const ms = Date.now() - started;\n const body = await res.text().catch(() => '');\n return {\n ...svc,\n healthy: svc.ok.includes(res.status),\n status: res.status,\n ms,\n detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } catch (error) {\n return {\n ...svc,\n healthy: false,\n status: 'error',\n ms: Date.now() - started,\n detail: error.message,\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } finally {\n clearTimeout(timer);\n }\n}\n\nfunction suggestedFix(r) {\n const dh = r.docker;\n const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n if (r.healthy) {\n if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n } else {\n const failedRuns = (prev.failedRuns || 0) + 1;\n const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n }\n}\n\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n lines.push('\\u{1F6A8} Swarm Health Watchdog');\n for (const a of alerts) {\n const dh = a.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n }\n}\nif (recoveries.length) {\n lines.push('\\u2705 Swarm service recovered');\n for (const r of recoveries) {\n const dh = r.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];"},"type":"n8n-nodes-base.code","typeVersion":2,"position":[-340,0],"id":"b3f76d53-204b-45bb-9a48-8cf20262319d","name":"Check Swarm Services"},{"parameters":{"chatId":"8367012007","text":"={{$json.text}}","additionalFields":{"parse_mode":"Markdown"}},"type":"n8n-nodes-base.telegram","typeVersion":1.2,"position":[-80,-80],"id":"32d7ad9f-80bb-4acf-b546-89f04db32a6a","name":"Send Telegram Alert","credentials":{"telegramApi":{"id":"aox4dyIWVSRdcH5z","name":"Telegram Bot (OpenClaw)"}}},{"parameters":{"authentication":"predefinedCredentialType","nodeCredentialType":"httpHeaderAuth","method":"POST","url":"https://discord.com/api/v10/channels/425781661268049931/messages","sendBody":true,"specifyBody":"json","jsonBody":"={{ { content: $json.text } }}","options":{}},"type":"n8n-nodes-base.httpRequest","typeVersion":4.2,"position":[-80,100],"id":"7eb589f5-6e50-4e1e-8a37-391f06785ad87","name":"Send Discord Alert","credentials":{"httpHeaderAuth":{"id":"UgPqYcoCNNIgr55m","name":"Discord Bot Auth"}}}],"connections":{"Manual Trigger":{"main":[[{"node":"Check Swarm Services","type":"main","index":0}]]},"Every 15 Minutes":{"main":[[{"node":"Check Swarm Services","type":"main","index":0}]]},"Check Swarm Services":{"main":[[{"node":"Send Telegram Alert","type":"main","index":0},{"node":"Send Discord Alert","type":"main","index":0}]]}},"authors":"will will","name":null,"description":null,"autosaved":false,"workflowPublishHistory":[{"createdAt":"2026-05-13T21:33:29.886Z","id":1431,"workflowId":"lDKocSFXBQWQrDd3","versionId":"0be0d265-2373-49f8-9066-a9c8aabb7861","event":"activated","userId":"5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"},{"createdAt":"2026-05-13T21:33:29.877Z","id":1430,"workflowId":"lDKocSFXBQWQrDd3","versionId":"0be0d265-2373-49f8-9066-a9c8aabb7861","event":"deactivated","userId":"5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"}]}} \ No newline at end of file