Files
swarm-master/swarm-common/n8n-workflows/swarm-health-watchdog.json
T
2026-06-04 16:26:05 -07:00

416 lines
22 KiB
JSON

{
"updatedAt": "2026-05-14T00:32:57.803Z",
"createdAt": "2026-05-12T17:48:01.214Z",
"id": "lDKocSFXBQWQrDd3",
"name": "Swarm Health Watchdog",
"description": "Every 15 minutes, checks core swarm endpoints from inside n8n. Alerts after two consecutive failures and reports recoveries to Telegram and Discord.",
"active": true,
"isArchived": false,
"nodes": [
{
"parameters": {},
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
-620,
-100
],
"id": "3759f3cd-fa90-49b6-ad08-322d21f3d727",
"name": "Manual Trigger"
},
{
"parameters": {
"rule": {
"interval": [
{
"field": "minutes",
"minutesInterval": 15
}
]
}
},
"type": "n8n-nodes-base.scheduleTrigger",
"typeVersion": 1.3,
"position": [
-620,
100
],
"id": "9d209ddb-8da7-48ad-850c-ec0e452760ca",
"name": "Every 15 Minutes"
},
{
  "parameters": {
    "mode": "runOnceForAllItems",
    "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n  timeoutMs: 60000,\n  failureThreshold: 2,\n  reminderEveryFailedRuns: 6,\n  detailMaxChars: 160,\n};\nconst services = [\n  { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n  { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n  { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n  { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n  { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n  { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n  { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n  { key: 'whisper', name: 'Whisper NPU', port: 18816, url: 'http://172.19.0.1:18816/', ok: [200, 404], docker: 'whisper-server-npu' },\n  { key: 'advisory_gateway', name: 'OpenVINO Advisory Gateway', port: 18830, url: 'http://172.19.0.1:18830/healthz', ok: [200] },\n];\n\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\n// Present an httpRequest response through the small fetch-style surface (status, ok, text, json) used below.\nfunction responseLike(response) {\n  const status = response.statusCode || response.status;\n  const body = response.body === undefined || response.body === null ? '' : response.body;\n  return {\n    status,\n    ok: status >= 200 && status < 300,\n    async text() {\n      return typeof body === 'string' ? body : JSON.stringify(body);\n    },\n    async json() {\n      if (typeof body === 'string') return JSON.parse(body);\n      return body;\n    },\n  };\n}\n\n// Issue one request with a timeout. An error that carries a response is turned back into a response object;\n// anything else (timeout, refused connection) is rethrown for the caller to record.\nasync function fetchWithTimeout(url, options = {}, timeoutMs = CONFIG.timeoutMs) {\n  const method = options.method || 'GET';\n  try {\n    const response = await httpRequest({\n      method,\n      url,\n      timeout: timeoutMs,\n      json: false,\n      simple: false,\n      resolveWithFullResponse: true,\n      returnFullResponse: true,\n      ignoreHttpStatusErrors: true,\n    });\n    return responseLike(response);\n  } catch (error) {\n    if (error && error.response) return responseLike(error.response);\n    throw error;\n  }\n}\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n  const dhRes = await fetchWithTimeout('http://172.19.0.1:18809/health', { method: 'GET' }, 3000);\n  if (dhRes.ok) {\n    const dhData = await dhRes.json();\n    for (const c of (dhData.containers || [])) {\n      dockerHealth[c.name] = c;\n    }\n  }\n} catch (_) {\n  // Docker health endpoint unavailable - continue without it\n}\n\n// Shorten and flatten a response body or error message so one noisy value cannot flood the alert line.\nfunction summarize(value) {\n  const text = value === undefined || value === null ? '' : String(value);\n  return text.slice(0, CONFIG.detailMaxChars).replace(/\\s+/g, ' ').trim();\n}\n\n// Docker record for a service: null when the service has no container name,\n// otherwise the reported record or an explicit unknown placeholder.\nfunction dockerInfoFor(svc) {\n  if (!svc.docker) return null;\n  return dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 };\n}\n\n// Probe one service; never throws, a failed request becomes an unhealthy result with status error.\nasync function check(svc) {\n  const started = Date.now();\n  try {\n    const res = await fetchWithTimeout(svc.url, { method: 'GET' }, CONFIG.timeoutMs);\n    const ms = Date.now() - started;\n    const body = await res.text().catch(() => '');\n    return {\n      ...svc,\n      healthy: svc.ok.includes(res.status),\n      status: res.status,\n      ms,\n      detail: summarize(body),\n      docker: dockerInfoFor(svc),\n    };\n  } catch (error) {\n    return {\n      ...svc,\n      healthy: false,\n      status: 'error',\n      ms: Date.now() - started,\n      detail: summarize(error && error.message ? error.message : error),\n      docker: dockerInfoFor(svc),\n    };\n  }\n}\n\n// Operator hint appended to each alert line, chosen by service key.\nfunction suggestedFix(r) {\n  const dh = r.docker;\n  const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n  if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n  if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n  if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n  if (r.key === 'advisory_gateway') return 'systemctl --user status openvino-advisory-gateway.service; verify http://172.19.0.1:18830/healthz from n8n-agent.' + dockerInfo;\n  if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n  return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n  const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n  if (r.healthy) {\n    // Only announce a recovery for a service that had actually been alerted on.\n    if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n    staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n  } else {\n    const failedRuns = (prev.failedRuns || 0) + 1;\n    // First alert once the threshold is reached, then a reminder whenever failedRuns is a multiple of reminderEveryFailedRuns.\n    const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n    staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n    if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n  }\n}\n\n// Nothing to report: emit no items so the notification nodes do not run.\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n  lines.push('\\u{1F6A8} Swarm Health Watchdog');\n  for (const a of alerts) {\n    const dh = a.docker;\n    const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n    lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n  }\n}\nif (recoveries.length) {\n  lines.push('\\u2705 Swarm service recovered');\n  for (const r of recoveries) {\n    const dh = r.docker;\n    const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n    lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n  }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];"
  },
  "type": "n8n-nodes-base.code",
  "typeVersion": 2,
  "position": [
    -340,
    0
  ],
  "id": "b3f76d53-204b-45bb-9a48-8cf20262319d",
  "name": "Check Swarm Services"
},
{
"parameters": {
"chatId": "8367012007",
"text": "={{$json.text}}",
"additionalFields": {
"parse_mode": "Markdown"
}
},
"type": "n8n-nodes-base.telegram",
"typeVersion": 1.2,
"position": [
-80,
-80
],
"id": "32d7ad9f-80bb-4acf-b546-89f04db32a6a",
"name": "Send Telegram Alert",
"credentials": {
"telegramApi": {
"id": "aox4dyIWVSRdcH5z",
"name": "Telegram Bot (OpenClaw)"
}
}
},
{
  "parameters": {
    "authentication": "predefinedCredentialType",
    "nodeCredentialType": "httpHeaderAuth",
    "method": "POST",
    "url": "https://discord.com/api/v10/channels/425781661268049931/messages",
    "sendBody": true,
    "specifyBody": "json",
    "jsonBody": "={{ { content: $json.text.length > 2000 ? $json.text.slice(0, 1997) + '...' : $json.text } }}",
    "options": {}
  },
  "type": "n8n-nodes-base.httpRequest",
  "typeVersion": 4.2,
  "position": [
    -80,
    100
  ],
  "id": "7eb589f5-6e50-4e1e-8a37-391f06785ad87",
  "name": "Send Discord Alert",
  "credentials": {
    "httpHeaderAuth": {
      "id": "UgPqYcoCNNIgr55m",
      "name": "Discord Bot Auth"
    }
  }
}
],
"connections": {
"Manual Trigger": {
"main": [
[
{
"node": "Check Swarm Services",
"type": "main",
"index": 0
}
]
]
},
"Every 15 Minutes": {
"main": [
[
{
"node": "Check Swarm Services",
"type": "main",
"index": 0
}
]
]
},
"Check Swarm Services": {
"main": [
[
{
"node": "Send Telegram Alert",
"type": "main",
"index": 0
},
{
"node": "Send Discord Alert",
"type": "main",
"index": 0
}
]
]
}
},
"settings": {
"executionOrder": "v1",
"timezone": "America/Los_Angeles",
"saveDataErrorExecution": "all",
"saveDataSuccessExecution": "all",
"callerPolicy": "workflowsFromSameOwner",
"availableInMCP": false
},
"staticData": {
"node:Every 15 Minutes": {
"recurrenceRules": []
},
"global": {
"services": {
"brave": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"searxng": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"litellm": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"kokoro": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"llamacpp": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"ollama": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"n8n": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
},
"whisper": {
"failedRuns": 4,
"alerted": true,
"lastFailure": "2026-05-14T00:30:40.067Z",
"lastStatus": "error",
"lastDetail": "fetch is not defined"
}
}
}
},
"meta": null,
"versionId": "eec5521b-fb44-44ea-b238-aff842560f98",
"activeVersionId": "eec5521b-fb44-44ea-b238-aff842560f98",
"versionCounter": 52,
"triggerCount": 1,
"shared": [
{
"updatedAt": "2026-05-12T17:39:10.124Z",
"createdAt": "2026-05-12T17:39:10.124Z",
"role": "workflow:owner",
"workflowId": "lDKocSFXBQWQrDd3",
"projectId": "WGdp8QunI1tHpjXa",
"project": {
"updatedAt": "2026-03-11T21:08:10.005Z",
"createdAt": "2026-03-11T21:05:11.541Z",
"id": "WGdp8QunI1tHpjXa",
"name": "will will <will@wills-portal.com>",
"type": "personal",
"icon": null,
"description": null,
"creatorId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
}
}
],
"tags": [],
"activeVersion": {
"updatedAt": "2026-05-14T00:09:09.316Z",
"createdAt": "2026-05-14T00:09:09.316Z",
"versionId": "eec5521b-fb44-44ea-b238-aff842560f98",
"workflowId": "lDKocSFXBQWQrDd3",
"nodes": [
{
"parameters": {},
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
-620,
-100
],
"id": "3759f3cd-fa90-49b6-ad08-322d21f3d727",
"name": "Manual Trigger"
},
{
"parameters": {
"rule": {
"interval": [
{
"field": "minutes",
"minutesInterval": 15
}
]
}
},
"type": "n8n-nodes-base.scheduleTrigger",
"typeVersion": 1.3,
"position": [
-620,
100
],
"id": "9d209ddb-8da7-48ad-850c-ec0e452760ca",
"name": "Every 15 Minutes"
},
{
  "parameters": {
    "mode": "runOnceForAllItems",
    "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n  timeoutMs: 5000,\n  failureThreshold: 2,\n  reminderEveryFailedRuns: 6,\n};\nconst services = [\n  { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n  { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n  { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n  { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n  { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n  { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n  { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n  { key: 'whisper', name: 'Whisper NPU', port: 18816, url: 'http://172.19.0.1:18816/', ok: [200, 404], docker: 'whisper-server-npu' },\n];\n\n// The global fetch is unavailable in this Code node runtime (recorded failures read: fetch is not defined),\n// so every request goes through the n8n HTTP helper instead.\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\n// Present an httpRequest response through the small fetch-style surface (status, ok, text, json) used below.\nfunction responseLike(response) {\n  const status = response.statusCode || response.status;\n  const body = response.body === undefined || response.body === null ? '' : response.body;\n  return {\n    status,\n    ok: status >= 200 && status < 300,\n    async text() {\n      return typeof body === 'string' ? body : JSON.stringify(body);\n    },\n    async json() {\n      if (typeof body === 'string') return JSON.parse(body);\n      return body;\n    },\n  };\n}\n\n// Issue one request with a timeout. An error that carries a response is turned back into a response object;\n// anything else (timeout, refused connection) is rethrown for the caller to record.\nasync function fetchWithTimeout(url, options = {}, timeoutMs = CONFIG.timeoutMs) {\n  const method = options.method || 'GET';\n  try {\n    const response = await httpRequest({\n      method,\n      url,\n      timeout: timeoutMs,\n      json: false,\n      simple: false,\n      resolveWithFullResponse: true,\n      returnFullResponse: true,\n      ignoreHttpStatusErrors: true,\n    });\n    return responseLike(response);\n  } catch (error) {\n    if (error && error.response) return responseLike(error.response);\n    throw error;\n  }\n}\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n  const dhRes = await fetchWithTimeout('http://172.19.0.1:18809/health', { method: 'GET' }, 3000);\n  if (dhRes.ok) {\n    const dhData = await dhRes.json();\n    for (const c of (dhData.containers || [])) {\n      dockerHealth[c.name] = c;\n    }\n  }\n} catch (_) {\n  // Docker health endpoint unavailable - continue without it\n}\n\n// Probe one service; never throws, a failed request becomes an unhealthy result with status error.\nasync function check(svc) {\n  const started = Date.now();\n  try {\n    const res = await fetchWithTimeout(svc.url, { method: 'GET' }, CONFIG.timeoutMs);\n    const ms = Date.now() - started;\n    const body = await res.text().catch(() => '');\n    return {\n      ...svc,\n      healthy: svc.ok.includes(res.status),\n      status: res.status,\n      ms,\n      detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n      docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n    };\n  } catch (error) {\n    return {\n      ...svc,\n      healthy: false,\n      status: 'error',\n      ms: Date.now() - started,\n      detail: error.message,\n      docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n    };\n  }\n}\n\n// Operator hint appended to each alert line, chosen by service key.\nfunction suggestedFix(r) {\n  const dh = r.docker;\n  const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n  if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n  if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n  if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n  if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n  return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n  const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n  if (r.healthy) {\n    // Only announce a recovery for a service that had actually been alerted on.\n    if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n    staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n  } else {\n    const failedRuns = (prev.failedRuns || 0) + 1;\n    // First alert once the threshold is reached, then a reminder whenever failedRuns is a multiple of reminderEveryFailedRuns.\n    const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n    staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n    if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n  }\n}\n\n// Nothing to report: emit no items so the notification nodes do not run.\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n  lines.push('\\u{1F6A8} Swarm Health Watchdog');\n  for (const a of alerts) {\n    const dh = a.docker;\n    const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n    lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n  }\n}\nif (recoveries.length) {\n  lines.push('\\u2705 Swarm service recovered');\n  for (const r of recoveries) {\n    const dh = r.docker;\n    const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n    lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n  }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];"
  },
  "type": "n8n-nodes-base.code",
  "typeVersion": 2,
  "position": [
    -340,
    0
  ],
  "id": "b3f76d53-204b-45bb-9a48-8cf20262319d",
  "name": "Check Swarm Services"
},
{
"parameters": {
"chatId": "8367012007",
"text": "={{$json.text}}",
"additionalFields": {
"parse_mode": "Markdown"
}
},
"type": "n8n-nodes-base.telegram",
"typeVersion": 1.2,
"position": [
-80,
-80
],
"id": "32d7ad9f-80bb-4acf-b546-89f04db32a6a",
"name": "Send Telegram Alert",
"credentials": {
"telegramApi": {
"id": "aox4dyIWVSRdcH5z",
"name": "Telegram Bot (OpenClaw)"
}
}
},
{
  "parameters": {
    "authentication": "predefinedCredentialType",
    "nodeCredentialType": "httpHeaderAuth",
    "method": "POST",
    "url": "https://discord.com/api/v10/channels/425781661268049931/messages",
    "sendBody": true,
    "specifyBody": "json",
    "jsonBody": "={{ { content: $json.text.length > 2000 ? $json.text.slice(0, 1997) + '...' : $json.text } }}",
    "options": {}
  },
  "type": "n8n-nodes-base.httpRequest",
  "typeVersion": 4.2,
  "position": [
    -80,
    100
  ],
  "id": "7eb589f5-6e50-4e1e-8a37-391f06785ad87",
  "name": "Send Discord Alert",
  "credentials": {
    "httpHeaderAuth": {
      "id": "UgPqYcoCNNIgr55m",
      "name": "Discord Bot Auth"
    }
  }
}
],
"connections": {
"Manual Trigger": {
"main": [
[
{
"node": "Check Swarm Services",
"type": "main",
"index": 0
}
]
]
},
"Every 15 Minutes": {
"main": [
[
{
"node": "Check Swarm Services",
"type": "main",
"index": 0
}
]
]
},
"Check Swarm Services": {
"main": [
[
{
"node": "Send Telegram Alert",
"type": "main",
"index": 0
},
{
"node": "Send Discord Alert",
"type": "main",
"index": 0
}
]
]
}
},
"authors": "will will",
"name": null,
"description": null,
"autosaved": false,
"workflowPublishHistory": [
{
"createdAt": "2026-05-14T00:09:09.344Z",
"id": 1489,
"workflowId": "lDKocSFXBQWQrDd3",
"versionId": "eec5521b-fb44-44ea-b238-aff842560f98",
"event": "activated",
"userId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
},
{
"createdAt": "2026-05-14T00:32:57.833Z",
"id": 1495,
"workflowId": "lDKocSFXBQWQrDd3",
"versionId": "eec5521b-fb44-44ea-b238-aff842560f98",
"event": "activated",
"userId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
},
{
"createdAt": "2026-05-14T00:32:57.790Z",
"id": 1494,
"workflowId": "lDKocSFXBQWQrDd3",
"versionId": "eec5521b-fb44-44ea-b238-aff842560f98",
"event": "deactivated",
"userId": "5ad50ead-6e6a-4d12-ab5b-e5db15835bb5"
}
]
}
}