Files
swarm-master/swarm-common/n8n-workflows/agentmon-health-watchdog.json
T
2026-06-04 13:26:50 -07:00

147 lines
9.6 KiB
JSON

{
"name": "Agentmon Health Watchdog",
"active": false,
"nodes": [
{
"parameters": {},
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
-760,
-40
],
"id": "dd86a324-8041-4000-92d7-7bcdfa4dfdcb",
"name": "Manual Trigger"
},
{
"parameters": {
"rule": {
"interval": [
{
"field": "minutes",
"minutesInterval": 5
}
]
}
},
"type": "n8n-nodes-base.scheduleTrigger",
"typeVersion": 1.2,
"position": [
-760,
160
],
"id": "1b25c434-e019-4395-887b-8452f136f543",
"name": "Every 5 Minutes"
},
{
"parameters": {
"mode": "runOnceForAllItems",
"jsCode": "// Agentmon health watchdog for an n8n Code node (mode runOnceForAllItems).\n// Probes the /healthz endpoints, inspects the newest swarm.snapshot event and fetches the stats summary.\n// Returns one item whose json.text is the message to deliver, or no items when nothing should be sent.\n// Consecutive-failure state is kept in workflow static data under the agentmon key.\nconst staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n  baseUrl: 'http://172.19.0.1:8081',\n  hostBaseUrl: 'http://172.19.0.1',\n  staleAfterMs: 3 * 60 * 1000,\n  failureThreshold: 2,\n  reminderEveryFailedRuns: 6,\n  // Discord rejects message content longer than 2000 characters, so the alert text is capped below that.\n  maxTextLength: 1900,\n  healthChecks: [\n    { name: 'agentmon-ingest', port: 8080 },\n    { name: 'agentmon-query', port: 8081 },\n    { name: 'agentmon-ui', port: 8082 },\n  ],\n  requiredServices: [\n    'agentmon-ingest',\n    'agentmon-query',\n    'agentmon-ui',\n    'agentmon-processor',\n    'agentmon-swarm-monitor',\n    'agentmon-db',\n    'agentmon-nats',\n  ],\n};\n// Fixed footer. Its backticks are balanced, so it is appended unescaped and never truncated.\nconst SUGGESTION = 'suggested: check `docker logs agentmon-query --tail 100`, `docker logs agentmon-swarm-monitor --tail 100`, and agentmon query `/v1/events?event_type=swarm.snapshot&limit=1`.';\n\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\n// GET the url and return the response body. Throws unless a 2xx status was reported.\nasync function request(url, timeout, json) {\n  const response = await httpRequest({\n    method: 'GET',\n    url,\n    timeout,\n    json,\n    simple: false,\n    resolveWithFullResponse: true,\n    returnFullResponse: true,\n    ignoreHttpStatusErrors: true,\n  });\n  const status = response.statusCode || response.status;\n  // Negated range check: a missing status counts as a failure instead of passing silently.\n  if (!(status >= 200 && status < 300)) {\n    throw new Error(`${url} returned HTTP ${status}`);\n  }\n  return response.body;\n}\n\n// Fetch a JSON endpoint and return the response body.\nasync function requestJson(url, timeout = 10000) {\n  return request(url, timeout, true);\n}\n\n// Fetch an endpoint and return its body as a string (non-string bodies are JSON-serialized).\nasync function requestText(url, timeout = 5000) {\n  const body = await request(url, timeout, false);\n  return typeof body === 'string' ? body : JSON.stringify(body);\n}\n\n// Backslash-escape the characters that start an entity in Telegram legacy Markdown (_ * ` [).\n// An unbalanced marker in dynamic text makes Telegram reject the whole message.\nfunction escapeMarkdown(value) {\n  return String(value).replace(/[_*`\\[]/g, '\\\\$&');\n}\n\n// One-line description of a service entry taken from the snapshot payload.\nfunction serviceSummary(svc) {\n  if (!svc) return 'missing';\n  const bits = [`status=${svc.status || 'unknown'}`, `state=${svc.container_state || 'unknown'}`, `health=${svc.health_state || 'unknown'}`];\n  if (svc.http_status !== undefined) bits.push(`http=${svc.http_status}`);\n  if (svc.uptime_sec !== undefined) bits.push(`uptime=${svc.uptime_sec}s`);\n  return bits.join(' ');\n}\n\n// Flatten the snapshot issues object into key: value strings, skipping empty values.\nfunction normalizeIssues(issues) {\n  const out = [];\n  if (!issues || typeof issues !== 'object') return out;\n  for (const [key, value] of Object.entries(issues)) {\n    if (Array.isArray(value)) {\n      if (value.length) out.push(`${key}: ${value.join(', ')}`);\n    } else if (value && typeof value === 'object') {\n      // An empty object carries no issue and must not be reported as [object Object].\n      if (Object.keys(value).length) out.push(`${key}: ${JSON.stringify(value)}`);\n    } else if (value) {\n      out.push(`${key}: ${String(value)}`);\n    }\n  }\n  return out;\n}\n\nconst now = new Date();\nconst nowIso = now.toISOString();\nconst problems = [];\nconst details = [];\nlet snapshotEvent = null;\nlet services = [];\nlet stats = null;\n\n// Probe each HTTP service in the configured order; a failed probe becomes a problem entry.\nfor (const { name, port } of CONFIG.healthChecks) {\n  try {\n    await requestText(`${CONFIG.hostBaseUrl}:${port}/healthz`, 5000);\n  } catch (error) {\n    problems.push(`${name} /healthz failed: ${error.message}`);\n  }\n}\n\ntry {\n  const eventsResponse = await requestJson(`${CONFIG.baseUrl}/v1/events?event_type=swarm.snapshot&limit=1`, 10000);\n  snapshotEvent = (eventsResponse?.events || [])[0];\n  if (!snapshotEvent) {\n    problems.push('no swarm.snapshot events returned');\n  } else {\n    const tsRaw = snapshotEvent.ts || snapshotEvent.payload?.event?.ts;\n    const ageMs = now.getTime() - new Date(tsRaw).getTime();\n    if (!Number.isFinite(ageMs)) {\n      problems.push(`latest swarm.snapshot has invalid timestamp: ${tsRaw}`);\n    } else if (ageMs > CONFIG.staleAfterMs) {\n      problems.push(`latest swarm.snapshot stale: age=${Math.round(ageMs / 1000)}s ts=${tsRaw}`);\n    }\n\n    const payload = snapshotEvent.payload?.payload || snapshotEvent.payload || {};\n    // Guard against a non-array value so the service checks and serviceCount stay well-defined.\n    services = Array.isArray(payload.services) ? payload.services : [];\n    const byName = Object.fromEntries(services.map((svc) => [svc.name, svc]));\n    for (const issue of normalizeIssues(payload.issues)) {\n      problems.push(`swarm issue ${issue}`);\n    }\n    for (const name of CONFIG.requiredServices) {\n      const svc = byName[name];\n      if (!svc) {\n        problems.push(`required service missing: ${name}`);\n      } else if (svc.status !== 'healthy' || svc.container_state !== 'running') {\n        problems.push(`required service unhealthy: ${name} (${serviceSummary(svc)})`);\n      }\n    }\n    const unhealthy = services.filter((svc) => svc.status && svc.status !== 'healthy');\n    for (const svc of unhealthy.slice(0, 20)) {\n      details.push(`${svc.name}: ${serviceSummary(svc)}`);\n    }\n  }\n} catch (error) {\n  problems.push(`swarm.snapshot query failed: ${error.message}`);\n}\n\ntry {\n  stats = await requestJson(`${CONFIG.baseUrl}/v1/stats/summary`, 5000);\n} catch (error) {\n  problems.push(`stats summary query failed: ${error.message}`);\n}\n\nstaticData.agentmon = staticData.agentmon || { failedRuns: 0, alerted: false };\nconst prev = staticData.agentmon;\nconst healthy = problems.length === 0;\nconst result = {\n  checkedAt: nowIso,\n  healthy,\n  problems,\n  details,\n  snapshotTs: snapshotEvent?.ts || snapshotEvent?.payload?.event?.ts || null,\n  serviceCount: services.length,\n  stats,\n};\n\nif (healthy) {\n  staticData.agentmon = { failedRuns: 0, alerted: false, lastOk: nowIso };\n  // A recovery notice is only sent when an alert had been raised before.\n  if (!prev.alerted) return [];\n  return [{ json: { ...result, text: `\u2705 Agentmon Health Watchdog recovered\\n- snapshot=${result.snapshotTs}\\n- services=${result.serviceCount}\\n- checked=${nowIso}` } }];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst reminderDue = CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0;\n// Alert once the threshold is reached, then again only on reminder runs.\nconst shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || reminderDue);\nstaticData.agentmon = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: nowIso, lastProblems: problems };\n\nif (!shouldAlert) return [];\nconst lines = ['\ud83d\udea8 Agentmon Health Watchdog', `failedChecks=${failedRuns}`, `checked=${nowIso}`];\nfor (const p of problems.slice(0, 12)) lines.push(`- ${escapeMarkdown(p)}`);\nif (details.length) {\n  lines.push('details:');\n  for (const d of details.slice(0, 12)) lines.push(`- ${escapeMarkdown(d)}`);\n}\n// Only the dynamic part is truncated, so the footer keeps its balanced backticks.\nconst room = CONFIG.maxTextLength - SUGGESTION.length - 1;\nlet body = lines.join('\\n');\nif (body.length > room) body = `${body.slice(0, room - 3)}...`;\nreturn [{ json: { ...result, failedRuns, text: `${body}\\n${SUGGESTION}` } }];"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
-500,
60
],
"id": "201ffa92-12f9-4b7f-9a0e-7e4df4fbdbe0",
"name": "Check Agentmon Snapshot"
},
{
"parameters": {
"chatId": "8367012007",
"text": "={{$json.text}}",
"additionalFields": {
"parse_mode": "Markdown"
}
},
"type": "n8n-nodes-base.telegram",
"typeVersion": 1.2,
"position": [
-220,
-40
],
"id": "1e160d4e-7614-4479-b470-a3048e08124c",
"name": "Send Telegram Alert",
"credentials": {
"telegramApi": {
"id": "aox4dyIWVSRdcH5z",
"name": "Telegram Bot (OpenClaw)"
}
}
},
{
"parameters": {
"authentication": "predefinedCredentialType",
"nodeCredentialType": "httpHeaderAuth",
"method": "POST",
"url": "https://discord.com/api/v10/channels/425781661268049931/messages",
"sendBody": true,
"specifyBody": "json",
"jsonBody": "={{ { content: $json.text } }}",
"options": {}
},
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
-220,
140
],
"id": "cf94f111-7824-48a8-8c00-e06cc36cd01e",
"name": "Send Discord Alert",
"credentials": {
"httpHeaderAuth": {
"id": "UgPqYcoCNNIgr55m",
"name": "Discord Bot Auth"
}
}
}
],
"connections": {
"Manual Trigger": {
"main": [
[
{
"node": "Check Agentmon Snapshot",
"type": "main",
"index": 0
}
]
]
},
"Every 5 Minutes": {
"main": [
[
{
"node": "Check Agentmon Snapshot",
"type": "main",
"index": 0
}
]
]
},
"Check Agentmon Snapshot": {
"main": [
[
{
"node": "Send Telegram Alert",
"type": "main",
"index": 0
},
{
"node": "Send Discord Alert",
"type": "main",
"index": 0
}
]
]
}
},
"settings": {
"executionOrder": "v1"
},
"staticData": null,
"pinData": {},
"tags": [],
"id": "AgentmonHealthWatchdog"
}