diff --git a/swarm-common/n8n-workflows/agentmon-health-watchdog.json b/swarm-common/n8n-workflows/agentmon-health-watchdog.json
new file mode 100644
index 0000000..43be600
--- /dev/null
+++ b/swarm-common/n8n-workflows/agentmon-health-watchdog.json
@@ -0,0 +1,147 @@
+{
+  "name": "Agentmon Health Watchdog",
+  "active": false,
+  "nodes": [
+    {
+      "parameters": {},
+      "type": "n8n-nodes-base.manualTrigger",
+      "typeVersion": 1,
+      "position": [
+        -760,
+        -40
+      ],
+      "id": "dd86a324-8041-4000-92d7-7bcdfa4dfdcb",
+      "name": "Manual Trigger"
+    },
+    {
+      "parameters": {
+        "rule": {
+          "interval": [
+            {
+              "field": "minutes",
+              "minutesInterval": 5
+            }
+          ]
+        }
+      },
+      "type": "n8n-nodes-base.scheduleTrigger",
+      "typeVersion": 1.2,
+      "position": [
+        -760,
+        160
+      ],
+      "id": "1b25c434-e019-4395-887b-8452f136f543",
+      "name": "Every 5 Minutes"
+    },
+    {
+      "parameters": {
+        "mode": "runOnceForAllItems",
+        "jsCode": "// Agentmon health watchdog: probes the /healthz endpoints, inspects the latest\n// swarm.snapshot event and the stats summary, and returns one item whose\n// json.text holds an alert or recovery message (no items when nothing is due).\n// Failure counters are kept in workflow static data under staticData.agentmon.\nconst staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n  baseUrl: 'http://172.19.0.1:8081',\n  hostBaseUrl: 'http://172.19.0.1',\n  staleAfterMs: 3 * 60 * 1000,\n  failureThreshold: 2,\n  reminderEveryFailedRuns: 6,\n  requiredServices: [\n    'agentmon-ingest',\n    'agentmon-query',\n    'agentmon-ui',\n    'agentmon-processor',\n    'agentmon-swarm-monitor',\n    'agentmon-db',\n    'agentmon-nats',\n  ],\n};\n\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\n// GET url and return the response body; throws when the status is not 2xx.\nasync function requestJson(url, timeout = 10000) {\n  const response = await httpRequest({\n    method: 'GET',\n    url,\n    timeout,\n    json: true,\n    simple: false,\n    resolveWithFullResponse: true,\n    returnFullResponse: true,\n    ignoreHttpStatusErrors: true,\n  });\n  const status = response.statusCode || response.status;\n  if (status < 200 || status >= 300) {\n    throw new Error(`${url} returned HTTP ${status}`);\n  }\n  return response.body;\n}\n\n// GET url and return the body as a string (non-string bodies are\n// JSON-stringified); throws when the status is not 2xx.\nasync function requestText(url, timeout = 5000) {\n  const response = await httpRequest({\n    method: 'GET',\n    url,\n    timeout,\n    json: false,\n    simple: false,\n    resolveWithFullResponse: true,\n    returnFullResponse: true,\n    ignoreHttpStatusErrors: true,\n  });\n  const status = response.statusCode || response.status;\n  if (status < 200 || status >= 300) {\n    throw new Error(`${url} returned HTTP ${status}`);\n  }\n  return typeof response.body === 'string' ? response.body : JSON.stringify(response.body);\n}\n\n// Build a one-line status summary for a snapshot service entry.\nfunction serviceSummary(svc) {\n  if (!svc) return 'missing';\n  const bits = [`status=${svc.status || 'unknown'}`, `state=${svc.container_state || 'unknown'}`, `health=${svc.health_state || 'unknown'}`];\n  if (svc.http_status !== undefined) bits.push(`http=${svc.http_status}`);\n  if (svc.uptime_sec !== undefined) bits.push(`uptime=${svc.uptime_sec}s`);\n  return bits.join(' ');\n}\n\n// Flatten an issues object into key: value strings, skipping empty entries.\nfunction normalizeIssues(issues) {\n  const out = [];\n  if (!issues || typeof issues !== 'object') return out;\n  for (const [key, value] of Object.entries(issues)) {\n    if (Array.isArray(value) && value.length) out.push(`${key}: ${value.join(', ')}`);\n    else if (value && typeof value === 'object' && Object.keys(value).length) out.push(`${key}: ${JSON.stringify(value)}`);\n    else if (value && !Array.isArray(value)) out.push(`${key}: ${String(value)}`);\n  }\n  return out;\n}\n\n// Backslash-escape the characters that Telegram's legacy Markdown parse mode\n// treats as entity markers. Dynamic text (for example the events URL with\n// event_type=...) can contain an unpaired marker, which makes Telegram reject\n// the message; Discord renders the same escapes as the plain character.\nfunction escapeMarkdown(value) {\n  return String(value).replace(/[_*`[]/g, '\\\\$&');\n}\n\nconst now = new Date();\nconst nowIso = now.toISOString();\nconst problems = [];\nconst details = [];\nlet snapshotEvent = null;\nlet services = [];\nlet stats = null;\n\n// Liveness probes for ingest (8080), query (8081) and ui (8082).\ntry {\n  await requestText(`${CONFIG.hostBaseUrl}:8080/healthz`, 5000);\n} catch (error) {\n  problems.push(`agentmon-ingest /healthz failed: ${error.message}`);\n}\ntry {\n  await requestText(`${CONFIG.hostBaseUrl}:8081/healthz`, 5000);\n} catch (error) {\n  problems.push(`agentmon-query /healthz failed: ${error.message}`);\n}\ntry {\n  await requestText(`${CONFIG.hostBaseUrl}:8082/healthz`, 5000);\n} catch (error) {\n  problems.push(`agentmon-ui /healthz failed: ${error.message}`);\n}\n\n// Latest swarm.snapshot: check freshness, reported issues and required services.\ntry {\n  const eventsResponse = await requestJson(`${CONFIG.baseUrl}/v1/events?event_type=swarm.snapshot&limit=1`, 10000);\n  snapshotEvent = (eventsResponse.events || [])[0];\n  if (!snapshotEvent) {\n    problems.push('no swarm.snapshot events returned');\n  } else {\n    const tsRaw = snapshotEvent.ts || snapshotEvent.payload?.event?.ts;\n    const ts = new Date(tsRaw);\n    const ageMs = now.getTime() - ts.getTime();\n    if (!Number.isFinite(ageMs)) {\n      problems.push(`latest swarm.snapshot has invalid timestamp: ${tsRaw}`);\n    } else if (ageMs > CONFIG.staleAfterMs) {\n      problems.push(`latest swarm.snapshot stale: age=${Math.round(ageMs / 1000)}s ts=${tsRaw}`);\n    }\n\n    const payload = snapshotEvent.payload?.payload || snapshotEvent.payload || {};\n    services = payload.services || [];\n    const byName = Object.fromEntries(services.map((svc) => [svc.name, svc]));\n    for (const issue of normalizeIssues(payload.issues)) {\n      problems.push(`swarm issue ${issue}`);\n    }\n    for (const name of CONFIG.requiredServices) {\n      const svc = byName[name];\n      if (!svc) {\n        problems.push(`required service missing: ${name}`);\n      } else if (svc.status !== 'healthy' || svc.container_state !== 'running') {\n        problems.push(`required service unhealthy: ${name} (${serviceSummary(svc)})`);\n      }\n    }\n    const unhealthy = services.filter((svc) => svc.status && svc.status !== 'healthy');\n    for (const svc of unhealthy.slice(0, 20)) {\n      details.push(`${svc.name}: ${serviceSummary(svc)}`);\n    }\n  }\n} catch (error) {\n  problems.push(`swarm.snapshot query failed: ${error.message}`);\n}\n\ntry {\n  stats = await requestJson(`${CONFIG.baseUrl}/v1/stats/summary`, 5000);\n} catch (error) {\n  problems.push(`stats summary query failed: ${error.message}`);\n}\n\nstaticData.agentmon = staticData.agentmon || { failedRuns: 0, alerted: false };\nconst prev = staticData.agentmon;\nconst healthy = problems.length === 0;\nconst result = {\n  checkedAt: nowIso,\n  healthy,\n  problems,\n  details,\n  snapshotTs: snapshotEvent?.ts || snapshotEvent?.payload?.event?.ts || null,\n  serviceCount: services.length,\n  stats,\n};\n\n// Healthy run: reset the counters and emit a recovery notice only if an alert was sent before.\nif (healthy) {\n  if (prev.alerted) {\n    staticData.agentmon = { failedRuns: 0, alerted: false, lastOk: nowIso };\n    return [{ json: { ...result, text: `\u2705 Agentmon Health Watchdog recovered\\n- snapshot=${escapeMarkdown(result.snapshotTs)}\\n- services=${result.serviceCount}\\n- checked=${nowIso}` } }];\n  }\n  staticData.agentmon = { failedRuns: 0, alerted: false, lastOk: nowIso };\n  return [];\n}\n\nconst failedRuns = (prev.failedRuns || 0) + 1;\n// Alert once the threshold is reached, then again whenever failedRuns is a multiple of reminderEveryFailedRuns.\nconst shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\nstaticData.agentmon = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: nowIso, lastProblems: problems };\n\nif (!shouldAlert) return [];\nconst lines = ['\ud83d\udea8 Agentmon Health Watchdog', `failedChecks=${failedRuns}`, `checked=${nowIso}`];\n// Only the message text is escaped; result.problems and result.details stay raw.\nfor (const p of problems.slice(0, 12)) lines.push(`- ${escapeMarkdown(p)}`);\nif (details.length) {\n  lines.push('details:');\n  for (const d of details.slice(0, 12)) lines.push(`- ${escapeMarkdown(d)}`);\n}\nlines.push('suggested: check `docker logs agentmon-query --tail 100`, `docker logs agentmon-swarm-monitor --tail 100`, and agentmon query `/v1/events?event_type=swarm.snapshot&limit=1`.');\nreturn [{ json: { ...result, failedRuns, text: lines.join('\\n') } }];"
+      },
+      "type": "n8n-nodes-base.code",
+      "typeVersion": 2,
+      "position": [
+        -500,
+        60
+      ],
+      "id": "201ffa92-12f9-4b7f-9a0e-7e4df4fbdbe0",
+      "name": "Check Agentmon Snapshot"
+    },
+    {
+      "parameters": {
+        "chatId": "8367012007",
+        "text": "={{$json.text}}",
+        "additionalFields": {
+          "parse_mode": "Markdown"
+        }
+      },
+      "type": "n8n-nodes-base.telegram",
+      "typeVersion": 1.2,
+      "position": [
+        -220,
+        -40
+      ],
+      "id": "1e160d4e-7614-4479-b470-a3048e08124c",
+      "name": "Send Telegram Alert",
+      "credentials": {
+        "telegramApi": {
+          "id": "aox4dyIWVSRdcH5z",
+          "name": "Telegram Bot (OpenClaw)"
+        }
+      }
+    },
+    {
+      "parameters": {
+        "authentication": "predefinedCredentialType",
+        "nodeCredentialType": "httpHeaderAuth",
+        "method": "POST",
+        "url": "https://discord.com/api/v10/channels/425781661268049931/messages",
+        "sendBody": true,
+        "specifyBody": "json",
+        "jsonBody": "={{ { content: $json.text } }}",
+        "options": {}
+      },
+      "type": "n8n-nodes-base.httpRequest",
+      "typeVersion": 4.2,
+      "position": [
+        -220,
+        140
+      ],
+      "id": "cf94f111-7824-48a8-8c00-e06cc36cd01e",
+      "name": "Send Discord Alert",
+      "credentials": {
+        "httpHeaderAuth": {
+          "id": "UgPqYcoCNNIgr55m",
+          "name": "Discord Bot Auth"
+        }
+      }
+    }
+  ],
+  "connections": {
+    "Manual Trigger": {
+      "main": [
+        [
+          {
+            "node": "Check Agentmon Snapshot",
+            "type": "main",
+            "index": 0
+          }
+        ]
+      ]
+    },
+    "Every 5 Minutes": {
+      "main": [
+        [
+          {
+            "node": "Check Agentmon Snapshot",
+            "type": "main",
+            "index": 0
+          }
+        ]
+      ]
+    },
+    "Check Agentmon Snapshot": {
+      "main": [
+        [
+          {
+            "node": "Send Telegram Alert",
+            "type": "main",
+            "index": 0
+          },
+          {
+            "node": "Send Discord Alert",
+            "type": "main",
+            "index": 0
+          }
+        ]
+      ]
+    }
+  },
+  "settings": {
+    "executionOrder": "v1"
+  },
+  "staticData": null,
+  "pinData": {},
+  "tags": [],
+  "id": "AgentmonHealthWatchdog"
+}
\ No newline at end of file