{
  "name": "Agentmon Health Watchdog",
  "active": false,
  "nodes": [
    {
      "parameters": {},
      "type": "n8n-nodes-base.manualTrigger",
      "typeVersion": 1,
      "position": [-760, -40],
      "id": "dd86a324-8041-4000-92d7-7bcdfa4dfdcb",
      "name": "Manual Trigger"
    },
    {
      "parameters": {
        "rule": {
          "interval": [
            {
              "field": "minutes",
              "minutesInterval": 5
            }
          ]
        }
      },
      "type": "n8n-nodes-base.scheduleTrigger",
      "typeVersion": 1.2,
      "position": [-760, 160],
      "id": "1b25c434-e019-4395-887b-8452f136f543",
      "name": "Every 5 Minutes"
    },
    {
      "parameters": {
        "mode": "runOnceForAllItems",
        "jsCode": "// Agentmon health watchdog: probes the /healthz endpoints, inspects the latest swarm.snapshot event,\n// and returns one item only when an alert, reminder or recovery message should be sent.\nconst staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n  baseUrl: 'http://172.19.0.1:8081',\n  hostBaseUrl: 'http://172.19.0.1',\n  staleAfterMs: 3 * 60 * 1000,\n  failureThreshold: 2,\n  reminderEveryFailedRuns: 6,\n  requiredServices: [\n    'agentmon-ingest',\n    'agentmon-query',\n    'agentmon-ui',\n    'agentmon-processor',\n    'agentmon-swarm-monitor',\n    'agentmon-db',\n    'agentmon-nats',\n  ],\n};\n\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\n// GET a URL with json: true and return response.body; throws when the status is below 200 or at least 300.\nasync function requestJson(url, timeout = 10000) {\n  const response = await httpRequest({\n    method: 'GET',\n    url,\n    timeout,\n    json: true,\n    simple: false,\n    resolveWithFullResponse: true,\n    returnFullResponse: true,\n    ignoreHttpStatusErrors: true,\n  });\n  const status = response.statusCode || response.status;\n  if (status < 200 || status >= 300) {\n    throw new Error(`${url} returned HTTP ${status}`);\n  }\n  return response.body;\n}\n\n// GET a URL and return the body as a string; throws when the status is below 200 or at least 300.\nasync function requestText(url, timeout = 5000) {\n  const response = await httpRequest({\n    method: 'GET',\n    url,\n    timeout,\n    json: false,\n    simple: false,\n    resolveWithFullResponse: true,\n    returnFullResponse: true,\n    ignoreHttpStatusErrors: true,\n  });\n  const status = response.statusCode || response.status;\n  if (status < 200 || status >= 300) {\n    throw new Error(`${url} returned HTTP ${status}`);\n  }\n  return typeof response.body === 'string' ? response.body : JSON.stringify(response.body);\n}\n\n// One-line description of a service entry for alert text.\nfunction serviceSummary(svc) {\n  if (!svc) return 'missing';\n  const bits = [`status=${svc.status || 'unknown'}`, `state=${svc.container_state || 'unknown'}`, `health=${svc.health_state || 'unknown'}`];\n  if (svc.http_status !== undefined) bits.push(`http=${svc.http_status}`);\n  if (svc.uptime_sec !== undefined) bits.push(`uptime=${svc.uptime_sec}s`);\n  return bits.join(' ');\n}\n\n// Flattens the issues object into key: value strings.\n// Empty arrays, empty objects and falsy values are skipped.\nfunction normalizeIssues(issues) {\n  const out = [];\n  if (!issues || typeof issues !== 'object') return out;\n  for (const [key, value] of Object.entries(issues)) {\n    if (Array.isArray(value) && value.length) out.push(`${key}: ${value.join(', ')}`);\n    else if (value && typeof value === 'object' && Object.keys(value).length) out.push(`${key}: ${JSON.stringify(value)}`);\n    // Primitives only: an empty object must not be reported as [object Object].\n    else if (value && typeof value !== 'object') out.push(`${key}: ${String(value)}`);\n  }\n  return out;\n}\n\nconst now = new Date();\nconst nowIso = now.toISOString();\nconst problems = [];\nconst details = [];\nlet snapshotEvent = null;\nlet services = [];\nlet stats = null;\n\n// Probe each /healthz endpoint in turn; a failure is recorded and the next probe still runs.\nconst healthEndpoints = [\n  ['agentmon-ingest', 8080],\n  ['agentmon-query', 8081],\n  ['agentmon-ui', 8082],\n];\nfor (const [label, port] of healthEndpoints) {\n  try {\n    await requestText(`${CONFIG.hostBaseUrl}:${port}/healthz`, 5000);\n  } catch (error) {\n    problems.push(`${label} /healthz failed: ${error.message}`);\n  }\n}\n\ntry {\n  const eventsResponse = await requestJson(`${CONFIG.baseUrl}/v1/events?event_type=swarm.snapshot&limit=1`, 10000);\n  snapshotEvent = (eventsResponse.events || [])[0];\n  if (!snapshotEvent) {\n    problems.push('no swarm.snapshot events returned');\n  } else {\n    const tsRaw = snapshotEvent.ts || snapshotEvent.payload?.event?.ts;\n    const ts = new Date(tsRaw);\n    const ageMs = now.getTime() - ts.getTime();\n    if (!Number.isFinite(ageMs)) {\n      problems.push(`latest swarm.snapshot has invalid timestamp: ${tsRaw}`);\n    } else if (ageMs > CONFIG.staleAfterMs) {\n      problems.push(`latest swarm.snapshot stale: age=${Math.round(ageMs / 1000)}s ts=${tsRaw}`);\n    }\n\n    const payload = snapshotEvent.payload?.payload || snapshotEvent.payload || {};\n    services = payload.services || [];\n    const byName = Object.fromEntries(services.map((svc) => [svc.name, svc]));\n    for (const issue of normalizeIssues(payload.issues)) {\n      problems.push(`swarm issue ${issue}`);\n    }\n    for (const name of CONFIG.requiredServices) {\n      const svc = byName[name];\n      if (!svc) {\n        problems.push(`required service missing: ${name}`);\n      } else if (svc.status !== 'healthy' || svc.container_state !== 'running') {\n        problems.push(`required service unhealthy: ${name} (${serviceSummary(svc)})`);\n      }\n    }\n    const unhealthy = services.filter((svc) => svc.status && svc.status !== 'healthy');\n    for (const svc of unhealthy.slice(0, 20)) {\n      details.push(`${svc.name}: ${serviceSummary(svc)}`);\n    }\n  }\n} catch (error) {\n  problems.push(`swarm.snapshot query failed: ${error.message}`);\n}\n\ntry {\n  stats = await requestJson(`${CONFIG.baseUrl}/v1/stats/summary`, 5000);\n} catch (error) {\n  problems.push(`stats summary query failed: ${error.message}`);\n}\n\n// Alert state (failedRuns, alerted) is read from and written back to workflow static data.\nstaticData.agentmon = staticData.agentmon || { failedRuns: 0, alerted: false };\nconst prev = staticData.agentmon;\nconst healthy = problems.length === 0;\nconst result = {\n  checkedAt: nowIso,\n  healthy,\n  problems,\n  details,\n  snapshotTs: snapshotEvent?.ts || snapshotEvent?.payload?.event?.ts || null,\n  serviceCount: services.length,\n  stats,\n};\n\nif (healthy) {\n  // A recovery message is sent only if an alert was sent earlier.\n  if (prev.alerted) {\n    staticData.agentmon = { failedRuns: 0, alerted: false, lastOk: nowIso };\n    return [{ json: { ...result, text: `\u2705 Agentmon Health Watchdog recovered\\n- snapshot=${result.snapshotTs}\\n- services=${result.serviceCount}\\n- checked=${nowIso}` } }];\n  }\n  staticData.agentmon = { failedRuns: 0, alerted: false, lastOk: nowIso };\n  return [];\n}\n\n// Alert on the first run that reaches failureThreshold while not yet alerted,\n// then again whenever failedRuns is a multiple of reminderEveryFailedRuns.\nconst failedRuns = (prev.failedRuns || 0) + 1;\nconst shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\nstaticData.agentmon = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: nowIso, lastProblems: problems };\n\nif (!shouldAlert) return [];\nconst lines = ['\ud83d\udea8 Agentmon Health Watchdog', `failedChecks=${failedRuns}`, `checked=${nowIso}`];\nfor (const p of problems.slice(0, 12)) lines.push(`- ${p}`);\nif (details.length) {\n  lines.push('details:');\n  for (const d of details.slice(0, 12)) lines.push(`- ${d}`);\n}\nlines.push('suggested: check `docker logs agentmon-query --tail 100`, `docker logs agentmon-swarm-monitor --tail 100`, and agentmon query `/v1/events?event_type=swarm.snapshot&limit=1`.');\nreturn [{ json: { ...result, failedRuns, text: lines.join('\\n') } }];"
      },
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [-500, 60],
      "id": "201ffa92-12f9-4b7f-9a0e-7e4df4fbdbe0",
      "name": "Check Agentmon Snapshot",
      "notes": "Probes the agentmon /healthz endpoints and the latest swarm.snapshot event. Emits one item with a text field only for alerts, reminders and recovery; returns no items otherwise."
    },
    {
      "parameters": {
        "chatId": "8367012007",
        "text": "={{$json.text}}",
        "additionalFields": {
          "parse_mode": "Markdown"
        }
      },
      "type": "n8n-nodes-base.telegram",
      "typeVersion": 1.2,
      "position": [-220, -40],
      "id": "1e160d4e-7614-4479-b470-a3048e08124c",
      "name": "Send Telegram Alert",
      "onError": "continueRegularOutput",
      "notes": "Legacy Markdown parsing can make Telegram reject text with unbalanced _ * ` or [ characters. onError=continueRegularOutput keeps the Discord branch running if this send fails.",
      "credentials": {
        "telegramApi": {
          "id": "aox4dyIWVSRdcH5z",
          "name": "Telegram Bot (OpenClaw)"
        }
      }
    },
    {
      "parameters": {
        "authentication": "predefinedCredentialType",
        "nodeCredentialType": "httpHeaderAuth",
        "method": "POST",
        "url": "https://discord.com/api/v10/channels/425781661268049931/messages",
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={{ { content: ($json.text || '').slice(0, 2000) } }}",
        "options": {}
      },
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [-220, 140],
      "id": "cf94f111-7824-48a8-8c00-e06cc36cd01e",
      "name": "Send Discord Alert",
      "notes": "Discord rejects message content longer than 2000 characters, so the text is truncated in the jsonBody expression.",
      "credentials": {
        "httpHeaderAuth": {
          "id": "UgPqYcoCNNIgr55m",
          "name": "Discord Bot Auth"
        }
      }
    }
  ],
  "connections": {
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "Check Agentmon Snapshot",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Every 5 Minutes": {
      "main": [
        [
          {
            "node": "Check Agentmon Snapshot",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Check Agentmon Snapshot": {
      "main": [
        [
          {
            "node": "Send Telegram Alert",
            "type": "main",
            "index": 0
          },
          {
            "node": "Send Discord Alert",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "settings": {
    "executionOrder": "v1"
  },
  "staticData": null,
  "pinData": {},
  "tags": [],
  "id": "AgentmonHealthWatchdog"
}