feat: add Docker health-state HTTP endpoint for Swarm Health Watchdog

- Python HTTP server on 0.0.0.0:18809
- GET /health -> all monitored containers (JSON)
- GET /health/<name> -> single container
- Monitors: brave-search, kokoro-tts, litellm, litellm-db, n8n-agent, searxng, whisper-server
- Returns status, health, restart count via docker inspect
- systemd user service for auto-start

Task: t_461f71fe
This commit is contained in:
William Valentin
2026-05-13 14:29:25 -07:00
parent aea9042cce
commit 9fdd29f7b7
1272 changed files with 863206 additions and 16973 deletions
+97
View File
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""
Docker Health Endpoint - lightweight HTTP server that exposes container health state.
Listens on 0.0.0.0:18809 (port configurable via the PORT environment variable).
Endpoints:
GET /health -> all monitored containers
GET /health/<name> -> single container
"""
import http.server
import json
import os
import subprocess
import sys
# TCP port the server binds to; the PORT environment variable overrides
# the built-in default of 18809.
PORT = int(os.getenv("PORT", 18809))

# Names of the Docker containers reported by the aggregate /health endpoint.
CONTAINERS = [
    "brave-search",
    "kokoro-tts",
    "litellm",
    "litellm-db",
    "n8n-agent",
    "searxng",
    "whisper-server",
]
def inspect_container(name: str) -> dict:
    """Return the state of one container as reported by ``docker inspect``.

    Args:
        name: Container name to look up.

    Returns:
        A dict with the keys ``name``, ``status``, ``health`` and
        ``restarts``:

        * Success: ``status`` is ``.State.Status``, ``health`` is
          ``.State.Health.Status`` (or ``"n/a"`` when the container has no
          health section) and ``restarts`` is ``.RestartCount``.
        * ``status == "not_found"``: the name is empty or looks like a
          command-line option, or ``docker inspect`` exited non-zero.
        * ``status == "error"``: the output did not have the expected three
          fields, or the command could not be run / timed out (``health``
          then carries the exception text).

        ``restarts`` is ``-1`` in every failure case.
    """
    # The name may originate from a request URL. It is passed to docker as a
    # bare argv element, so a value starting with "-" would be parsed as a
    # CLI option rather than a container name. Docker container names start
    # with an alphanumeric character, so such a value can never match.
    if not name or name.startswith("-"):
        return {"name": name, "status": "not_found", "health": "unknown", "restarts": -1}

    try:
        result = subprocess.run(
            [
                "docker", "inspect", "--format",
                "{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}n/a{{end}}|{{.RestartCount}}",
                name,
            ],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: docker binary missing or not executable.
        # SubprocessError: includes TimeoutExpired after the 5 s limit.
        # ValueError: un-spawnable argument (e.g. embedded NUL) or
        # undecodable output.
        return {"name": name, "status": "error", "health": str(e), "restarts": -1}

    if result.returncode != 0:
        return {"name": name, "status": "not_found", "health": "unknown", "restarts": -1}

    parts = result.stdout.strip().split("|")
    if len(parts) != 3:
        return {"name": name, "status": "error", "health": "unknown", "restarts": -1}

    status, health, restarts = parts
    return {
        "name": name,
        "status": status,
        "health": health,
        # isdecimal() guarantees int() cannot raise; anything else maps to 0.
        "restarts": int(restarts) if restarts.isdecimal() else 0,
    }
def inspect_all() -> list:
    """Return one inspect_container() result per entry of CONTAINERS, in order."""
    return list(map(inspect_container, CONTAINERS))
class HealthHandler(http.server.BaseHTTPRequestHandler):
    """Serve container health state as JSON in response to HTTP GET.

    Routes (a trailing slash and any query string or fragment are ignored):
        /health         -> {"containers": [...]} built by inspect_all()
        /health/<name>  -> the inspect_container(name) result
        anything else   -> {"error": "not found"} with status 404
    """

    @staticmethod
    def _route(raw_path):
        """Classify a raw request target.

        Args:
            raw_path: The request target as received, e.g. ``/health?ts=1``.

        Returns:
            ``("all", None)`` for the aggregate endpoint, ``("one", name)``
            for a single-container lookup, or ``(None, None)`` when the
            path matches no route.
        """
        # Health probes often append a cache-busting query string; drop it
        # (and any fragment) so it does not turn a valid path into a 404.
        path = raw_path.partition("?")[0].partition("#")[0].rstrip("/")
        if path == "/health":
            return ("all", None)
        if path.startswith("/health/"):
            return ("one", path[len("/health/"):])
        return (None, None)

    def do_GET(self):
        """Dispatch a GET request to the matching health route."""
        kind, name = self._route(self.path)
        if kind == "all":
            self._json_response({"containers": inspect_all()})
        elif kind == "one":
            # NOTE(review): name comes straight from the URL and is not
            # limited to the monitored list -- confirm that looking up
            # arbitrary containers is intended.
            self._json_response(inspect_container(name))
        else:
            self._json_response({"error": "not found"}, status=404)

    def _json_response(self, data, status=200):
        """Send ``data`` as an indented JSON body with the given status code."""
        body = json.dumps(data, indent=2).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Discard the per-request log line the base class writes to stderr."""
        # Suppress default stderr logging
        pass
def main():
    """Run the health endpoint until interrupted.

    Binds to all interfaces on PORT, serves requests with HealthHandler and
    returns quietly on KeyboardInterrupt. The listening socket is closed on
    every exit path, including an unexpected exception from serve_forever().
    """
    # One thread per request: a single /health call runs several
    # ``docker inspect`` subprocesses (each with its own timeout), and a
    # single-threaded server would leave every other probe waiting behind
    # it -- or behind one stalled client connection.
    with http.server.ThreadingHTTPServer(("0.0.0.0", PORT), HealthHandler) as server:
        print(f"docker-health-server listening on 0.0.0.0:{PORT}", flush=True)
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            pass


if __name__ == "__main__":
    main()