feat: add Docker health-state HTTP endpoint for Swarm Health Watchdog
- Python HTTP server on 0.0.0.0:18809 - GET /health -> all monitored containers (JSON) - GET /health/<name> -> single container - Monitors: brave-search, kokoro-tts, litellm, litellm-db, n8n-agent, searxng, whisper-server - Returns status, health, restart count via docker inspect - systemd user service for auto-start Task: t_461f71fe
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Docker Health Endpoint - lightweight HTTP server that exposes container health state.
|
||||
Listens on 0.0.0.0:18809 (configurable via PORT env var).
|
||||
|
||||
Endpoints:
|
||||
GET /health -> all monitored containers
|
||||
GET /health/<name> -> single container
|
||||
"""
|
||||
|
||||
import http.server
import json
import os
import subprocess
import sys
import urllib.parse
|
||||
|
||||
# TCP port the server binds to; the PORT environment variable overrides it.
PORT = int(os.getenv("PORT", 18809))

# Names of the containers reported by GET /health, in response order.
CONTAINERS = [
    "brave-search", "kokoro-tts", "litellm", "litellm-db",
    "n8n-agent", "searxng", "whisper-server",
]
|
||||
|
||||
|
||||
def inspect_container(name: str) -> dict:
    """Run ``docker inspect`` and extract health info for a single container.

    Args:
        name: Container name or ID. It may come straight from the request
            URL, so it is treated as untrusted input.

    Returns:
        A dict with the keys ``name``, ``status``, ``health`` and
        ``restarts``:

        * on success, ``status`` and ``health`` are the values Docker
          reports (``health`` is ``"n/a"`` when the container defines no
          health check) and ``restarts`` is the restart count;
        * ``status == "not_found"`` when ``docker inspect`` exits non-zero;
        * ``status == "error"`` when the output cannot be parsed, or when
          the command could not be run at all (then ``health`` holds the
          exception text).

        ``restarts`` is ``-1`` in every failure case. This function never
        raises.
    """
    try:
        result = subprocess.run(
            [
                "docker", "inspect", "--format",
                "{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}n/a{{end}}|{{.RestartCount}}",
                # "--" ends option parsing, so a name beginning with "-"
                # (e.g. "/health/--help") is looked up as a container name
                # instead of being interpreted as a docker CLI flag.
                "--",
                name,
            ],
            capture_output=True, text=True, timeout=5,
        )
        if result.returncode != 0:
            return {"name": name, "status": "not_found", "health": "unknown", "restarts": -1}

        parts = result.stdout.strip().split("|")
        if len(parts) != 3:
            return {"name": name, "status": "error", "health": "unknown", "restarts": -1}

        status, health, restart_count = parts
        return {
            "name": name,
            "status": status,
            "health": health,
            # A non-numeric restart count is reported as 0 rather than failing.
            "restarts": int(restart_count) if restart_count.isdigit() else 0,
        }
    except Exception as e:
        # Deliberately broad: a health probe must always answer, whether the
        # docker binary is missing, the call timed out, or anything else.
        return {"name": name, "status": "error", "health": str(e), "restarts": -1}
|
||||
|
||||
|
||||
def inspect_all() -> list:
    """Return one health record per entry in CONTAINERS, in list order."""
    reports = []
    for container_name in CONTAINERS:
        reports.append(inspect_container(container_name))
    return reports
|
||||
|
||||
|
||||
class HealthHandler(http.server.BaseHTTPRequestHandler):
    """HTTP request handler that serves container health as JSON.

    Routes (a trailing slash and any query string are ignored):
        GET /health         -> {"containers": [...]} for all monitored containers
        GET /health/<name>  -> health record for a single container
        anything else       -> 404 {"error": "not found"}
    """

    def do_GET(self):
        """Dispatch a GET request to the matching health route."""
        # self.path is the raw request target; drop the query string so that
        # e.g. "/health?ts=123" still matches the /health route.
        path = self.path.partition("?")[0].rstrip("/")

        if path == "/health":
            self._json_response({"containers": inspect_all()})
        elif path.startswith("/health/"):
            # Decode percent-escapes so the name reaches docker as the
            # client intended it.
            name = urllib.parse.unquote(path[len("/health/"):])
            self._json_response(inspect_container(name))
        else:
            self._json_response({"error": "not found"}, status=404)

    def _json_response(self, data, status=200):
        """Send *data* as an indented JSON body with the given HTTP status."""
        body = json.dumps(data, indent=2).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Discard the base class's per-request stderr logging."""
        # Parameter names mirror BaseHTTPRequestHandler.log_message.
        pass
|
||||
|
||||
|
||||
def main():
    """Serve the health endpoint on 0.0.0.0:PORT until interrupted.

    Uses a threading server so each request is handled in its own thread:
    one stalled client or slow ``docker inspect`` call cannot block other
    health probes. The listening socket is closed on every exit path.
    """
    with http.server.ThreadingHTTPServer(("0.0.0.0", PORT), HealthHandler) as server:
        print(f"docker-health-server listening on 0.0.0.0:{PORT}", flush=True)
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop a foreground run.
            pass


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user