fix: use Docker socket HTTP API in swarm collector, no CLI dependency
Replace exec.CommandContext calls (docker ps, docker inspect, nc -z) with direct HTTP calls over the Unix socket using Go's net/http + custom transport. Also removes netcat-openbsd from Dockerfile since nc is no longer used. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,6 @@ RUN apt-get update && apt-get install -y \
|
|||||||
ca-certificates \
|
ca-certificates \
|
||||||
libvirt-clients \
|
libvirt-clients \
|
||||||
openssh-client \
|
openssh-client \
|
||||||
netcat-openbsd \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|||||||
@@ -4,8 +4,9 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os/exec"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@@ -15,19 +16,19 @@ type Config struct {
|
|||||||
LiteLLMBaseURL string
|
LiteLLMBaseURL string
|
||||||
LiteLLMAPIKey string
|
LiteLLMAPIKey string
|
||||||
HTTPTimeout time.Duration
|
HTTPTimeout time.Duration
|
||||||
|
DockerSocket string // defaults to /var/run/docker.sock
|
||||||
}
|
}
|
||||||
|
|
||||||
// dockerPsEntry is the JSON shape from `docker ps --format '{{json .}}'`.
|
// dockerContainer is the shape returned by GET /containers/json.
|
||||||
type dockerPsEntry struct {
|
type dockerContainer struct {
|
||||||
ID string `json:"ID"`
|
ID string `json:"Id"`
|
||||||
Names string `json:"Names"`
|
Names []string `json:"Names"`
|
||||||
Status string `json:"Status"`
|
|
||||||
State string `json:"State"`
|
State string `json:"State"`
|
||||||
|
Labels map[string]string `json:"Labels"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// dockerInspectEntry is the minimal shape we need from `docker inspect`.
|
// dockerContainerDetail is the shape returned by GET /containers/{id}/json.
|
||||||
type dockerInspectEntry struct {
|
type dockerContainerDetail struct {
|
||||||
Name string `json:"Name"`
|
|
||||||
State struct {
|
State struct {
|
||||||
Status string `json:"Status"`
|
Status string `json:"Status"`
|
||||||
Running bool `json:"Running"`
|
Running bool `json:"Running"`
|
||||||
@@ -36,119 +37,122 @@ type dockerInspectEntry struct {
|
|||||||
Status string `json:"Status"`
|
Status string `json:"Status"`
|
||||||
} `json:"Health"`
|
} `json:"Health"`
|
||||||
} `json:"State"`
|
} `json:"State"`
|
||||||
Config struct {
|
}
|
||||||
Labels map[string]string `json:"Labels"`
|
|
||||||
} `json:"Config"`
|
func newDockerClient(socketPath string) *http.Client {
|
||||||
|
if socketPath == "" {
|
||||||
|
socketPath = "/var/run/docker.sock"
|
||||||
|
}
|
||||||
|
return &http.Client{
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
|
||||||
|
return (&net.Dialer{}).DialContext(ctx, "unix", socketPath)
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// CollectAll lists all containers labeled agentmon.monitor=true and collects
|
// CollectAll lists all containers labeled agentmon.monitor=true and collects
|
||||||
// a ServiceSnapshot for each.
|
// a ServiceSnapshot for each.
|
||||||
func CollectAll(ctx context.Context, cfg Config) ([]ServiceSnapshot, error) {
|
func CollectAll(ctx context.Context, cfg Config) ([]ServiceSnapshot, error) {
|
||||||
// List labeled containers (running + stopped).
|
dockerClient := newDockerClient(cfg.DockerSocket)
|
||||||
out, err := exec.CommandContext(ctx, "docker", "ps", "-a",
|
httpClient := &http.Client{Timeout: cfg.HTTPTimeout}
|
||||||
"--filter", "label=agentmon.monitor=true",
|
|
||||||
"--format", "{{json .}}",
|
filters := url.QueryEscape(`{"label":["agentmon.monitor=true"]}`)
|
||||||
).Output()
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet,
|
||||||
|
"http://localhost/v1.41/containers/json?all=1&filters="+filters, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("docker ps failed: %w", err)
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var entries []dockerPsEntry
|
resp, err := dockerClient.Do(req)
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
if err != nil {
|
||||||
if line == "" {
|
return nil, fmt.Errorf("docker API unavailable: %w", err)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
var e dockerPsEntry
|
defer resp.Body.Close()
|
||||||
if err := json.Unmarshal([]byte(line), &e); err != nil {
|
|
||||||
continue
|
var containers []dockerContainer
|
||||||
}
|
if err := json.NewDecoder(resp.Body).Decode(&containers); err != nil {
|
||||||
entries = append(entries, e)
|
return nil, fmt.Errorf("docker API parse error: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
client := &http.Client{Timeout: cfg.HTTPTimeout}
|
|
||||||
var snapshots []ServiceSnapshot
|
var snapshots []ServiceSnapshot
|
||||||
for _, e := range entries {
|
for _, c := range containers {
|
||||||
snap := collectOne(ctx, e.Names, client, cfg)
|
snapshots = append(snapshots, collectOne(ctx, c, dockerClient, httpClient, cfg))
|
||||||
snapshots = append(snapshots, snap)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return snapshots, nil
|
return snapshots, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func collectOne(ctx context.Context, name string, client *http.Client, cfg Config) ServiceSnapshot {
|
func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient *http.Client, cfg Config) ServiceSnapshot {
|
||||||
|
name := containerName(c)
|
||||||
snap := ServiceSnapshot{
|
snap := ServiceSnapshot{
|
||||||
Name: name,
|
Name: name,
|
||||||
ContainerState: "missing",
|
Role: c.Labels["agentmon.role"],
|
||||||
|
ContainerState: c.State,
|
||||||
HealthState: "none",
|
HealthState: "none",
|
||||||
Status: "down",
|
Status: "down",
|
||||||
}
|
}
|
||||||
|
|
||||||
// Inspect for detailed state.
|
// Inspect for health state and uptime (not in list response).
|
||||||
out, err := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{json .}}", name).Output()
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet,
|
||||||
if err != nil {
|
"http://localhost/v1.41/containers/"+c.ID+"/json", nil)
|
||||||
return snap
|
if err == nil {
|
||||||
}
|
if resp, err := dockerClient.Do(req); err == nil {
|
||||||
|
var detail dockerContainerDetail
|
||||||
var detail dockerInspectEntry
|
if json.NewDecoder(resp.Body).Decode(&detail) == nil {
|
||||||
if err := json.Unmarshal(out, &detail); err != nil {
|
|
||||||
return snap
|
|
||||||
}
|
|
||||||
|
|
||||||
snap.Role = detail.Config.Labels["agentmon.role"]
|
|
||||||
snap.ContainerState = detail.State.Status
|
|
||||||
|
|
||||||
if detail.State.Health != nil {
|
if detail.State.Health != nil {
|
||||||
snap.HealthState = detail.State.Health.Status
|
snap.HealthState = detail.State.Health.Status
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate uptime if running.
|
|
||||||
if detail.State.Running && detail.State.StartedAt != "" {
|
if detail.State.Running && detail.State.StartedAt != "" {
|
||||||
if t, err := time.Parse(time.RFC3339Nano, detail.State.StartedAt); err == nil {
|
if t, err := time.Parse(time.RFC3339Nano, detail.State.StartedAt); err == nil {
|
||||||
snap.UptimeSec = int64(time.Since(t).Seconds())
|
snap.UptimeSec = int64(time.Since(t).Seconds())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
resp.Body.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Role-specific probes.
|
port := c.Labels["agentmon.port"]
|
||||||
switch snap.Role {
|
switch snap.Role {
|
||||||
case "llm-proxy":
|
case "llm-proxy":
|
||||||
collectLLMProxy(ctx, &snap, client, cfg)
|
collectLLMProxy(ctx, &snap, httpClient, cfg)
|
||||||
case "search":
|
case "search":
|
||||||
collectHTTPProbe(ctx, &snap, client, "http://localhost:"+detail.Config.Labels["agentmon.port"]+"/")
|
collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/")
|
||||||
case "mcp":
|
case "mcp":
|
||||||
collectPortProbe(ctx, &snap, detail.Config.Labels["agentmon.port"])
|
collectPortProbe(&snap, port)
|
||||||
case "db", "voice", "automation":
|
|
||||||
// Docker healthcheck state is sufficient; no HTTP probe.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
snap.Status = deriveStatus(snap)
|
snap.Status = deriveStatus(snap)
|
||||||
return snap
|
return snap
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func containerName(c dockerContainer) string {
|
||||||
|
if len(c.Names) > 0 {
|
||||||
|
return strings.TrimPrefix(c.Names[0], "/")
|
||||||
|
}
|
||||||
|
return c.ID[:12]
|
||||||
|
}
|
||||||
|
|
||||||
func collectLLMProxy(ctx context.Context, snap *ServiceSnapshot, client *http.Client, cfg Config) {
|
func collectLLMProxy(ctx context.Context, snap *ServiceSnapshot, client *http.Client, cfg Config) {
|
||||||
if snap.Extra == nil {
|
if snap.Extra == nil {
|
||||||
snap.Extra = make(map[string]any)
|
snap.Extra = make(map[string]any)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Health probe.
|
|
||||||
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, cfg.LiteLLMBaseURL+"/health/liveliness", nil)
|
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, cfg.LiteLLMBaseURL+"/health/liveliness", nil)
|
||||||
resp, err := client.Do(req)
|
if resp, err := client.Do(req); err == nil {
|
||||||
if err == nil {
|
|
||||||
code := resp.StatusCode
|
code := resp.StatusCode
|
||||||
snap.HTTPStatus = &code
|
snap.HTTPStatus = &code
|
||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Model count.
|
|
||||||
if cfg.LiteLLMAPIKey != "" {
|
if cfg.LiteLLMAPIKey != "" {
|
||||||
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, cfg.LiteLLMBaseURL+"/v2/model/info", nil)
|
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, cfg.LiteLLMBaseURL+"/v2/model/info", nil)
|
||||||
req.Header.Set("Authorization", "Bearer "+cfg.LiteLLMAPIKey)
|
req.Header.Set("Authorization", "Bearer "+cfg.LiteLLMAPIKey)
|
||||||
resp, err := client.Do(req)
|
if resp, err := client.Do(req); err == nil {
|
||||||
if err == nil {
|
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
var result struct {
|
var result struct {
|
||||||
Data []struct {
|
Data []struct{} `json:"data"`
|
||||||
ModelName string `json:"model_name"`
|
|
||||||
} `json:"data"`
|
|
||||||
}
|
}
|
||||||
if json.NewDecoder(resp.Body).Decode(&result) == nil {
|
if json.NewDecoder(resp.Body).Decode(&result) == nil {
|
||||||
snap.Extra["model_count"] = len(result.Data)
|
snap.Extra["model_count"] = len(result.Data)
|
||||||
@@ -157,29 +161,29 @@ func collectLLMProxy(ctx context.Context, snap *ServiceSnapshot, client *http.Cl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func collectHTTPProbe(ctx context.Context, snap *ServiceSnapshot, client *http.Client, url string) {
|
func collectHTTPProbe(ctx context.Context, snap *ServiceSnapshot, client *http.Client, target string) {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
|
||||||
resp, err := client.Do(req)
|
if resp, err := client.Do(req); err == nil {
|
||||||
if err == nil {
|
|
||||||
code := resp.StatusCode
|
code := resp.StatusCode
|
||||||
snap.HTTPStatus = &code
|
snap.HTTPStatus = &code
|
||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
ms := time.Since(start).Milliseconds()
|
|
||||||
if snap.Extra == nil {
|
if snap.Extra == nil {
|
||||||
snap.Extra = make(map[string]any)
|
snap.Extra = make(map[string]any)
|
||||||
}
|
}
|
||||||
snap.Extra["response_ms"] = ms
|
snap.Extra["response_ms"] = time.Since(start).Milliseconds()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func collectPortProbe(ctx context.Context, snap *ServiceSnapshot, port string) {
|
func collectPortProbe(snap *ServiceSnapshot, port string) {
|
||||||
if port == "" {
|
if port == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Use nc to check TCP reachability.
|
conn, err := net.DialTimeout("tcp", "localhost:"+port, 2*time.Second)
|
||||||
err := exec.CommandContext(ctx, "nc", "-z", "-w1", "localhost", port).Run()
|
|
||||||
reachable := err == nil
|
reachable := err == nil
|
||||||
|
if conn != nil {
|
||||||
|
conn.Close()
|
||||||
|
}
|
||||||
if snap.Extra == nil {
|
if snap.Extra == nil {
|
||||||
snap.Extra = make(map[string]any)
|
snap.Extra = make(map[string]any)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user