swarm-zap/scripts/ops-sentinel.sh

#!/usr/bin/env bash
set -euo pipefail

# Lightweight operational snapshot for OpenClaw homelab.
# Output frame: Now / Soon / Watch / Next actions

# Runtime configuration.
# Every setting below may be supplied through the environment; a variable that
# is unset or empty is assigned the default shown here.

# OpenClaw CLI and backup freshness.
: "${OPENCLAW_BIN:=openclaw}"
: "${BACKUP_LOG:=memory/minio-backup.log}"
: "${BACKUP_MAX_AGE_HOURS:=8}"

# HTTP endpoints that get probed.
: "${SEARX_URL:=http://192.168.153.113:18803}"
: "${WHISPER_URL:=http://192.168.153.113:18801}"
: "${MCP_URL:=http://192.168.153.113:18802/mcp}"

# Model runtimes (Ollama and llama.cpp) and their systemd units.
: "${OLLAMA_URL:=http://192.168.153.113:18807}"
: "${OLLAMA_EMBED_MODEL:=nomic-embed-text:latest}"
: "${OLLAMA_REQUIRED:=true}"
: "${OLLAMA_SERVICE:=ollama.service}"
: "${LLAMA_CPP_SERVICE:=llama-server.service}"
: "${LLAMA_CPP_URL:=http://192.168.153.113:18806/health}"
: "${LLAMA_CPP_REQUIRED:=true}"
: "${SYSTEMD_LOCAL_CHECKS:=false}"

# Object storage and containers.
: "${MINIO_URL:=http://192.168.153.253:9000}"
: "${DOCKER_EXPECTED_CONTAINERS:=searxng whisper-server brave-search}"
: "${DOCKER_REQUIRE_EXPECTED:=false}"

# Host pressure warning thresholds, in percent used.
: "${WARN_DISK_PCT:=85}"
: "${WARN_MEM_PCT:=85}"

# Per-run artifact directory: <base>/<UTC date>/<UTC time>, where <base> is
# HEALTHCHECK_OUTPUT_DIR when set and non-empty, otherwise
# /tmp/openclaw-healthcheck.
# Both path components are taken from ONE `date` call: with two separate calls
# a run starting right at UTC midnight could pair the previous day with the
# new day's time and file its artifacts under the wrong date.
run_stamp="$(date -u +'%F %H%M%S')"
TS_DAY="${run_stamp% *}"
TS_STAMP="${run_stamp#* }"
ARTIFACT_DIR="${HEALTHCHECK_OUTPUT_DIR:-/tmp/openclaw-healthcheck}/${TS_DAY}/${TS_STAMP}"
mkdir -p -- "$ARTIFACT_DIR"

# Report state shared by every check.
# NOW / SOON / WATCH hold the message lines of the matching output section,
# NEXT holds suggested follow-up actions; P1 and P2 count findings of that
# priority.
declare -a NOW=()
declare -a SOON=()
declare -a WATCH=()
declare -a NEXT=()
P1=0
P2=0

# Append one message ($1) to the "Now" section.
add_now() {
  NOW+=("$1")
}

# Append one message ($1) to the "Soon" section.
add_soon() {
  SOON+=("$1")
}

# Append one message ($1) to the "Watch" section.
add_watch() {
  WATCH+=("$1")
}

# Append one suggested action ($1) to the "Next actions" section.
add_next() {
  NEXT+=("$1")
}

# Count one more P1 finding.
mark_p1() {
  : "$(( P1 += 1 ))"
}

# Count one more P2 finding.
mark_p2() {
  : "$(( P2 += 1 ))"
}

#######################################
# Request a URL with curl and file the outcome into the report.
# Globals:   ARTIFACT_DIR (read); the report is updated through add_now,
#            add_watch, add_next, mark_p1 and mark_p2.
# Arguments: $1 - probe name, used in messages and artifact file names
#            $2 - URL to request
#            $3 - regex the HTTP status code must match to count as OK
# Outputs:   response body -> $ARTIFACT_DIR/http-<name>.txt
#            curl stderr   -> $ARTIFACT_DIR/http-<name>.err
#######################################
http_probe() {
  local label="$1" target="$2" ok_pattern="$3"
  local body_path="$ARTIFACT_DIR/http-${label}.txt"
  local err_path="$ARTIFACT_DIR/http-${label}.err"
  local curl_out http_status elapsed

  # curl exiting non-zero is reported as unreachable (P1).
  curl_out="$(curl -sS -m 6 -o "$body_path" -w '%{http_code} %{time_total}' "$target" 2>"$err_path")" || {
    add_now "P1 ${label} unreachable (${target})"
    mark_p1
    add_next "Check ${label} service/container and LAN route"
    return
  }

  # The -w format yields "<status> <seconds>"; split on the space.
  http_status="${curl_out%% *}"
  elapsed="${curl_out##* }"

  if [[ ! "$http_status" =~ $ok_pattern ]]; then
    add_watch "P2 ${label} unexpected response (HTTP ${http_status}, ${elapsed}s)"
    mark_p2
    add_next "Validate ${label} endpoint/health semantics"
    return
  fi

  add_watch "P4 ${label} OK (HTTP ${http_status}, ${elapsed}s)"
}

# 1) OpenClaw health + security
# Each command's stdout and stderr are saved under ARTIFACT_DIR.

# Gateway health: the command must succeed AND its JSON must carry ok == true.
if ! "$OPENCLAW_BIN" health --json >"$ARTIFACT_DIR/openclaw-health.json" 2>"$ARTIFACT_DIR/openclaw-health.err"; then
  add_now "P1 Failed to run openclaw health"
  mark_p1
  add_next "Run: openclaw status && openclaw logs --follow"
elif jq -e '.ok == true' "$ARTIFACT_DIR/openclaw-health.json" >/dev/null 2>&1; then
  add_watch "P4 OpenClaw gateway health OK"
else
  add_now "P1 OpenClaw health reported not-ok"
  mark_p1
  add_next "Run: openclaw health --json"
fi

# Security audit: critical findings are P1, warnings are P2. A count that jq
# cannot read falls back to 0.
if ! "$OPENCLAW_BIN" security audit --json >"$ARTIFACT_DIR/openclaw-security-audit.json" 2>"$ARTIFACT_DIR/openclaw-security-audit.err"; then
  add_watch "P3 Security audit command failed"
  add_next "Run: openclaw security audit --json"
else
  sec_critical="$(jq -r '.summary.critical // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || echo 0)"
  sec_warn="$(jq -r '.summary.warn // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || echo 0)"

  if [[ "$sec_critical" =~ ^[0-9]+$ ]] && (( sec_critical > 0 )); then
    add_now "P1 Security audit has ${sec_critical} critical finding(s)"
    mark_p1
    add_next "Run: openclaw security audit --deep"
  fi

  if [[ "$sec_warn" =~ ^[0-9]+$ ]] && (( sec_warn > 0 )); then
    add_watch "P2 Security audit has ${sec_warn} warning(s)"
    mark_p2
    add_next "Review plugin/tool policy allowlists"
  fi
fi

# Update status: the text output is scanned (case-insensitively) for phrases
# that indicate a pending update.
if ! "$OPENCLAW_BIN" update status >"$ARTIFACT_DIR/openclaw-update-status.txt" 2>"$ARTIFACT_DIR/openclaw-update-status.err"; then
  add_watch "P3 OpenClaw update status check failed"
  add_next "Run: openclaw update status"
elif grep -Eqi 'update available|outdated|new version|available:' "$ARTIFACT_DIR/openclaw-update-status.txt"; then
  add_watch "P2 OpenClaw update available"
  mark_p2
  add_next "Review release notes, then run safe update workflow"
else
  add_watch "P4 OpenClaw update status checked"
fi

# 2) Backup freshness from minio backup log
# The newest "Backup complete:" line is expected to contain an object key of
# the form workspace-backups/YYYYMMDDTHHMMSSZ; that stamp is converted to an
# epoch and compared against BACKUP_MAX_AGE_HOURS.
if [[ ! -f "$BACKUP_LOG" ]]; then
  add_now "P1 Backup log missing: ${BACKUP_LOG}"
  mark_p1
  add_next "Create backup log or fix backup script path"
else
  # `|| true`: grep finding nothing must not abort the script under
  # `set -e` / pipefail; it simply leaves last_key empty.
  last_key="$(
    grep 'Backup complete:' "$BACKUP_LOG" \
      | tail -n1 \
      | sed -E 's#.*workspace-backups/([0-9]{8}T[0-9]{6}Z).*#\1#' \
      || true
  )"

  if [[ -z "$last_key" ]]; then
    add_now "P1 No 'Backup complete' entry found in ${BACKUP_LOG}"
    mark_p1
    add_next "Run backup and confirm completion line is written"
  else
    # Re-shape YYYYMMDDTHHMMSSZ into "YYYY-MM-DD HH:MM:SS UTC" for `date -d`.
    key_date="${last_key:0:4}-${last_key:4:2}-${last_key:6:2}"
    key_time="${last_key:9:2}:${last_key:11:2}:${last_key:13:2}"
    backup_iso="${key_date} ${key_time} UTC"
    backup_epoch="$(date -u -d "$backup_iso" +%s 2>/dev/null || echo 0)"
    now_epoch="$(date -u +%s)"

    if ! (( backup_epoch > 0 )); then
      # Epoch 0 is the fallback written when `date` could not parse the stamp.
      add_now "P1 Could not parse backup timestamp from ${BACKUP_LOG}"
      mark_p1
      add_next "Inspect backup log format or backup script output"
    else
      # Whole hours, rounded down.
      age_hours=$(( (now_epoch - backup_epoch) / 3600 ))
      if (( age_hours > BACKUP_MAX_AGE_HOURS )); then
        add_now "P1 Backup stale: last success ${age_hours}h ago (${last_key})"
        mark_p1
        add_next "Run backup job now and verify new 'Backup complete' entry"
      elif (( age_hours >= BACKUP_MAX_AGE_HOURS - 1 )); then
        add_soon "P2 Backup nearing threshold: ${age_hours}h old (${last_key})"
        mark_p2
      else
        add_watch "P4 Backup fresh (${age_hours}h old, ${last_key})"
      fi
    fi
  fi
fi

# 3) systemd service health (local model runtimes)
# Runs only when SYSTEMD_LOCAL_CHECKS is "true" and systemctl is installed.
# An inactive unit is P1 unless its *_REQUIRED flag is something other than
# "true", in which case it is only noted as optional.
if [[ "${SYSTEMD_LOCAL_CHECKS}" != "true" ]]; then
  add_watch "P3 Skipping local systemd checks (SYSTEMD_LOCAL_CHECKS=false)"
elif ! command -v systemctl >/dev/null 2>&1; then
  add_watch "P3 systemctl not available"
else
  for unit in "$OLLAMA_SERVICE" "$LLAMA_CPP_SERVICE"; do
    if systemctl is-active --quiet "$unit"; then
      add_watch "P4 systemd service active: ${unit}"
      continue
    fi

    if [[ ( "$unit" == "$OLLAMA_SERVICE" && "${OLLAMA_REQUIRED}" != "true" ) \
       || ( "$unit" == "$LLAMA_CPP_SERVICE" && "${LLAMA_CPP_REQUIRED}" != "true" ) ]]; then
      add_watch "P3 systemd service inactive: ${unit} (optional)"
    else
      add_now "P1 systemd service inactive: ${unit}"
      mark_p1
      add_next "Check unit: systemctl status ${unit}"
    fi
  done
fi

# 4) Docker service health (containers + health state)
# Every name in DOCKER_EXPECTED_CONTAINERS (whitespace-separated) is looked up
# in the saved `docker ps` listing (one "<name><TAB><status>" line per
# container).
# The name is compared as an exact string and the health keywords are matched
# against the status column only. Interpolating the name into a regex would
# let "my.app" match "my-app", and grepping the whole line would flag a
# container merely because its *name* contains e.g. "dead" or "exited".
if command -v docker >/dev/null 2>&1; then
  if docker ps --format '{{.Names}}\t{{.Status}}' >"$ARTIFACT_DIR/docker-ps.txt" 2>"$ARTIFACT_DIR/docker-ps.err"; then
    # Intentionally unquoted: the setting is a whitespace-separated list.
    # shellcheck disable=SC2086
    for svc in $DOCKER_EXPECTED_CONTAINERS; do
      svc_found=false
      svc_status=""
      # First listing line whose name column equals the expected name wins.
      while IFS=$'\t' read -r ps_name ps_status || [[ -n "$ps_name" ]]; do
        if [[ "$ps_name" == "$svc" ]]; then
          svc_found=true
          svc_status="$ps_status"
          break
        fi
      done <"$ARTIFACT_DIR/docker-ps.txt"

      if [[ "$svc_found" == "true" ]]; then
        if grep -qiE 'unhealthy|dead|exited|restarting' <<<"$svc_status"; then
          add_now "P1 Docker container ${svc} unhealthy (${svc_status})"
          mark_p1
          add_next "Check logs: docker logs --tail=200 ${svc}"
        else
          add_watch "P4 Docker container ${svc} running"
        fi
      elif [[ "${DOCKER_REQUIRE_EXPECTED}" == "true" ]]; then
        add_now "P1 Docker container missing: ${svc}"
        mark_p1
        add_next "Start or restore container: ${svc}"
      else
        add_watch "P3 Docker container not found locally: ${svc} (may run on another host)"
      fi
    done
  else
    add_watch "P3 docker ps failed"
    add_next "Check Docker daemon permissions/state"
  fi
else
  add_watch "P3 docker CLI not available"
fi

# 5) Key LAN + local service probes
http_probe "searxng" "$SEARX_URL" '^200$'
http_probe "whisper" "$WHISPER_URL" '^200$'
http_probe "brave-mcp" "$MCP_URL" '^(200|406)$'
http_probe "minio-live" "${MINIO_URL%/}/minio/health/live" '^200$'

# llama.cpp: any HTTP status other than curl's "000" placeholder counts as
# responsive. If the configured URL gives nothing, the same URL with a trailing
# "/health" removed is tried before the endpoint is declared unreachable.
llama_code="$(curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-health.txt" -w '%{http_code}' "$LLAMA_CPP_URL" 2>"$ARTIFACT_DIR/llamacpp-health.err" || true)"
if [[ -z "$llama_code" || "$llama_code" == "000" ]]; then
  llama_root="${LLAMA_CPP_URL%/health}"
  llama_code_root="$(curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-root.txt" -w '%{http_code}' "$llama_root" 2>"$ARTIFACT_DIR/llamacpp-root.err" || true)"
  if [[ -z "$llama_code_root" || "$llama_code_root" == "000" ]]; then
    if [[ "${LLAMA_CPP_REQUIRED}" != "true" ]]; then
      add_watch "P3 llama.cpp endpoint unreachable at configured URL (${LLAMA_CPP_URL})"
    else
      add_now "P1 llama.cpp endpoint unreachable (${LLAMA_CPP_URL})"
      mark_p1
      add_next "Check llama.cpp bind address/port and service logs"
    fi
  else
    add_watch "P4 llama.cpp host reachable (${llama_root}, HTTP ${llama_code_root})"
  fi
else
  add_watch "P4 llama.cpp endpoint responsive (${LLAMA_CPP_URL}, HTTP ${llama_code})"
fi

# 6) Ollama embeddings availability + target model
# The /api/tags response is saved under ARTIFACT_DIR and searched with jq for
# a model whose name equals OLLAMA_EMBED_MODEL. A failed request is P1 only
# when OLLAMA_REQUIRED is "true".
if ! curl -sS -m 6 "${OLLAMA_URL%/}/api/tags" >"$ARTIFACT_DIR/ollama-tags.json" 2>"$ARTIFACT_DIR/ollama-tags.err"; then
  if [[ "${OLLAMA_REQUIRED}" != "true" ]]; then
    add_watch "P3 Ollama unreachable at configured URL (${OLLAMA_URL}); set OLLAMA_URL if remote"
  else
    add_now "P1 Ollama unreachable (${OLLAMA_URL})"
    mark_p1
    add_next "Check Ollama service and port (default 11434)"
  fi
elif jq -e --arg model "$OLLAMA_EMBED_MODEL" '.models[]? | select(.name == $model)' "$ARTIFACT_DIR/ollama-tags.json" >/dev/null 2>&1; then
  add_watch "P4 Ollama up; embedding model present (${OLLAMA_EMBED_MODEL})"
else
  add_soon "P2 Ollama up but embedding model missing (${OLLAMA_EMBED_MODEL})"
  mark_p2
  add_next "Pull model: ollama pull ${OLLAMA_EMBED_MODEL}"
fi

# 7) Host pressure: disk + memory
# Root filesystem usage is the "Capacity" column (field 5) of `df -P /`, with
# the percent sign removed. 95% and above is P1; WARN_DISK_PCT and above is P2.
# A failing `df` must not be turned into a made-up "0% used / normal" reading,
# so the value is left empty on failure (`|| true` only keeps `set -e` and
# pipefail from aborting) and an unusable value is reported as P3 instead.
root_disk_pct="$(df -P / 2>"$ARTIFACT_DIR/df-root.err" | awk 'NR==2 {gsub(/%/,"",$5); print $5}' || true)"
if [[ "$root_disk_pct" =~ ^[0-9]+$ ]]; then
  if (( root_disk_pct >= 95 )); then
    add_now "P1 Root disk critical: ${root_disk_pct}% used"
    mark_p1
    add_next "Free disk space urgently"
  elif (( root_disk_pct >= WARN_DISK_PCT )); then
    add_soon "P2 Root disk high: ${root_disk_pct}% used"
    mark_p2
    add_next "Prune logs/artifacts and monitor growth"
  else
    add_watch "P4 Root disk normal: ${root_disk_pct}% used"
  fi
else
  add_watch "P3 Could not determine root disk usage"
  add_next "Run: df -P /"
fi

# Memory usage is derived from /proc/meminfo as
# (MemTotal - MemAvailable) / MemTotal. Hosts without a readable /proc/meminfo
# are skipped; a readable file that lacks usable values is reported as P3.
if [[ -r /proc/meminfo ]]; then
  mem_total_kb="$(awk '/MemTotal:/ {print $2}' /proc/meminfo)"
  mem_avail_kb="$(awk '/MemAvailable:/ {print $2}' /proc/meminfo)"
  if [[ "$mem_total_kb" =~ ^[0-9]+$ ]] && [[ "$mem_avail_kb" =~ ^[0-9]+$ ]] && (( mem_total_kb > 0 )); then
    mem_used_pct=$(( (100 * (mem_total_kb - mem_avail_kb)) / mem_total_kb ))
    if (( mem_used_pct >= 95 )); then
      add_now "P1 Memory pressure critical: ${mem_used_pct}% used"
      mark_p1
      add_next "Inspect heavy processes / reduce workload"
    elif (( mem_used_pct >= WARN_MEM_PCT )); then
      add_soon "P2 Memory pressure high: ${mem_used_pct}% used"
      mark_p2
      add_next "Check workload spikes and tune limits"
    else
      add_watch "P4 Memory normal: ${mem_used_pct}% used"
    fi
  else
    add_watch "P3 Could not determine memory usage from /proc/meminfo"
  fi
fi

# Overall verdict, which also decides the exit status:
#   any P1 finding -> NEEDS_ATTENTION (exit 2)
#   else any P2    -> MONITOR         (exit 1)
#   otherwise      -> OK              (exit 0)
if (( P1 > 0 )); then
  VERDICT="NEEDS_ATTENTION"
  EXIT_CODE=2
elif (( P2 > 0 )); then
  VERDICT="MONITOR"
  EXIT_CODE=1
else
  VERDICT="OK"
  EXIT_CODE=0
fi

# Print the report and keep a copy in the artifact directory. Each section
# prints a placeholder line when it has no entries.
{
  printf 'Verdict: %s\n' "$VERDICT"
  printf 'Counts: p1=%s p2=%s\n' "$P1" "$P2"
  printf 'Artifact path: %s\n' "$ARTIFACT_DIR"

  printf '\nNow:\n'
  if (( ${#NOW[@]} > 0 )); then
    printf -- '- %s\n' "${NOW[@]}"
  else
    printf -- '- P4 Nothing urgent\n'
  fi

  printf '\nSoon:\n'
  if (( ${#SOON[@]} > 0 )); then
    printf -- '- %s\n' "${SOON[@]}"
  else
    printf -- '- P4 No near-term risks\n'
  fi

  printf '\nWatch:\n'
  if (( ${#WATCH[@]} > 0 )); then
    printf -- '- %s\n' "${WATCH[@]}"
  else
    printf -- '- P4 No watch items\n'
  fi

  printf '\nNext actions:\n'
  if (( ${#NEXT[@]} > 0 )); then
    # Repeated suggestions are printed once, in first-seen order.
    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++ { print "- " $0 }'
  else
    printf -- '- Keep current cadence\n'
  fi
} | tee "$ARTIFACT_DIR/summary.txt"

exit "$EXIT_CODE"