Files
swarm-zap/scripts/ops-sentinel.sh

317 lines
12 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# Lightweight operational snapshot for OpenClaw homelab.
# Output frame: Now / Soon / Watch / Next actions
#
# Every setting below may be overridden from the environment; the value after
# ":-" is used when the variable is unset or empty.

#######################################
# Force a global setting to hold a plain base-10 non-negative integer.
# The thresholds below are later compared inside (( )) conditions; a value
# such as "85%" or a zero-padded "08" makes such a comparison error out and
# evaluate false, which would silently switch the threshold off.
# Arguments: $1 - name of the global variable to normalise
#            $2 - default to fall back to when the current value is invalid
# Outputs:   a warning on stderr when the fallback is applied
#######################################
ensure_uint() {
  local var_name="$1" fallback="$2"
  local raw="${!var_name}"
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # 10# forces decimal so leading zeros are not interpreted as octal.
    printf -v "$var_name" '%d' "$((10#$raw))"
  else
    printf 'ops-sentinel: %s=%s is not a non-negative integer; using default %s\n' \
      "$var_name" "$raw" "$fallback" >&2
    printf -v "$var_name" '%s' "$fallback"
  fi
}

# OpenClaw CLI used for the health, security-audit and update-status checks.
OPENCLAW_BIN="${OPENCLAW_BIN:-openclaw}"

# Backup freshness: log file scanned for "Backup complete:" lines and the
# maximum tolerated age (in hours) of the newest one.
BACKUP_LOG="${BACKUP_LOG:-memory/minio-backup.log}"
BACKUP_MAX_AGE_HOURS="${BACKUP_MAX_AGE_HOURS:-8}"

# HTTP endpoints that get probed.
SEARX_URL="${SEARX_URL:-http://192.168.153.113:18803}"
WHISPER_URL="${WHISPER_URL:-http://192.168.153.113:18801}"
MCP_URL="${MCP_URL:-http://192.168.153.113:18802/mcp}"
OLLAMA_URL="${OLLAMA_URL:-http://192.168.153.113:18807}"
OLLAMA_EMBED_MODEL="${OLLAMA_EMBED_MODEL:-nomic-embed-text:latest}"
# When "true", an unreachable Ollama (or its inactive unit) is reported as P1.
OLLAMA_REQUIRED="${OLLAMA_REQUIRED:-true}"
OLLAMA_SERVICE="${OLLAMA_SERVICE:-ollama.service}"
LLAMA_CPP_SERVICE="${LLAMA_CPP_SERVICE:-llama-server.service}"
LLAMA_CPP_URL="${LLAMA_CPP_URL:-http://192.168.153.113:18806/health}"
# When "true", an unreachable llama.cpp (or its inactive unit) is reported as P1.
LLAMA_CPP_REQUIRED="${LLAMA_CPP_REQUIRED:-true}"
# The systemd unit checks only run when this is exactly "true".
SYSTEMD_LOCAL_CHECKS="${SYSTEMD_LOCAL_CHECKS:-false}"
MINIO_URL="${MINIO_URL:-http://192.168.153.253:9000}"

# Whitespace-separated container names looked up in `docker ps`; a missing one
# is only a P1 when DOCKER_REQUIRE_EXPECTED is "true".
DOCKER_EXPECTED_CONTAINERS="${DOCKER_EXPECTED_CONTAINERS:-searxng whisper-server brave-search}"
DOCKER_REQUIRE_EXPECTED="${DOCKER_REQUIRE_EXPECTED:-false}"

# Percent-used levels at which root disk / memory are raised to P2.
WARN_DISK_PCT="${WARN_DISK_PCT:-85}"
WARN_MEM_PCT="${WARN_MEM_PCT:-85}"

# Normalise every value that is later used in arithmetic comparisons.
ensure_uint BACKUP_MAX_AGE_HOURS 8
ensure_uint WARN_DISK_PCT 85
ensure_uint WARN_MEM_PCT 85

# Per-run artifact directory: <base>/<UTC date>/<UTC HHMMSS>.
TS_DAY="$(date -u +%F)"
TS_STAMP="$(date -u +%H%M%S)"
ARTIFACT_DIR="${HEALTHCHECK_OUTPUT_DIR:-/tmp/openclaw-healthcheck}/${TS_DAY}/${TS_STAMP}"
mkdir -p "$ARTIFACT_DIR"
# Report buckets: each holds the message lines printed under the section of
# the same name in the final summary.
declare -a NOW=() SOON=() WATCH=() NEXT=()

# Running tallies of P1 / P2 findings; they decide the verdict and exit code.
P1=0
P2=0

# Append the message in $1 to the matching bucket.
add_now() {
  NOW[${#NOW[@]}]="$1"
}

add_soon() {
  SOON[${#SOON[@]}]="$1"
}

add_watch() {
  WATCH[${#WATCH[@]}]="$1"
}

add_next() {
  NEXT[${#NEXT[@]}]="$1"
}

# Count one more finding of the given priority.
mark_p1() {
  : "$((P1 += 1))"
}

mark_p2() {
  : "$((P2 += 1))"
}
#######################################
# Probe one HTTP endpoint and file the outcome into the report buckets.
# Globals:   ARTIFACT_DIR (read); the response body goes to http-<name>.txt
#            and curl's stderr to http-<name>.err inside it
# Arguments: $1 - short service name used in messages and artifact file names
#            $2 - URL to request (curl is given a 6 second limit)
#            $3 - regex the HTTP status code must match to count as OK
# Outputs:   nothing on stdout; records via add_now/add_watch/add_next and
#            bumps the P1 (transfer failed) or P2 (unexpected status) tally
#######################################
http_probe() {
  local name="$1" url="$2" expected_regex="$3"
  local body_path="$ARTIFACT_DIR/http-${name}.txt"
  local stderr_path="$ARTIFACT_DIR/http-${name}.err"
  local curl_out status elapsed

  # -w prints "<http_code> <time_total>"; a non-zero curl exit means the
  # request itself failed.
  curl_out="$(curl -sS -m 6 -o "$body_path" -w '%{http_code} %{time_total}' "$url" 2>"$stderr_path")" || {
    add_now "P1 ${name} unreachable (${url})"
    mark_p1
    add_next "Check ${name} service/container and LAN route"
    return
  }

  status="${curl_out%% *}"
  elapsed="${curl_out##* }"

  if ! [[ "$status" =~ $expected_regex ]]; then
    add_watch "P2 ${name} unexpected response (HTTP ${status}, ${elapsed}s)"
    mark_p2
    add_next "Validate ${name} endpoint/health semantics"
    return
  fi

  add_watch "P4 ${name} OK (HTTP ${status}, ${elapsed}s)"
}
# 1) OpenClaw health + security
# Gateway health: the CLI has to exit 0 and its JSON must carry ok == true.
oc_health_json="$ARTIFACT_DIR/openclaw-health.json"
if ! "$OPENCLAW_BIN" health --json >"$oc_health_json" 2>"$ARTIFACT_DIR/openclaw-health.err"; then
  add_now "P1 Failed to run openclaw health"
  mark_p1
  add_next "Run: openclaw status && openclaw logs --follow"
elif jq -e '.ok == true' "$oc_health_json" >/dev/null 2>&1; then
  add_watch "P4 OpenClaw gateway health OK"
else
  add_now "P1 OpenClaw health reported not-ok"
  mark_p1
  add_next "Run: openclaw health --json"
fi

# Security audit: read the critical / warn counters from .summary; when jq
# fails, `echo 0` supplies a zero instead.
oc_audit_json="$ARTIFACT_DIR/openclaw-security-audit.json"
if ! "$OPENCLAW_BIN" security audit --json >"$oc_audit_json" 2>"$ARTIFACT_DIR/openclaw-security-audit.err"; then
  add_watch "P3 Security audit command failed"
  add_next "Run: openclaw security audit --json"
else
  audit_critical="$(jq -r '.summary.critical // 0' "$oc_audit_json" 2>/dev/null || echo 0)"
  audit_warn="$(jq -r '.summary.warn // 0' "$oc_audit_json" 2>/dev/null || echo 0)"

  # Only a purely numeric counter is compared; anything else is ignored.
  if [[ "$audit_critical" =~ ^[0-9]+$ ]]; then
    if (( audit_critical > 0 )); then
      add_now "P1 Security audit has ${audit_critical} critical finding(s)"
      mark_p1
      add_next "Run: openclaw security audit --deep"
    fi
  fi

  if [[ "$audit_warn" =~ ^[0-9]+$ ]]; then
    if (( audit_warn > 0 )); then
      add_watch "P2 Security audit has ${audit_warn} warning(s)"
      mark_p2
      add_next "Review plugin/tool policy allowlists"
    fi
  fi
fi

# Update status: the command prints free text, so an available update is
# detected by a case-insensitive keyword match on its output.
oc_update_txt="$ARTIFACT_DIR/openclaw-update-status.txt"
if ! "$OPENCLAW_BIN" update status >"$oc_update_txt" 2>"$ARTIFACT_DIR/openclaw-update-status.err"; then
  add_watch "P3 OpenClaw update status check failed"
  add_next "Run: openclaw update status"
elif grep -Eqi 'update available|outdated|new version|available:' "$oc_update_txt"; then
  add_watch "P2 OpenClaw update available"
  mark_p2
  add_next "Review release notes, then run safe update workflow"
else
  add_watch "P4 OpenClaw update status checked"
fi
# 2) Backup freshness from minio backup log
#######################################
# Grade the age of the newest "Backup complete:" entry in BACKUP_LOG.
# Globals:   BACKUP_LOG, BACKUP_MAX_AGE_HOURS (read)
# Outputs:   nothing on stdout; records via add_now/add_soon/add_watch/add_next
#            and bumps the P1/P2 tallies
#######################################
check_backup_freshness() {
  local newest_key stamp_date stamp_time stamp_text
  local stamp_epoch current_epoch hours_old

  if [[ ! -f "$BACKUP_LOG" ]]; then
    add_now "P1 Backup log missing: ${BACKUP_LOG}"
    mark_p1
    add_next "Create backup log or fix backup script path"
    return
  fi

  # Take the last completion line and reduce it to its
  # workspace-backups/<YYYYMMDD>T<HHMMSS>Z key. `|| true` keeps a no-match grep
  # from tripping `set -e`/pipefail.
  newest_key="$(grep 'Backup complete:' "$BACKUP_LOG" | tail -n1 | sed -E 's#.*workspace-backups/([0-9]{8}T[0-9]{6}Z).*#\1#' || true)"
  if [[ -z "$newest_key" ]]; then
    add_now "P1 No 'Backup complete' entry found in ${BACKUP_LOG}"
    mark_p1
    add_next "Run backup and confirm completion line is written"
    return
  fi

  # Re-shape the compact key into "YYYY-MM-DD HH:MM:SS UTC" for `date -d`.
  stamp_date="${newest_key:0:4}-${newest_key:4:2}-${newest_key:6:2}"
  stamp_time="${newest_key:9:2}:${newest_key:11:2}:${newest_key:13:2}"
  stamp_text="${stamp_date} ${stamp_time} UTC"
  # A string `date` cannot parse becomes epoch 0, handled right below.
  stamp_epoch="$(date -u -d "$stamp_text" +%s 2>/dev/null || echo 0)"
  current_epoch="$(date -u +%s)"

  if (( stamp_epoch <= 0 )); then
    add_now "P1 Could not parse backup timestamp from ${BACKUP_LOG}"
    mark_p1
    add_next "Inspect backup log format or backup script output"
    return
  fi

  # Whole hours since the last successful backup (integer division).
  hours_old=$(( (current_epoch - stamp_epoch) / 3600 ))
  if (( hours_old > BACKUP_MAX_AGE_HOURS )); then
    add_now "P1 Backup stale: last success ${hours_old}h ago (${newest_key})"
    mark_p1
    add_next "Run backup job now and verify new 'Backup complete' entry"
  elif (( hours_old >= BACKUP_MAX_AGE_HOURS - 1 )); then
    # Within one hour of the limit: raise it early.
    add_soon "P2 Backup nearing threshold: ${hours_old}h old (${newest_key})"
    mark_p2
  else
    add_watch "P4 Backup fresh (${hours_old}h old, ${newest_key})"
  fi
}
check_backup_freshness
# 3) systemd service health (local model runtimes)
# Runs only when SYSTEMD_LOCAL_CHECKS is exactly "true" and systemctl exists.
if [[ "${SYSTEMD_LOCAL_CHECKS}" != "true" ]]; then
  add_watch "P3 Skipping local systemd checks (SYSTEMD_LOCAL_CHECKS=false)"
elif ! command -v systemctl >/dev/null 2>&1; then
  add_watch "P3 systemctl not available"
else
  for svc_unit in "$OLLAMA_SERVICE" "$LLAMA_CPP_SERVICE"; do
    if systemctl is-active --quiet "$svc_unit"; then
      add_watch "P4 systemd service active: ${svc_unit}"
      continue
    fi

    # An inactive unit is downgraded to P3 when its *_REQUIRED flag is not "true".
    if [[ "$svc_unit" == "$OLLAMA_SERVICE" && "${OLLAMA_REQUIRED}" != "true" ]] ||
      [[ "$svc_unit" == "$LLAMA_CPP_SERVICE" && "${LLAMA_CPP_REQUIRED}" != "true" ]]; then
      add_watch "P3 systemd service inactive: ${svc_unit} (optional)"
    else
      add_now "P1 systemd service inactive: ${svc_unit}"
      mark_p1
      add_next "Check unit: systemctl status ${svc_unit}"
    fi
  done
fi
# 4) Docker service health (containers + health state)
# Snapshot `docker ps` as "<name><TAB><status>" rows, then look up every
# expected container in that snapshot.
if ! command -v docker >/dev/null 2>&1; then
  add_watch "P3 docker CLI not available"
elif ! docker ps --format '{{.Names}}\t{{.Status}}' >"$ARTIFACT_DIR/docker-ps.txt" 2>"$ARTIFACT_DIR/docker-ps.err"; then
  add_watch "P3 docker ps failed"
  add_next "Check Docker daemon permissions/state"
else
  # The expected-container list is whitespace-separated on purpose.
  # shellcheck disable=SC2086
  for svc in $DOCKER_EXPECTED_CONTAINERS; do
    # Compare the name column as an exact string (not as a regex built from
    # the name) and keep only the status column, so neither metacharacters in
    # a name nor a name that happens to contain "dead"/"exited" can skew the
    # result. awk exits non-zero when no row carries this name.
    if svc_status="$(awk -F '\t' -v want="$svc" '
        ($1 "") == (want "") { print $2; hit = 1; exit }
        END { exit (hit ? 0 : 1) }
      ' "$ARTIFACT_DIR/docker-ps.txt")"; then
      # NOTE(review): plain `docker ps` lists only running containers, so
      # "exited"/"dead" are unlikely to show up here and a stopped container
      # lands in the "missing" branch below; confirm whether `-a` was intended.
      if grep -qiE 'unhealthy|dead|exited|restarting' <<<"$svc_status"; then
        add_now "P1 Docker container ${svc} unhealthy (${svc_status})"
        mark_p1
        add_next "Check logs: docker logs --tail=200 ${svc}"
      else
        add_watch "P4 Docker container ${svc} running"
      fi
    elif [[ "${DOCKER_REQUIRE_EXPECTED}" == "true" ]]; then
      add_now "P1 Docker container missing: ${svc}"
      mark_p1
      add_next "Start or restore container: ${svc}"
    else
      add_watch "P3 Docker container not found locally: ${svc} (may run on another host)"
    fi
  done
fi
# 5) Key LAN + local service probes
# Arguments: name, URL, regex of acceptable HTTP status codes.
http_probe "searxng" "$SEARX_URL" '^200$'
http_probe "whisper" "$WHISPER_URL" '^200$'
http_probe "brave-mcp" "$MCP_URL" '^(200|406)$'
http_probe "minio-live" "${MINIO_URL%/}/minio/health/live" '^200$'

# llama.cpp: any HTTP status other than curl's "000" placeholder counts as
# responsive. `|| true` keeps a failed transfer from aborting under `set -e`.
llama_status="$(curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-health.txt" -w '%{http_code}' "$LLAMA_CPP_URL" 2>"$ARTIFACT_DIR/llamacpp-health.err" || true)"
if [[ -n "$llama_status" && "$llama_status" != "000" ]]; then
  add_watch "P4 llama.cpp endpoint responsive (${LLAMA_CPP_URL}, HTTP ${llama_status})"
else
  # Second attempt against the same URL with a trailing "/health" removed.
  llama_base="${LLAMA_CPP_URL%/health}"
  llama_base_status="$(curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-root.txt" -w '%{http_code}' "$llama_base" 2>"$ARTIFACT_DIR/llamacpp-root.err" || true)"
  if [[ -z "$llama_base_status" || "$llama_base_status" == "000" ]]; then
    # Neither URL answered; severity depends on LLAMA_CPP_REQUIRED.
    if [[ "${LLAMA_CPP_REQUIRED}" == "true" ]]; then
      add_now "P1 llama.cpp endpoint unreachable (${LLAMA_CPP_URL})"
      mark_p1
      add_next "Check llama.cpp bind address/port and service logs"
    else
      add_watch "P3 llama.cpp endpoint unreachable at configured URL (${LLAMA_CPP_URL})"
    fi
  else
    add_watch "P4 llama.cpp host reachable (${llama_base}, HTTP ${llama_base_status})"
  fi
fi
# 6) Ollama embeddings availability + target model
# Fetch the model list from /api/tags, then look for the embedding model.
ollama_tags_json="$ARTIFACT_DIR/ollama-tags.json"
if ! curl -sS -m 6 "${OLLAMA_URL%/}/api/tags" >"$ollama_tags_json" 2>"$ARTIFACT_DIR/ollama-tags.err"; then
  # Request failed; severity depends on OLLAMA_REQUIRED.
  if [[ "${OLLAMA_REQUIRED}" == "true" ]]; then
    add_now "P1 Ollama unreachable (${OLLAMA_URL})"
    mark_p1
    add_next "Check Ollama service and port (default 11434)"
  else
    add_watch "P3 Ollama unreachable at configured URL (${OLLAMA_URL}); set OLLAMA_URL if remote"
  fi
# Accept the configured name either verbatim or with ":latest" appended, so an
# untagged OLLAMA_EMBED_MODEL still matches the "<name>:latest" entry that the
# tag list is expected to carry for it (Ollama defaults a missing tag to
# "latest"). jq -e exits non-zero when nothing matches.
elif jq -e --arg model "$OLLAMA_EMBED_MODEL" \
  '.models[]? | select(.name == $model or .name == ($model + ":latest"))' \
  "$ollama_tags_json" >/dev/null 2>&1; then
  add_watch "P4 Ollama up; embedding model present (${OLLAMA_EMBED_MODEL})"
else
  add_soon "P2 Ollama up but embedding model missing (${OLLAMA_EMBED_MODEL})"
  mark_p2
  add_next "Pull model: ollama pull ${OLLAMA_EMBED_MODEL}"
fi
# 7) Host pressure: disk + memory
# Root filesystem: the percent-used column of `df -P /` with the "%" removed.
disk_used_pct="$(df -P / | awk 'NR==2 {gsub(/%/,"",$5); print $5}' 2>/dev/null || echo 0)"
if [[ ! "$disk_used_pct" =~ ^[0-9]+$ ]]; then
  : # Not a plain number: no disk line is reported.
elif (( disk_used_pct >= 95 )); then
  add_now "P1 Root disk critical: ${disk_used_pct}% used"
  mark_p1
  add_next "Free disk space urgently"
elif (( disk_used_pct >= WARN_DISK_PCT )); then
  add_soon "P2 Root disk high: ${disk_used_pct}% used"
  mark_p2
  add_next "Prune logs/artifacts and monitor growth"
else
  add_watch "P4 Root disk normal: ${disk_used_pct}% used"
fi

# Memory: used% = (MemTotal - MemAvailable) / MemTotal, taken from
# /proc/meminfo. The check is skipped when the file is unreadable or either
# figure is missing or non-numeric.
if [[ -r /proc/meminfo ]]; then
  ram_total_kb="$(awk '/MemTotal:/ {print $2}' /proc/meminfo)"
  ram_avail_kb="$(awk '/MemAvailable:/ {print $2}' /proc/meminfo)"
  if [[ "$ram_total_kb" =~ ^[0-9]+$ && "$ram_avail_kb" =~ ^[0-9]+$ ]] && (( ram_total_kb > 0 )); then
    ram_used_kb=$(( ram_total_kb - ram_avail_kb ))
    ram_used_pct=$(( 100 * ram_used_kb / ram_total_kb ))
    if (( ram_used_pct >= 95 )); then
      add_now "P1 Memory pressure critical: ${ram_used_pct}% used"
      mark_p1
      add_next "Inspect heavy processes / reduce workload"
    elif (( ram_used_pct >= WARN_MEM_PCT )); then
      add_soon "P2 Memory pressure high: ${ram_used_pct}% used"
      mark_p2
      add_next "Check workload spikes and tune limits"
    else
      add_watch "P4 Memory normal: ${ram_used_pct}% used"
    fi
  fi
fi
# Overall verdict: any P1 -> NEEDS_ATTENTION (exit 2); otherwise any P2 ->
# MONITOR (exit 1); otherwise OK (exit 0).
if (( P1 > 0 )); then
  VERDICT="NEEDS_ATTENTION"
  EXIT_CODE=2
elif (( P2 > 0 )); then
  VERDICT="MONITOR"
  EXIT_CODE=1
else
  VERDICT="OK"
  EXIT_CODE=0
fi

# Print the report and keep a copy as summary.txt in the artifact directory.
# Each section lists its bucket, or a placeholder line when the bucket is empty.
{
  printf '%s\n' \
    "Verdict: ${VERDICT}" \
    "Counts: p1=${P1} p2=${P2}" \
    "Artifact path: ${ARTIFACT_DIR}" \
    ""

  printf '%s\n' "Now:"
  if (( ${#NOW[@]} > 0 )); then
    printf -- '- %s\n' "${NOW[@]}"
  else
    printf '%s\n' "- P4 Nothing urgent"
  fi

  printf '\n%s\n' "Soon:"
  if (( ${#SOON[@]} > 0 )); then
    printf -- '- %s\n' "${SOON[@]}"
  else
    printf '%s\n' "- P4 No near-term risks"
  fi

  printf '\n%s\n' "Watch:"
  if (( ${#WATCH[@]} > 0 )); then
    printf -- '- %s\n' "${WATCH[@]}"
  else
    printf '%s\n' "- P4 No watch items"
  fi

  printf '\n%s\n' "Next actions:"
  if (( ${#NEXT[@]} > 0 )); then
    # Drop repeated actions while keeping first-seen order, then bullet them.
    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
  else
    printf '%s\n' "- Keep current cadence"
  fi
} | tee "$ARTIFACT_DIR/summary.txt"

exit "$EXIT_CODE"