#!/usr/bin/env bash
set -euo pipefail

# Lightweight operational snapshot for OpenClaw homelab.
# Output frame: Now / Soon / Watch / Next actions
#
# Runs a fixed series of checks (OpenClaw CLI, backup log freshness, systemd
# units, Docker containers, HTTP endpoints, Ollama model, disk and memory),
# collects findings into four buckets, prints a summary to stdout and stores
# that summary plus the raw output of every command under ARTIFACT_DIR.
#
# Findings carry a priority label P1..P4 in their text. Only P1 and P2
# findings that call mark_p1/mark_p2 influence the verdict.
#
# Exit status:
#   0  Verdict OK              (no counted P1/P2 findings)
#   1  Verdict MONITOR         (at least one P2, no P1)
#   2  Verdict NEEDS_ATTENTION (at least one P1)
#
# External tools used: curl, jq, grep, sed, awk, df, tee, date (with the
# `-d` option for parsing a timestamp); systemctl and docker are optional.
#
# Every setting below can be overridden through the environment.

OPENCLAW_BIN="${OPENCLAW_BIN:-openclaw}"
BACKUP_LOG="${BACKUP_LOG:-memory/minio-backup.log}"
BACKUP_MAX_AGE_HOURS="${BACKUP_MAX_AGE_HOURS:-8}"
SEARX_URL="${SEARX_URL:-http://192.168.153.113:18803}"
WHISPER_URL="${WHISPER_URL:-http://192.168.153.113:18801}"
MCP_URL="${MCP_URL:-http://192.168.153.113:18802/mcp}"
OLLAMA_URL="${OLLAMA_URL:-http://192.168.153.113:18807}"
OLLAMA_EMBED_MODEL="${OLLAMA_EMBED_MODEL:-nomic-embed-text:latest}"
OLLAMA_REQUIRED="${OLLAMA_REQUIRED:-true}"
OLLAMA_SERVICE="${OLLAMA_SERVICE:-ollama.service}"
LLAMA_CPP_SERVICE="${LLAMA_CPP_SERVICE:-llama-server.service}"
LLAMA_CPP_URL="${LLAMA_CPP_URL:-http://192.168.153.113:18806/health}"
LLAMA_CPP_REQUIRED="${LLAMA_CPP_REQUIRED:-true}"
SYSTEMD_LOCAL_CHECKS="${SYSTEMD_LOCAL_CHECKS:-false}"
MINIO_URL="${MINIO_URL:-http://192.168.153.253:9000}"
DOCKER_EXPECTED_CONTAINERS="${DOCKER_EXPECTED_CONTAINERS:-searxng whisper-server brave-search}"
DOCKER_REQUIRE_EXPECTED="${DOCKER_REQUIRE_EXPECTED:-false}"
WARN_DISK_PCT="${WARN_DISK_PCT:-85}"
WARN_MEM_PCT="${WARN_MEM_PCT:-85}"

# Read the clock once: two separate `date` calls could straddle midnight UTC
# and combine yesterday's day with today's time in the artifact path.
ts_now="$(date -u '+%F %H%M%S')"
TS_DAY="${ts_now% *}"
TS_STAMP="${ts_now#* }"
ARTIFACT_DIR="${HEALTHCHECK_OUTPUT_DIR:-/tmp/openclaw-healthcheck}/${TS_DAY}/${TS_STAMP}"
mkdir -p "$ARTIFACT_DIR"

# Finding buckets (printed in this order) and counters that drive the verdict.
NOW=()
SOON=()
WATCH=()
NEXT=()
P1=0
P2=0

# Append one message ($1) to the corresponding bucket.
add_now() { NOW+=("$1"); }
add_soon() { SOON+=("$1"); }
add_watch() { WATCH+=("$1"); }
add_next() { NEXT+=("$1"); }

# Count one P1 / P2 finding.
mark_p1() { P1=$((P1 + 1)); }
mark_p2() { P2=$((P2 + 1)); }

#######################################
# Probe an HTTP endpoint and record the outcome.
# Globals:   ARTIFACT_DIR (read); NOW, WATCH, NEXT, P1, P2 (written via helpers)
# Arguments: $1 - short probe name (also used in artifact file names)
#            $2 - URL to request
#            $3 - extended regex the HTTP status code must match
# Outputs:   response body  -> $ARTIFACT_DIR/http-<name>.txt
#            curl's stderr  -> $ARTIFACT_DIR/http-<name>.err
# Returns:   0 in all cases; failures are recorded as findings.
#######################################
http_probe() {
  local name="$1" url="$2" expected_regex="$3"
  local out_file="$ARTIFACT_DIR/http-${name}.txt"
  local result code ttotal

  # curl exiting non-zero (connect error, timeout, ...) means unreachable.
  if ! result="$(
    curl -sS -m 6 -o "$out_file" -w '%{http_code} %{time_total}' "$url" \
      2>"$ARTIFACT_DIR/http-${name}.err"
  )"; then
    add_now "P1 ${name} unreachable (${url})"
    mark_p1
    add_next "Check ${name} service/container and LAN route"
    return
  fi

  # result is "<http_code> <time_total>".
  code="${result%% *}"
  ttotal="${result##* }"

  if [[ "$code" =~ $expected_regex ]]; then
    add_watch "P4 ${name} OK (HTTP ${code}, ${ttotal}s)"
  else
    add_watch "P2 ${name} unexpected response (HTTP ${code}, ${ttotal}s)"
    mark_p2
    add_next "Validate ${name} endpoint/health semantics"
  fi
}

# 1) OpenClaw health + security
if "$OPENCLAW_BIN" health --json \
  >"$ARTIFACT_DIR/openclaw-health.json" 2>"$ARTIFACT_DIR/openclaw-health.err"; then
  if jq -e '.ok == true' "$ARTIFACT_DIR/openclaw-health.json" >/dev/null 2>&1; then
    add_watch "P4 OpenClaw gateway health OK"
  else
    add_now "P1 OpenClaw health reported not-ok"
    mark_p1
    add_next "Run: openclaw health --json"
  fi
else
  add_now "P1 Failed to run openclaw health"
  mark_p1
  add_next "Run: openclaw status && openclaw logs --follow"
fi

if "$OPENCLAW_BIN" security audit --json \
  >"$ARTIFACT_DIR/openclaw-security-audit.json" \
  2>"$ARTIFACT_DIR/openclaw-security-audit.err"; then
  # A jq failure falls back to 0, i.e. is treated as "no findings".
  sec_critical="$(jq -r '.summary.critical // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || echo 0)"
  sec_warn="$(jq -r '.summary.warn // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || echo 0)"

  if [[ "$sec_critical" =~ ^[0-9]+$ ]] && (( sec_critical > 0 )); then
    add_now "P1 Security audit has ${sec_critical} critical finding(s)"
    mark_p1
    add_next "Run: openclaw security audit --deep"
  fi
  if [[ "$sec_warn" =~ ^[0-9]+$ ]] && (( sec_warn > 0 )); then
    add_watch "P2 Security audit has ${sec_warn} warning(s)"
    mark_p2
    add_next "Review plugin/tool policy allowlists"
  fi
else
  add_watch "P3 Security audit command failed"
  add_next "Run: openclaw security audit --json"
fi

if "$OPENCLAW_BIN" update status \
  >"$ARTIFACT_DIR/openclaw-update-status.txt" \
  2>"$ARTIFACT_DIR/openclaw-update-status.err"; then
  if grep -Eqi 'update available|outdated|new version|available:' \
    "$ARTIFACT_DIR/openclaw-update-status.txt"; then
    add_watch "P2 OpenClaw update available"
    mark_p2
    add_next "Review release notes, then run safe update workflow"
  else
    add_watch "P4 OpenClaw update status checked"
  fi
else
  add_watch "P3 OpenClaw update status check failed"
  add_next "Run: openclaw update status"
fi

# 2) Backup freshness from minio backup log
if [[ -f "$BACKUP_LOG" ]]; then
  # Take the last "Backup complete:" line and reduce it to its
  # YYYYMMDDTHHMMSSZ key. `|| true` keeps an empty grep result from aborting
  # the script under `set -e -o pipefail`.
  last_key="$(
    grep 'Backup complete:' "$BACKUP_LOG" \
      | tail -n1 \
      | sed -E 's#.*workspace-backups/([0-9]{8}T[0-9]{6}Z).*#\1#' \
      || true
  )"
  if [[ -n "$last_key" ]]; then
    # Re-slice the compact key into "YYYY-MM-DD HH:MM:SS UTC" for `date -d`.
    # If sed did not match, last_key is the raw log line; the slices are then
    # garbage, `date` fails and the epoch falls back to 0 (handled below).
    backup_iso="${last_key:0:4}-${last_key:4:2}-${last_key:6:2} ${last_key:9:2}:${last_key:11:2}:${last_key:13:2} UTC"
    backup_epoch="$(date -u -d "$backup_iso" +%s 2>/dev/null || echo 0)"
    now_epoch="$(date -u +%s)"

    if (( backup_epoch > 0 )); then
      age_hours=$(( (now_epoch - backup_epoch) / 3600 ))
      if (( age_hours > BACKUP_MAX_AGE_HOURS )); then
        add_now "P1 Backup stale: last success ${age_hours}h ago (${last_key})"
        mark_p1
        add_next "Run backup job now and verify new 'Backup complete' entry"
      elif (( age_hours >= BACKUP_MAX_AGE_HOURS - 1 )); then
        add_soon "P2 Backup nearing threshold: ${age_hours}h old (${last_key})"
        mark_p2
      else
        add_watch "P4 Backup fresh (${age_hours}h old, ${last_key})"
      fi
    else
      add_now "P1 Could not parse backup timestamp from ${BACKUP_LOG}"
      mark_p1
      add_next "Inspect backup log format or backup script output"
    fi
  else
    add_now "P1 No 'Backup complete' entry found in ${BACKUP_LOG}"
    mark_p1
    add_next "Run backup and confirm completion line is written"
  fi
else
  add_now "P1 Backup log missing: ${BACKUP_LOG}"
  mark_p1
  add_next "Create backup log or fix backup script path"
fi

# 3) systemd service health (local model runtimes)
if [[ "${SYSTEMD_LOCAL_CHECKS}" == "true" ]]; then
  if command -v systemctl >/dev/null 2>&1; then
    for unit in "$OLLAMA_SERVICE" "$LLAMA_CPP_SERVICE"; do
      if systemctl is-active --quiet "$unit"; then
        add_watch "P4 systemd service active: ${unit}"
      else
        # An inactive unit is only P1 when its *_REQUIRED flag is "true".
        if [[ "$unit" == "$OLLAMA_SERVICE" && "${OLLAMA_REQUIRED}" != "true" ]]; then
          add_watch "P3 systemd service inactive: ${unit} (optional)"
        elif [[ "$unit" == "$LLAMA_CPP_SERVICE" && "${LLAMA_CPP_REQUIRED}" != "true" ]]; then
          add_watch "P3 systemd service inactive: ${unit} (optional)"
        else
          add_now "P1 systemd service inactive: ${unit}"
          mark_p1
          add_next "Check unit: systemctl status ${unit}"
        fi
      fi
    done
  else
    add_watch "P3 systemctl not available"
  fi
else
  add_watch "P3 Skipping local systemd checks (SYSTEMD_LOCAL_CHECKS=false)"
fi

# 4) Docker service health (containers + health state)
# NOTE(review): `docker ps` is called without -a; if that lists running
# containers only, the 'exited'/'dead' keywords below may never match and a
# stopped container is reported as "not found" - confirm whether -a is wanted.
if command -v docker >/dev/null 2>&1; then
  if docker ps --format '{{.Names}}\t{{.Status}}' \
    >"$ARTIFACT_DIR/docker-ps.txt" 2>"$ARTIFACT_DIR/docker-ps.err"; then
    # Intentionally unquoted: the setting is a whitespace-separated name list.
    # shellcheck disable=SC2086
    for svc in $DOCKER_EXPECTED_CONTAINERS; do
      # Compare the name column literally (a '.' in a container name must not
      # act as a regex wildcard) and keep only the status column, so that the
      # health keywords are never matched against the container name itself.
      if svc_status="$(
        awk -F'\t' -v name="$svc" '
          $1 == name { print $2; found = 1; exit }
          END { exit !found }
        ' "$ARTIFACT_DIR/docker-ps.txt"
      )"; then
        if grep -qiE 'unhealthy|dead|exited|restarting' <<<"$svc_status"; then
          add_now "P1 Docker container ${svc} unhealthy (${svc_status})"
          mark_p1
          add_next "Check logs: docker logs --tail=200 ${svc}"
        else
          add_watch "P4 Docker container ${svc} running"
        fi
      else
        if [[ "${DOCKER_REQUIRE_EXPECTED}" == "true" ]]; then
          add_now "P1 Docker container missing: ${svc}"
          mark_p1
          add_next "Start or restore container: ${svc}"
        else
          add_watch "P3 Docker container not found locally: ${svc} (may run on another host)"
        fi
      fi
    done
  else
    add_watch "P3 docker ps failed"
    add_next "Check Docker daemon permissions/state"
  fi
else
  add_watch "P3 docker CLI not available"
fi

# 5) Key LAN + local service probes
http_probe "searxng" "$SEARX_URL" '^200$'
http_probe "whisper" "$WHISPER_URL" '^200$'
http_probe "brave-mcp" "$MCP_URL" '^(200|406)$'
http_probe "minio-live" "${MINIO_URL%/}/minio/health/live" '^200$'

# llama.cpp: any HTTP status other than curl's "000" placeholder counts as
# responsive. If the configured URL gives no answer, retry with the trailing
# "/health" stripped before declaring the endpoint unreachable.
llama_code="$(
  curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-health.txt" -w '%{http_code}' "$LLAMA_CPP_URL" \
    2>"$ARTIFACT_DIR/llamacpp-health.err" || true
)"
if [[ -n "$llama_code" && "$llama_code" != "000" ]]; then
  add_watch "P4 llama.cpp endpoint responsive (${LLAMA_CPP_URL}, HTTP ${llama_code})"
else
  llama_root="${LLAMA_CPP_URL%/health}"
  llama_code_root="$(
    curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-root.txt" -w '%{http_code}' "$llama_root" \
      2>"$ARTIFACT_DIR/llamacpp-root.err" || true
  )"
  if [[ -n "$llama_code_root" && "$llama_code_root" != "000" ]]; then
    add_watch "P4 llama.cpp host reachable (${llama_root}, HTTP ${llama_code_root})"
  else
    if [[ "${LLAMA_CPP_REQUIRED}" == "true" ]]; then
      add_now "P1 llama.cpp endpoint unreachable (${LLAMA_CPP_URL})"
      mark_p1
      add_next "Check llama.cpp bind address/port and service logs"
    else
      add_watch "P3 llama.cpp endpoint unreachable at configured URL (${LLAMA_CPP_URL})"
    fi
  fi
fi

# 6) Ollama embeddings availability + target model
if curl -sS -m 6 "${OLLAMA_URL%/}/api/tags" \
  >"$ARTIFACT_DIR/ollama-tags.json" 2>"$ARTIFACT_DIR/ollama-tags.err"; then
  # jq -e exits non-zero when no entry of .models has the exact model name.
  if jq -e --arg model "$OLLAMA_EMBED_MODEL" '.models[]? | select(.name == $model)' \
    "$ARTIFACT_DIR/ollama-tags.json" >/dev/null 2>&1; then
    add_watch "P4 Ollama up; embedding model present (${OLLAMA_EMBED_MODEL})"
  else
    add_soon "P2 Ollama up but embedding model missing (${OLLAMA_EMBED_MODEL})"
    mark_p2
    add_next "Pull model: ollama pull ${OLLAMA_EMBED_MODEL}"
  fi
else
  if [[ "${OLLAMA_REQUIRED}" == "true" ]]; then
    add_now "P1 Ollama unreachable (${OLLAMA_URL})"
    mark_p1
    add_next "Check Ollama service and port (default 11434)"
  else
    add_watch "P3 Ollama unreachable at configured URL (${OLLAMA_URL}); set OLLAMA_URL if remote"
  fi
fi

# 7) Host pressure: disk + memory
# Column 5 of the second `df -P /` line, with the '%' sign removed.
root_disk_pct="$(df -P / | awk 'NR==2 {gsub(/%/,"",$5); print $5}' 2>/dev/null || echo 0)"
if [[ "$root_disk_pct" =~ ^[0-9]+$ ]]; then
  if (( root_disk_pct >= 95 )); then
    add_now "P1 Root disk critical: ${root_disk_pct}% used"
    mark_p1
    add_next "Free disk space urgently"
  elif (( root_disk_pct >= WARN_DISK_PCT )); then
    add_soon "P2 Root disk high: ${root_disk_pct}% used"
    mark_p2
    add_next "Prune logs/artifacts and monitor growth"
  else
    add_watch "P4 Root disk normal: ${root_disk_pct}% used"
  fi
else
  # Surface a skipped check instead of silently reporting nothing.
  add_watch "P3 Could not determine root disk usage"
fi

if [[ -r /proc/meminfo ]]; then
  mem_total_kb="$(awk '/MemTotal:/ {print $2}' /proc/meminfo)"
  mem_avail_kb="$(awk '/MemAvailable:/ {print $2}' /proc/meminfo)"
  if [[ "$mem_total_kb" =~ ^[0-9]+$ ]] \
    && [[ "$mem_avail_kb" =~ ^[0-9]+$ ]] \
    && (( mem_total_kb > 0 )); then
    # Used share = (total - available) / total, as an integer percentage.
    mem_used_pct=$(( (100 * (mem_total_kb - mem_avail_kb)) / mem_total_kb ))
    if (( mem_used_pct >= 95 )); then
      add_now "P1 Memory pressure critical: ${mem_used_pct}% used"
      mark_p1
      add_next "Inspect heavy processes / reduce workload"
    elif (( mem_used_pct >= WARN_MEM_PCT )); then
      add_soon "P2 Memory pressure high: ${mem_used_pct}% used"
      mark_p2
      add_next "Check workload spikes and tune limits"
    else
      add_watch "P4 Memory normal: ${mem_used_pct}% used"
    fi
  else
    add_watch "P3 Could not parse memory usage from /proc/meminfo"
  fi
else
  add_watch "P3 Memory check skipped (/proc/meminfo not readable)"
fi

# Verdict: any P1 outranks any number of P2s.
VERDICT="OK"
EXIT_CODE=0
if (( P1 > 0 )); then
  VERDICT="NEEDS_ATTENTION"
  EXIT_CODE=2
elif (( P2 > 0 )); then
  VERDICT="MONITOR"
  EXIT_CODE=1
fi

# Print the summary and keep a copy next to the raw artifacts. Each array is
# length-checked before it is expanded, so an empty bucket is never expanded
# under `set -u`.
{
  echo "Verdict: ${VERDICT}"
  echo "Counts: p1=${P1} p2=${P2}"
  echo "Artifact path: ${ARTIFACT_DIR}"
  echo

  echo "Now:"
  if (( ${#NOW[@]} == 0 )); then
    echo "- P4 Nothing urgent"
  else
    for x in "${NOW[@]}"; do
      echo "- ${x}"
    done
  fi
  echo

  echo "Soon:"
  if (( ${#SOON[@]} == 0 )); then
    echo "- P4 No near-term risks"
  else
    for x in "${SOON[@]}"; do
      echo "- ${x}"
    done
  fi
  echo

  echo "Watch:"
  if (( ${#WATCH[@]} == 0 )); then
    echo "- P4 No watch items"
  else
    for x in "${WATCH[@]}"; do
      echo "- ${x}"
    done
  fi
  echo

  echo "Next actions:"
  if (( ${#NEXT[@]} == 0 )); then
    echo "- Keep current cadence"
  else
    # Drop duplicate actions while preserving first-seen order.
    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
  fi
} | tee "$ARTIFACT_DIR/summary.txt"

exit "$EXIT_CODE"