#!/usr/bin/env bash
set -euo pipefail

# Lightweight operational snapshot for OpenClaw homelab.
# Output frame: Now / Soon / Watch / Next actions
#
# Every setting below is a default that can be overridden from the
# environment. Raw command output collected during the run is written under
# ARTIFACT_DIR, a per-run directory named after the UTC date and time.

OPENCLAW_BIN="${OPENCLAW_BIN:-openclaw}"
BACKUP_LOG="${BACKUP_LOG:-memory/minio-backup.log}"
BACKUP_MAX_AGE_HOURS="${BACKUP_MAX_AGE_HOURS:-8}"
SEARX_URL="${SEARX_URL:-http://192.168.153.113:18803}"
WHISPER_URL="${WHISPER_URL:-http://192.168.153.113:18801}"
MCP_URL="${MCP_URL:-http://192.168.153.113:18802/mcp}"
OLLAMA_URL="${OLLAMA_URL:-http://192.168.153.113:18807}"
OLLAMA_EMBED_MODEL="${OLLAMA_EMBED_MODEL:-nomic-embed-text:latest}"
OLLAMA_REQUIRED="${OLLAMA_REQUIRED:-true}"
OLLAMA_SERVICE="${OLLAMA_SERVICE:-ollama.service}"
LLAMA_CPP_SERVICE="${LLAMA_CPP_SERVICE:-llama-server.service}"
LLAMA_CPP_URL="${LLAMA_CPP_URL:-http://192.168.153.113:18806/health}"
LLAMA_CPP_REQUIRED="${LLAMA_CPP_REQUIRED:-true}"
SYSTEMD_LOCAL_CHECKS="${SYSTEMD_LOCAL_CHECKS:-false}"
MINIO_URL="${MINIO_URL:-http://192.168.153.253:9000}"
DOCKER_EXPECTED_CONTAINERS="${DOCKER_EXPECTED_CONTAINERS:-searxng whisper-server brave-search}"
DOCKER_REQUIRE_EXPECTED="${DOCKER_REQUIRE_EXPECTED:-false}"
WARN_DISK_PCT="${WARN_DISK_PCT:-85}"
WARN_MEM_PCT="${WARN_MEM_PCT:-85}"

# The numeric thresholds are compared inside (( ... )) later in the script.
# Reject anything that is not a plain non-negative integer up front (a leading
# zero would be read as octal) instead of failing mid-run. Exit status 64 keeps
# this configuration error distinct from the 0/1/2 verdict exit codes.
int_re='^(0|[1-9][0-9]*)$'
for cfg_name in BACKUP_MAX_AGE_HOURS WARN_DISK_PCT WARN_MEM_PCT; do
  if [[ ! "${!cfg_name}" =~ $int_re ]]; then
    printf 'healthcheck: %s must be a non-negative integer (got: %s)\n' \
      "$cfg_name" "${!cfg_name}" >&2
    exit 64
  fi
done
unset cfg_name int_re

# Per-run artifact directory: <base>/<YYYY-MM-DD>/<HHMMSS>, both parts in UTC.
TS_DAY="$(date -u +%F)"
TS_STAMP="$(date -u +%H%M%S)"
ARTIFACT_DIR="${HEALTHCHECK_OUTPUT_DIR:-/tmp/openclaw-healthcheck}/${TS_DAY}/${TS_STAMP}"
mkdir -p "$ARTIFACT_DIR"
# Findings collected during the run, one array per section of the report.
NOW=()
SOON=()
WATCH=()
NEXT=()

# Running counts of P1 and P2 findings; the final verdict is derived from them.
P1=0
P2=0
# Recorders: append the message in $1 to the matching report section.
add_now() { NOW+=("$1"); }
add_soon() { SOON+=("$1"); }
add_watch() { WATCH+=("$1"); }
add_next() { NEXT+=("$1"); }

# Severity counters: bump the number of P1 / P2 findings by one.
mark_p1() { P1=$((P1 + 1)); }
mark_p2() { P2=$((P2 + 1)); }
#######################################
# Probe an HTTP endpoint with curl and record the outcome as a finding.
# Globals:   ARTIFACT_DIR (read)
# Arguments: $1 - short probe name; used in messages and artifact file names
#            $2 - URL to request
#            $3 - extended regex that the HTTP status code must match
#            $4 - optional curl total timeout in seconds (default: 6)
# Outputs:   response body -> $ARTIFACT_DIR/http-<name>.txt
#            curl stderr   -> $ARTIFACT_DIR/http-<name>.err
# Returns:   0; results are reported through add_now/add_watch/add_next and
#            mark_p1/mark_p2 rather than through the exit status.
#######################################
http_probe() {
  local name="$1" url="$2" expected_regex="$3" timeout="${4:-6}"
  local out_file="$ARTIFACT_DIR/http-${name}.txt"
  local result code ttotal

  # curl prints "<status> <seconds>" on stdout; a non-zero curl exit means no
  # usable HTTP response was obtained.
  if ! result="$(curl -sS -m "$timeout" -o "$out_file" \
    -w '%{http_code} %{time_total}' "$url" \
    2>"$ARTIFACT_DIR/http-${name}.err")"; then
    add_now "P1 ${name} unreachable (${url})"
    mark_p1
    add_next "Check ${name} service/container and LAN route"
    return 0
  fi

  code="${result%% *}"   # text before the first space: HTTP status
  ttotal="${result##* }" # text after the last space: total time
  if [[ "$code" =~ $expected_regex ]]; then
    add_watch "P4 ${name} OK (HTTP ${code}, ${ttotal}s)"
  else
    add_watch "P2 ${name} unexpected response (HTTP ${code}, ${ttotal}s)"
    mark_p2
    add_next "Validate ${name} endpoint/health semantics"
  fi
}
# 1) OpenClaw health + security
# Gateway health: the command must succeed AND its JSON must carry ok == true.
# stdout/stderr are kept as artifacts for later inspection.
if "$OPENCLAW_BIN" health --json >"$ARTIFACT_DIR/openclaw-health.json" 2>"$ARTIFACT_DIR/openclaw-health.err"; then
  if jq -e '.ok == true' "$ARTIFACT_DIR/openclaw-health.json" >/dev/null 2>&1; then
    add_watch "P4 OpenClaw gateway health OK"
  else
    # Also reached when jq cannot parse the file (jq's errors are discarded).
    add_now "P1 OpenClaw health reported not-ok"
    mark_p1
    add_next "Run: openclaw health --json"
  fi
else
  add_now "P1 Failed to run openclaw health"
  mark_p1
  add_next "Run: openclaw status && openclaw logs --follow"
fi
# Security audit: read the critical/warn counts from the JSON summary.
if "$OPENCLAW_BIN" security audit --json >"$ARTIFACT_DIR/openclaw-security-audit.json" 2>"$ARTIFACT_DIR/openclaw-security-audit.err"; then
  # No "0" fallback when jq fails: an unreadable report must not be mistaken
  # for a report with zero findings. A count missing from valid JSON still
  # defaults to 0 through the jq `// 0` alternative.
  sec_critical="$(jq -r '.summary.critical // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || true)"
  sec_warn="$(jq -r '.summary.warn // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || true)"
  sec_unparsed="false"

  # The two counts are evaluated independently so that a bad value in one
  # does not hide findings in the other.
  if [[ "$sec_critical" =~ ^[0-9]+$ ]]; then
    if (( sec_critical > 0 )); then
      add_now "P1 Security audit has ${sec_critical} critical finding(s)"
      mark_p1
      add_next "Run: openclaw security audit --deep"
    fi
  else
    sec_unparsed="true"
  fi

  if [[ "$sec_warn" =~ ^[0-9]+$ ]]; then
    if (( sec_warn > 0 )); then
      add_watch "P2 Security audit has ${sec_warn} warning(s)"
      mark_p2
      add_next "Review plugin/tool policy allowlists"
    fi
  else
    sec_unparsed="true"
  fi

  if [[ "$sec_unparsed" == "true" ]]; then
    add_watch "P3 Security audit output could not be parsed"
    add_next "Run: openclaw security audit --json"
  elif (( sec_critical == 0 && sec_warn == 0 )); then
    # Emit an explicit OK line so the report shows that the audit ran.
    add_watch "P4 Security audit clean"
  fi
else
  add_watch "P3 Security audit command failed"
  add_next "Run: openclaw security audit --json"
fi
# Update status: the command prints free text, so look for any of a few
# case-insensitive phrases that indicate a newer version exists.
if "$OPENCLAW_BIN" update status >"$ARTIFACT_DIR/openclaw-update-status.txt" 2>"$ARTIFACT_DIR/openclaw-update-status.err"; then
  if grep -Eqi 'update available|outdated|new version|available:' "$ARTIFACT_DIR/openclaw-update-status.txt"; then
    add_watch "P2 OpenClaw update available"
    mark_p2
    add_next "Review release notes, then run safe update workflow"
  else
    add_watch "P4 OpenClaw update status checked"
  fi
else
  add_watch "P3 OpenClaw update status check failed"
  add_next "Run: openclaw update status"
fi
# 2) Backup freshness from minio backup log
if [[ -f "$BACKUP_LOG" ]]; then
  # Take the last "Backup complete:" line and extract the timestamp key that
  # follows "workspace-backups/" (format YYYYMMDDTHHMMSSZ). `|| true` keeps a
  # log without any completion line from aborting the script under
  # `set -e -o pipefail`; last_key is then empty.
  last_key="$(grep 'Backup complete:' "$BACKUP_LOG" | tail -n1 | sed -E 's#.*workspace-backups/([0-9]{8}T[0-9]{6}Z).*#\1#' || true)"
  if [[ -n "$last_key" ]]; then
    # sed passes a non-matching line through unchanged, so confirm that we
    # really hold a timestamp key before slicing it into a date string.
    # backup_epoch stays 0 (treated as "could not parse") otherwise.
    backup_epoch=0
    if [[ "$last_key" =~ ^[0-9]{8}T[0-9]{6}Z$ ]]; then
      backup_iso="${last_key:0:4}-${last_key:4:2}-${last_key:6:2} ${last_key:9:2}:${last_key:11:2}:${last_key:13:2} UTC"
      backup_epoch="$(date -u -d "$backup_iso" +%s 2>/dev/null || echo 0)"
    fi
    now_epoch="$(date -u +%s)"
    if (( backup_epoch > 0 )); then
      # Whole hours since the last successful backup (integer division).
      age_hours=$(( (now_epoch - backup_epoch) / 3600 ))
      if (( age_hours > BACKUP_MAX_AGE_HOURS )); then
        add_now "P1 Backup stale: last success ${age_hours}h ago (${last_key})"
        mark_p1
        add_next "Run backup job now and verify new 'Backup complete' entry"
      elif (( age_hours >= BACKUP_MAX_AGE_HOURS - 1 )); then
        # Within one hour of the limit: warn before it turns stale.
        add_soon "P2 Backup nearing threshold: ${age_hours}h old (${last_key})"
        mark_p2
      else
        add_watch "P4 Backup fresh (${age_hours}h old, ${last_key})"
      fi
    else
      add_now "P1 Could not parse backup timestamp from ${BACKUP_LOG}"
      mark_p1
      add_next "Inspect backup log format or backup script output"
    fi
  else
    add_now "P1 No 'Backup complete' entry found in ${BACKUP_LOG}"
    mark_p1
    add_next "Run backup and confirm completion line is written"
  fi
else
  add_now "P1 Backup log missing: ${BACKUP_LOG}"
  mark_p1
  add_next "Create backup log or fix backup script path"
fi
# 3) systemd service health (local model runtimes)
# Runs only when SYSTEMD_LOCAL_CHECKS=true and systemctl is on PATH.
if [[ "${SYSTEMD_LOCAL_CHECKS}" == "true" ]]; then
  if command -v systemctl >/dev/null 2>&1; then
    for unit in "$OLLAMA_SERVICE" "$LLAMA_CPP_SERVICE"; do
      if systemctl is-active --quiet "$unit"; then
        add_watch "P4 systemd service active: ${unit}"
      else
        # An inactive unit is P1 unless its matching *_REQUIRED flag is set
        # to something other than "true", in which case it is informational.
        if [[ "$unit" == "$OLLAMA_SERVICE" && "${OLLAMA_REQUIRED}" != "true" ]]; then
          add_watch "P3 systemd service inactive: ${unit} (optional)"
        elif [[ "$unit" == "$LLAMA_CPP_SERVICE" && "${LLAMA_CPP_REQUIRED}" != "true" ]]; then
          add_watch "P3 systemd service inactive: ${unit} (optional)"
        else
          add_now "P1 systemd service inactive: ${unit}"
          mark_p1
          add_next "Check unit: systemctl status ${unit}"
        fi
      fi
    done
  else
    add_watch "P3 systemctl not available"
  fi
else
  add_watch "P3 Skipping local systemd checks (SYSTEMD_LOCAL_CHECKS=false)"
fi
# 4) Docker service health (containers + health state)
# Each name in DOCKER_EXPECTED_CONTAINERS (whitespace-separated) is looked up
# in the output of `docker ps`, captured as "<name><TAB><status>" lines.
if command -v docker >/dev/null 2>&1; then
  if docker ps --format '{{.Names}}\t{{.Status}}' >"$ARTIFACT_DIR/docker-ps.txt" 2>"$ARTIFACT_DIR/docker-ps.err"; then
    # Split the list into an array without glob expansion. read returns
    # non-zero at end of input when -d '' is used, hence the `|| true`.
    expected_containers=()
    IFS=$' \t\n' read -r -d '' -a expected_containers <<<"$DOCKER_EXPECTED_CONTAINERS" || true

    # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
    # older bash versions.
    for svc in ${expected_containers[@]+"${expected_containers[@]}"}; do
      # Exact, literal match on the name column; the rest of the line is the
      # status text.
      docker_found="false"
      docker_status=""
      while read -r cname cstatus || [[ -n "$cname" ]]; do
        if [[ "$cname" == "$svc" ]]; then
          docker_found="true"
          docker_status="$cstatus"
          break
        fi
      done <"$ARTIFACT_DIR/docker-ps.txt"

      if [[ "$docker_found" == "true" ]]; then
        # Only the status text is inspected, so a container whose *name*
        # contains one of these words is not misreported.
        # NOTE(review): plain `docker ps` lists running containers only, so
        # "exited"/"dead" states likely never show up here — confirm whether
        # `docker ps -a` is wanted.
        if grep -qiE 'unhealthy|dead|exited|restarting' <<<"$docker_status"; then
          add_now "P1 Docker container ${svc} unhealthy (${docker_status})"
          mark_p1
          add_next "Check logs: docker logs --tail=200 ${svc}"
        else
          add_watch "P4 Docker container ${svc} running"
        fi
      else
        if [[ "${DOCKER_REQUIRE_EXPECTED}" == "true" ]]; then
          add_now "P1 Docker container missing: ${svc}"
          mark_p1
          add_next "Start or restore container: ${svc}"
        else
          add_watch "P3 Docker container not found locally: ${svc} (may run on another host)"
        fi
      fi
    done
  else
    add_watch "P3 docker ps failed"
    add_next "Check Docker daemon permissions/state"
  fi
else
  add_watch "P3 docker CLI not available"
fi
# 5) Key LAN + local service probes
# Arguments: probe name, URL, regex of acceptable HTTP status codes.
http_probe "searxng" "$SEARX_URL" '^200$'
http_probe "whisper" "$WHISPER_URL" '^200$'
# The MCP endpoint is accepted with either 200 or 406.
http_probe "brave-mcp" "$MCP_URL" '^(200|406)$'
# Strip one trailing slash from MINIO_URL before appending the health path.
http_probe "minio-live" "${MINIO_URL%/}/minio/health/live" '^200$'
# llama.cpp reachability. Any HTTP status counts as "responsive"; only an
# empty result or "000" (no HTTP status obtained) is treated as unreachable.
# `|| true` keeps a curl failure from aborting the script under `set -e`.
llama_code="$(curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-health.txt" -w '%{http_code}' "$LLAMA_CPP_URL" 2>"$ARTIFACT_DIR/llamacpp-health.err" || true)"
if [[ -n "$llama_code" && "$llama_code" != "000" ]]; then
  add_watch "P4 llama.cpp endpoint responsive (${LLAMA_CPP_URL}, HTTP ${llama_code})"
else
  # Fall back to the same URL with a trailing "/health" removed, to tell
  # "host reachable" apart from "nothing answering at all".
  llama_root="${LLAMA_CPP_URL%/health}"
  llama_code_root="$(curl -sS -m 6 -o "$ARTIFACT_DIR/llamacpp-root.txt" -w '%{http_code}' "$llama_root" 2>"$ARTIFACT_DIR/llamacpp-root.err" || true)"
  if [[ -n "$llama_code_root" && "$llama_code_root" != "000" ]]; then
    add_watch "P4 llama.cpp host reachable (${llama_root}, HTTP ${llama_code_root})"
  else
    if [[ "${LLAMA_CPP_REQUIRED}" == "true" ]]; then
      add_now "P1 llama.cpp endpoint unreachable (${LLAMA_CPP_URL})"
      mark_p1
      add_next "Check llama.cpp bind address/port and service logs"
    else
      add_watch "P3 llama.cpp endpoint unreachable at configured URL (${LLAMA_CPP_URL})"
    fi
  fi
fi
# 6) Ollama embeddings availability + target model
# -f makes curl fail on HTTP error statuses (>= 400), so an error page is not
# mistaken for a working Ollama that merely lacks the model.
if curl -fsS -m 6 "${OLLAMA_URL%/}/api/tags" >"$ARTIFACT_DIR/ollama-tags.json" 2>"$ARTIFACT_DIR/ollama-tags.err"; then
  # jq -e exits non-zero when no entry of .models has exactly this name.
  if jq -e --arg model "$OLLAMA_EMBED_MODEL" '.models[]? | select(.name == $model)' "$ARTIFACT_DIR/ollama-tags.json" >/dev/null 2>&1; then
    add_watch "P4 Ollama up; embedding model present (${OLLAMA_EMBED_MODEL})"
  else
    add_soon "P2 Ollama up but embedding model missing (${OLLAMA_EMBED_MODEL})"
    mark_p2
    add_next "Pull model: ollama pull ${OLLAMA_EMBED_MODEL}"
  fi
else
  if [[ "${OLLAMA_REQUIRED}" == "true" ]]; then
    add_now "P1 Ollama unreachable (${OLLAMA_URL})"
    mark_p1
    add_next "Check Ollama service and port (default 11434)"
  else
    add_watch "P3 Ollama unreachable at configured URL (${OLLAMA_URL}); set OLLAMA_URL if remote"
  fi
fi
# 7) Host pressure: disk + memory
# Use% of the root filesystem: 5th column of the second line of `df -P /`,
# with the percent sign removed. No numeric fallback on failure: a df error
# must not be reported as "0% used". df's stderr is kept as an artifact.
root_disk_pct="$(df -P / 2>"$ARTIFACT_DIR/df-root.err" | awk 'NR==2 {gsub(/%/,"",$5); print $5}' || true)"
if [[ "$root_disk_pct" =~ ^[0-9]+$ ]]; then
  if (( root_disk_pct >= 95 )); then
    add_now "P1 Root disk critical: ${root_disk_pct}% used"
    mark_p1
    add_next "Free disk space urgently"
  elif (( root_disk_pct >= WARN_DISK_PCT )); then
    add_soon "P2 Root disk high: ${root_disk_pct}% used"
    mark_p2
    add_next "Prune logs/artifacts and monitor growth"
  else
    add_watch "P4 Root disk normal: ${root_disk_pct}% used"
  fi
else
  add_watch "P3 Root disk usage could not be determined"
  add_next "Run: df -P /"
fi
# Memory pressure from /proc/meminfo: used% = (MemTotal - MemAvailable) /
# MemTotal, in integer arithmetic. The check is skipped without a finding
# when the file is unreadable or either value is missing or non-numeric.
if [[ -r /proc/meminfo ]]; then
  mem_total_kb="$(awk '/MemTotal:/ {print $2}' /proc/meminfo)"
  mem_avail_kb="$(awk '/MemAvailable:/ {print $2}' /proc/meminfo)"
  if [[ "$mem_total_kb" =~ ^[0-9]+$ ]] && [[ "$mem_avail_kb" =~ ^[0-9]+$ ]] && (( mem_total_kb > 0 )); then
    mem_used_pct=$(( (100 * (mem_total_kb - mem_avail_kb)) / mem_total_kb ))
    if (( mem_used_pct >= 95 )); then
      add_now "P1 Memory pressure critical: ${mem_used_pct}% used"
      mark_p1
      add_next "Inspect heavy processes / reduce workload"
    elif (( mem_used_pct >= WARN_MEM_PCT )); then
      add_soon "P2 Memory pressure high: ${mem_used_pct}% used"
      mark_p2
      add_next "Check workload spikes and tune limits"
    else
      add_watch "P4 Memory normal: ${mem_used_pct}% used"
    fi
  fi
fi
# Overall verdict and process exit status:
#   any P1 finding  -> NEEDS_ATTENTION, exit 2
#   else any P2     -> MONITOR,         exit 1
#   otherwise       -> OK,              exit 0
VERDICT="OK"
EXIT_CODE=0
if (( P1 > 0 )); then
  VERDICT="NEEDS_ATTENTION"
  EXIT_CODE=2
elif (( P2 > 0 )); then
  VERDICT="MONITOR"
  EXIT_CODE=1
fi
# Print the report to stdout and keep a copy in the artifact directory.
# Each section prints a placeholder line when it has no entries; the array
# length is checked before expanding so empty arrays are never expanded.
{
  echo "Verdict: ${VERDICT}"
  echo "Counts: p1=${P1} p2=${P2}"
  echo "Artifact path: ${ARTIFACT_DIR}"
  echo

  echo "Now:"
  if (( ${#NOW[@]} == 0 )); then
    echo "- P4 Nothing urgent"
  else
    for x in "${NOW[@]}"; do echo "- ${x}"; done
  fi
  echo

  echo "Soon:"
  if (( ${#SOON[@]} == 0 )); then
    echo "- P4 No near-term risks"
  else
    for x in "${SOON[@]}"; do echo "- ${x}"; done
  fi
  echo

  echo "Watch:"
  if (( ${#WATCH[@]} == 0 )); then
    echo "- P4 No watch items"
  else
    for x in "${WATCH[@]}"; do echo "- ${x}"; done
  fi
  echo

  echo "Next actions:"
  if (( ${#NEXT[@]} == 0 )); then
    echo "- Keep current cadence"
  else
    # Drop repeated actions, keeping the first occurrence of each in order.
    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
  fi
} | tee "$ARTIFACT_DIR/summary.txt"

exit "$EXIT_CODE"