feat(scripts): add ops and mcp sentinel automation scripts

2026-03-05 02:17:32 +00:00
parent d31bb80f04
commit ca65f245a3
2 changed files with 439 additions and 0 deletions
@@ -0,0 +1,242 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# MCP smoke test: a lightweight probe for MCP servers exposed over HTTP.
+# Unless overridden, it targets the local Brave MCP server.
+#
+# Each setting below keeps a non-empty value already present in the
+# environment and otherwise falls back to the default shown here.
+
+: "${MCP_URL:=http://192.168.153.113:18802/mcp}"
+: "${TIMEOUT_SEC:=10}"
+: "${BASELINE_FILE:=memory/mcp-smoke-tools-baseline.txt}"
+: "${PROBE_QUERY:=openclaw}"
+
+# Mode switches; both start disabled.
+SKIP_TOOL_CALL=0
+UPDATE_BASELINE=0
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --url)
+      MCP_URL="$2"; shift 2 ;;
+    --timeout)
+      TIMEOUT_SEC="$2"; shift 2 ;;
+    --baseline)
+      BASELINE_FILE="$2"; shift 2 ;;
+    --query)
+      PROBE_QUERY="$2"; shift 2 ;;
+    --update-baseline)
+      UPDATE_BASELINE=1; shift ;;
+    --skip-tool-call)
+      SKIP_TOOL_CALL=1; shift ;;
+    -h|--help)
+      cat <<EOF
+Usage: $(basename "$0") [options]
+  --url <mcp_url>           MCP endpoint (default: ${MCP_URL})
+  --timeout <seconds>       Curl timeout (default: ${TIMEOUT_SEC})
+  --baseline <path>         Baseline tool-name file (default: ${BASELINE_FILE})
+  --query <text>            Query used for brave_web_search probe (default: ${PROBE_QUERY})
+  --skip-tool-call          Skip tools/call probe
+  --update-baseline         Save current tool names as baseline
+EOF
+      exit 0 ;;
+    *)
+      echo "Unknown arg: $1" >&2
+      exit 2 ;;
+  esac
+done
+
+TS_DAY="$(date -u +%F)"
+TS_STAMP="$(date -u +%H%M%S)"
+ARTIFACT_DIR="${MCP_SMOKE_OUTPUT_DIR:-/tmp/openclaw-mcp-smoke}/${TS_DAY}/${TS_STAMP}"
+mkdir -p "$ARTIFACT_DIR"
+
+NOW=()
+WATCH=()
+NEXT=()
+P1=0
+P2=0
+
+add_now(){ NOW+=("$1"); }
+add_watch(){ WATCH+=("$1"); }
+add_next(){ NEXT+=("$1"); }
+mark_p1(){ P1=$((P1+1)); }
+mark_p2(){ P2=$((P2+1)); }
+
+ms_now() { date +%s%3N; }
+
+# 1) initialize
+# Sends the JSON-RPC initialize request, stores headers/body/stderr as
+# artifacts, then extracts the session id and any JSON-RPC error.
+init_headers="$ARTIFACT_DIR/init.headers"
+init_body="$ARTIFACT_DIR/init.body"
+init_ok=1
+init_start="$(ms_now)"
+if ! curl -sS -m "$TIMEOUT_SEC" -D "$init_headers" -o "$init_body" \
+  -H 'Accept: text/event-stream, application/json' \
+  -H 'Content-Type: application/json' \
+  -X POST "$MCP_URL" \
+  --data '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"mcp-smoke","version":"1.0"}}}' \
+  2>"$ARTIFACT_DIR/init.err"; then
+  init_ok=0
+  add_now "P1 initialize request failed (${MCP_URL})"
+  mark_p1
+  add_next "Check MCP endpoint reachability and auth requirements"
+fi
+init_ms=$(( $(ms_now) - init_start ))
+
+# A failed request may leave no header/body file behind; only parse what exists.
+session_id=""
+if [[ -f "$init_headers" ]]; then
+  # Header name is matched case-insensitively; the trailing CR is stripped.
+  session_id="$(awk -F': ' 'tolower($1)=="mcp-session-id" {gsub(/\r/,"",$2); print $2}' "$init_headers" | tail -n1 || true)"
+fi
+
+init_data_line=""
+if [[ -f "$init_body" ]]; then
+  # Preferred form: last SSE "data:" frame.
+  init_data_line="$(grep '^data: ' "$init_body" | tail -n1 | sed 's/^data: //' || true)"
+  # The Accept header also offers application/json, so fall back to treating
+  # the whole body as JSON when no SSE frame is present.
+  if [[ -z "$init_data_line" ]]; then
+    init_data_line="$(jq -c . "$init_body" 2>/dev/null | tail -n1 || true)"
+  fi
+fi
+
+if [[ -n "$session_id" ]]; then
+  add_watch "P4 initialize OK (${init_ms}ms)"
+elif (( init_ok == 1 )); then
+  # Only claim "succeeded" when the request itself went through; a transport
+  # failure has already been reported above and must not be counted twice.
+  add_now "P1 initialize succeeded without mcp-session-id header"
+  mark_p1
+  add_next "Confirm endpoint is MCP over HTTP (streamable)"
+fi
+
+if [[ -n "$init_data_line" ]] && jq -e '.error' >/dev/null 2>&1 <<<"$init_data_line"; then
+  init_err_msg="$(jq -r '.error.message // "unknown initialize error"' <<<"$init_data_line")"
+  add_now "P1 initialize error: ${init_err_msg}"
+  mark_p1
+  add_next "Verify MCP auth/API key configuration"
+fi
+
+# 2) notifications/initialized (best effort)
+# Sent only when initialize yielded a session id. The outcome is ignored:
+# headers, body and stderr are kept as artifacts, failures never abort the run.
+if [[ -n "$session_id" ]]; then
+  initialized_args=(
+    -sS -m "$TIMEOUT_SEC"
+    -D "$ARTIFACT_DIR/initialized.headers"
+    -o "$ARTIFACT_DIR/initialized.body"
+    -H "mcp-session-id: ${session_id}"
+    -H 'Accept: text/event-stream, application/json'
+    -H 'Content-Type: application/json'
+    -X POST "$MCP_URL"
+    --data '{"jsonrpc":"2.0","method":"notifications/initialized","params":{}}'
+  )
+  curl "${initialized_args[@]}" > /dev/null 2>"$ARTIFACT_DIR/initialized.err" || true
+fi
+
+# 3) tools/list
+# Requests the tool catalogue and writes the sorted, de-duplicated tool names
+# to tools.current.txt. Skipped entirely when no session was established.
+tools_names_file="$ARTIFACT_DIR/tools.current.txt"
+tools_ms=0
+if [[ -n "$session_id" ]]; then
+  tools_start="$(ms_now)"
+  if curl -sS -m "$TIMEOUT_SEC" -o "$ARTIFACT_DIR/tools.body" \
+    -H "mcp-session-id: ${session_id}" \
+    -H 'Accept: text/event-stream, application/json' \
+    -H 'Content-Type: application/json' \
+    -X POST "$MCP_URL" \
+    --data '{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}' \
+    2>"$ARTIFACT_DIR/tools.err"; then
+    tools_ms=$(( $(ms_now) - tools_start ))
+
+    # Normalise the response into tools.json: last SSE "data:" frame if there
+    # is one, otherwise the body itself when it is plain JSON.
+    : > "$ARTIFACT_DIR/tools.json"
+    if [[ -f "$ARTIFACT_DIR/tools.body" ]]; then
+      grep '^data: ' "$ARTIFACT_DIR/tools.body" | sed 's/^data: //' | tail -n1 > "$ARTIFACT_DIR/tools.json" || true
+      if [[ ! -s "$ARTIFACT_DIR/tools.json" ]]; then
+        jq -c . "$ARTIFACT_DIR/tools.body" 2>/dev/null | tail -n1 > "$ARTIFACT_DIR/tools.json" || true
+      fi
+    fi
+
+    if jq -e '.error' "$ARTIFACT_DIR/tools.json" >/dev/null 2>&1; then
+      msg="$(jq -r '.error.message // "tools/list failed"' "$ARTIFACT_DIR/tools.json")"
+      add_now "P1 tools/list error: ${msg}"
+      mark_p1
+      add_next "Check MCP upstream provider credentials"
+    elif jq -e '.result.tools | type == "array"' "$ARTIFACT_DIR/tools.json" >/dev/null 2>&1; then
+      # Keep only string names so a malformed entry cannot make jq fail and,
+      # through pipefail + set -e, abort the run before the summary is printed.
+      jq -r '.result.tools[]? | objects | .name | strings' "$ARTIFACT_DIR/tools.json" | sort -u > "$tools_names_file"
+      tool_count="$(wc -l < "$tools_names_file" | tr -d ' ')"
+      add_watch "P4 tools/list OK (${tools_ms}ms, ${tool_count} tools)"
+    else
+      # Neither an error nor a tools array: an empty or unparsable response
+      # must not be reported as "OK (0 tools)".
+      add_now "P1 tools/list returned no usable result"
+      mark_p1
+      add_next "Inspect tools/list response body in artifact path"
+    fi
+  else
+    add_now "P1 tools/list request failed"
+    mark_p1
+    add_next "Inspect MCP server logs and network path"
+  fi
+fi
+
+# 4) optional tool probe (auth + runtime)
+# Calls brave_web_search once with PROBE_QUERY when the tool is listed.
+# Skipped with --skip-tool-call, without a session, or without a tool list.
+if (( SKIP_TOOL_CALL == 0 )) && [[ -n "$session_id" ]] && [[ -s "$tools_names_file" ]]; then
+  if grep -qx 'brave_web_search' "$tools_names_file"; then
+    # Build the request with jq so quotes/backslashes in PROBE_QUERY are
+    # escaped instead of corrupting the JSON document.
+    call_payload="$(jq -cn --arg q "$PROBE_QUERY" \
+      '{jsonrpc:"2.0",id:3,method:"tools/call",params:{name:"brave_web_search",arguments:{query:$q,count:1}}}')"
+    call_start="$(ms_now)"
+    if curl -sS -m "$TIMEOUT_SEC" -o "$ARTIFACT_DIR/tool-call.body" \
+      -H "mcp-session-id: ${session_id}" \
+      -H 'Accept: text/event-stream, application/json' \
+      -H 'Content-Type: application/json' \
+      -X POST "$MCP_URL" \
+      --data "$call_payload" \
+      2>"$ARTIFACT_DIR/tool-call.err"; then
+      call_ms=$(( $(ms_now) - call_start ))
+
+      # Normalise the response: last SSE "data:" frame, else plain JSON body.
+      : > "$ARTIFACT_DIR/tool-call.json"
+      if [[ -f "$ARTIFACT_DIR/tool-call.body" ]]; then
+        grep '^data: ' "$ARTIFACT_DIR/tool-call.body" | sed 's/^data: //' | tail -n1 > "$ARTIFACT_DIR/tool-call.json" || true
+        if [[ ! -s "$ARTIFACT_DIR/tool-call.json" ]]; then
+          jq -c . "$ARTIFACT_DIR/tool-call.body" 2>/dev/null | tail -n1 > "$ARTIFACT_DIR/tool-call.json" || true
+        fi
+      fi
+
+      if jq -e '.error' "$ARTIFACT_DIR/tool-call.json" >/dev/null 2>&1; then
+        msg="$(jq -r '.error.message // "tools/call failed"' "$ARTIFACT_DIR/tool-call.json")"
+        add_now "P1 tools/call error: ${msg}"
+        mark_p1
+        add_next "Verify Brave API key/plan and outbound internet access"
+      elif jq -e '.result.isError == true' "$ARTIFACT_DIR/tool-call.json" >/dev/null 2>&1; then
+        # Tool-level failures arrive inside a successful JSON-RPC result with
+        # isError set, so they need their own check.
+        msg="$(jq -r '.result.content[0].text // "tool reported isError"' "$ARTIFACT_DIR/tool-call.json" 2>/dev/null || echo "tool reported isError")"
+        msg="${msg//$'\n'/ }"   # keep the summary bullet on one line
+        add_now "P1 tools/call tool error: ${msg}"
+        mark_p1
+        add_next "Verify Brave API key/plan and outbound internet access"
+      elif jq -e '.result | type == "object"' "$ARTIFACT_DIR/tool-call.json" >/dev/null 2>&1; then
+        add_watch "P4 tools/call brave_web_search OK (${call_ms}ms)"
+      else
+        # Empty or unparsable response: do not report it as a success.
+        add_now "P1 tools/call returned no usable result"
+        mark_p1
+        add_next "Inspect tools/call response body in artifact path"
+      fi
+    else
+      add_now "P1 tools/call request failed"
+      mark_p1
+      add_next "Check MCP service health and external API reachability"
+    fi
+  else
+    add_watch "P3 brave_web_search not present; skipped tools/call probe"
+  fi
+fi
+
+# 5) tool-list drift
+# Compares this run's tool names with the baseline file. Nothing is compared
+# unless the current run produced a non-empty tool list.
+if [[ -s "$tools_names_file" ]]; then
+  if [[ ! -f "$BASELINE_FILE" ]]; then
+    # No baseline yet: either we are about to create one, or we suggest it.
+    if (( UPDATE_BASELINE != 1 )); then
+      add_watch "P3 No baseline file yet (${BASELINE_FILE})"
+      add_next "Run with --update-baseline after confirming current tool list"
+    else
+      add_watch "P4 Baseline bootstrap mode (creating ${BASELINE_FILE})"
+    fi
+  else
+    baseline_sorted="$ARTIFACT_DIR/tools.baseline.sorted.txt"
+    added_file="$ARTIFACT_DIR/tools.added.txt"
+    removed_file="$ARTIFACT_DIR/tools.removed.txt"
+
+    # comm needs sorted input on both sides; the current list is already sorted.
+    sort -u "$BASELINE_FILE" > "$baseline_sorted"
+    comm -13 "$baseline_sorted" "$tools_names_file" > "$added_file" || true
+    comm -23 "$baseline_sorted" "$tools_names_file" > "$removed_file" || true
+
+    added_n="$(wc -l < "$added_file" | tr -d ' ')"
+    removed_n="$(wc -l < "$removed_file" | tr -d ' ')"
+    if (( added_n == 0 && removed_n == 0 )); then
+      add_watch "P4 Tool list matches baseline"
+    else
+      add_watch "P2 Tool-list drift detected (+${added_n}/-${removed_n})"
+      mark_p2
+      add_next "Review drift and update baseline if expected"
+    fi
+  fi
+fi
+
+# Persist the current tool list as the new baseline when requested.
+if [[ -s "$tools_names_file" ]] && (( UPDATE_BASELINE == 1 )); then
+  mkdir -p "$(dirname "$BASELINE_FILE")"
+  cp "$tools_names_file" "$BASELINE_FILE"
+  add_watch "P4 Baseline updated: ${BASELINE_FILE}"
+fi
+
+# 6) mcporter quick config signal (optional)
+# Adds an informational line only when mcporter is installed AND
+# `mcporter list --json` succeeds; otherwise this step is silent.
+mcporter_json="$ARTIFACT_DIR/mcporter-list.json"
+if command -v mcporter >/dev/null 2>&1 \
+  && mcporter list --json >"$mcporter_json" 2>"$ARTIFACT_DIR/mcporter-list.err"; then
+  # Unparsable output counts as zero configured servers.
+  configured="$(jq -r '(.servers // []) | length' "$mcporter_json" 2>/dev/null || echo 0)"
+  add_watch "P4 mcporter configured servers: ${configured}"
+fi
+
+VERDICT="OK"
+EXIT_CODE=0
+if (( P1 > 0 )); then
+  VERDICT="NEEDS_ATTENTION"
+  EXIT_CODE=2
+elif (( P2 > 0 )); then
+  VERDICT="MONITOR"
+  EXIT_CODE=1
+fi
+
+{
+  echo "Verdict: ${VERDICT}"
+  echo "Counts: p1=${P1} p2=${P2}"
+  echo "Endpoint: ${MCP_URL}"
+  echo "Session: ${session_id:-none}"
+  echo "Artifact path: ${ARTIFACT_DIR}"
+  echo
+  echo "Now:"
+  if (( ${#NOW[@]} == 0 )); then echo "- P4 Nothing urgent"; else for x in "${NOW[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Watch:"
+  if (( ${#WATCH[@]} == 0 )); then echo "- P4 No watch items"; else for x in "${WATCH[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Next actions:"
+  if (( ${#NEXT[@]} == 0 )); then
+    echo "- Keep current baseline and run periodically"
+  else
+    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
+  fi
+} | tee "$ARTIFACT_DIR/summary.txt"
+
+exit "$EXIT_CODE"
@@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Operational snapshot for the OpenClaw homelab, kept deliberately lightweight.
+# Report layout: Now / Soon / Watch / Next actions
+#
+# Each setting keeps a non-empty value from the environment and otherwise
+# falls back to the default shown here.
+
+: "${OPENCLAW_BIN:=openclaw}"
+: "${BACKUP_LOG:=memory/minio-backup.log}"
+: "${BACKUP_MAX_AGE_HOURS:=8}"
+: "${SEARX_URL:=http://192.168.153.113:18803}"
+: "${WHISPER_URL:=http://192.168.153.113:18801}"
+: "${MCP_URL:=http://192.168.153.113:18802/mcp}"
+: "${WARN_DISK_PCT:=85}"
+: "${WARN_MEM_PCT:=85}"
+
+# Per-run artifact directory: <output root>/<UTC date>/<UTC time>.
+# Both timestamp parts come from ONE `date` call so that a run starting right
+# at midnight UTC cannot combine the old day with the new time of day.
+run_ts="$(date -u '+%F %H%M%S')"
+TS_DAY="${run_ts% *}"
+TS_STAMP="${run_ts#* }"
+ARTIFACT_DIR="${HEALTHCHECK_OUTPUT_DIR:-/tmp/openclaw-healthcheck}/${TS_DAY}/${TS_STAMP}"
+mkdir -p "$ARTIFACT_DIR"
+
+# Severity counters and report buckets filled in by the checks below.
+P1=0
+P2=0
+NOW=()
+SOON=()
+WATCH=()
+NEXT=()
+
+# Finding collectors: each appends its single argument to one report bucket.
+add_now() {
+  NOW+=("$1")
+}
+
+add_soon() {
+  SOON+=("$1")
+}
+
+add_watch() {
+  WATCH+=("$1")
+}
+
+add_next() {
+  NEXT+=("$1")
+}
+
+# Severity counters: bump the P1 / P2 tallies by one.
+mark_p1() {
+  P1=$(( P1 + 1 ))
+}
+
+mark_p2() {
+  P2=$(( P2 + 1 ))
+}
+
+http_probe() {
+  local name="$1" url="$2" expected_regex="$3"
+  local out_file="$ARTIFACT_DIR/http-${name}.txt"
+  local result code ttotal
+  if ! result="$(curl -sS -m 6 -o "$out_file" -w '%{http_code} %{time_total}' "$url" 2>"$ARTIFACT_DIR/http-${name}.err")"; then
+    add_now "P1 ${name} unreachable (${url})"
+    mark_p1
+    add_next "Check ${name} service/container and LAN route"
+    return
+  fi
+  code="${result%% *}"
+  ttotal="${result##* }"
+  if [[ "$code" =~ $expected_regex ]]; then
+    add_watch "P4 ${name} OK (HTTP ${code}, ${ttotal}s)"
+  else
+    add_watch "P2 ${name} unexpected response (HTTP ${code}, ${ttotal}s)"
+    mark_p2
+    add_next "Validate ${name} endpoint/health semantics"
+  fi
+}
+
+# 1) OpenClaw health + security
+# Health: the command must succeed AND its JSON must carry `.ok == true`.
+health_json="$ARTIFACT_DIR/openclaw-health.json"
+if ! "$OPENCLAW_BIN" health --json >"$health_json" 2>"$ARTIFACT_DIR/openclaw-health.err"; then
+  add_now "P1 Failed to run openclaw health"
+  mark_p1
+  add_next "Run: openclaw status && openclaw logs --follow"
+elif jq -e '.ok == true' "$health_json" >/dev/null 2>&1; then
+  add_watch "P4 OpenClaw gateway health OK"
+else
+  add_now "P1 OpenClaw health reported not-ok"
+  mark_p1
+  add_next "Run: openclaw health --json"
+fi
+
+# Security audit: critical findings are P1, warnings are P2. Counts that are
+# missing or unparsable fall back to 0; non-numeric values are ignored.
+audit_json="$ARTIFACT_DIR/openclaw-security-audit.json"
+if ! "$OPENCLAW_BIN" security audit --json >"$audit_json" 2>"$ARTIFACT_DIR/openclaw-security-audit.err"; then
+  add_watch "P3 Security audit command failed"
+  add_next "Run: openclaw security audit --json"
+else
+  sec_critical="$(jq -r '.summary.critical // 0' "$audit_json" 2>/dev/null || echo 0)"
+  sec_warn="$(jq -r '.summary.warn // 0' "$audit_json" 2>/dev/null || echo 0)"
+  if [[ "$sec_critical" =~ ^[0-9]+$ ]] && (( sec_critical > 0 )); then
+    add_now "P1 Security audit has ${sec_critical} critical finding(s)"
+    mark_p1
+    add_next "Run: openclaw security audit --deep"
+  fi
+  if [[ "$sec_warn" =~ ^[0-9]+$ ]] && (( sec_warn > 0 )); then
+    add_watch "P2 Security audit has ${sec_warn} warning(s)"
+    mark_p2
+    add_next "Review plugin/tool policy allowlists"
+  fi
+fi
+
+# 2) Backup freshness from minio backup log
+# Takes the last "Backup complete:" line, extracts the key timestamp
+# (YYYYMMDDTHHMMSSZ after "workspace-backups/") and compares its age against
+# BACKUP_MAX_AGE_HOURS.
+if [[ -f "$BACKUP_LOG" ]]; then
+  last_key="$(grep 'Backup complete:' "$BACKUP_LOG" | tail -n1 | sed -E 's#.*workspace-backups/([0-9]{8}T[0-9]{6}Z).*#\1#' || true)"
+  if [[ -n "$last_key" ]]; then
+    backup_epoch=0
+    # sed leaves the line untouched when the pattern is absent, so confirm we
+    # really hold a timestamp key before slicing it by position.
+    if [[ "$last_key" =~ ^[0-9]{8}T[0-9]{6}Z$ ]]; then
+      backup_iso="${last_key:0:4}-${last_key:4:2}-${last_key:6:2} ${last_key:9:2}:${last_key:11:2}:${last_key:13:2} UTC"
+      backup_epoch="$(date -u -d "$backup_iso" +%s 2>/dev/null || echo 0)"
+    fi
+    now_epoch="$(date -u +%s)"
+    if (( backup_epoch > 0 )); then
+      # Compare in seconds: truncating to whole hours first would let a
+      # backup up to 59 minutes past the limit pass as not yet stale.
+      age_sec=$(( now_epoch - backup_epoch ))
+      age_hours=$(( age_sec / 3600 ))
+      max_age_sec=$(( BACKUP_MAX_AGE_HOURS * 3600 ))
+      if (( age_sec > max_age_sec )); then
+        add_now "P1 Backup stale: last success ${age_hours}h ago (${last_key})"
+        mark_p1
+        add_next "Run backup job now and verify new 'Backup complete' entry"
+      elif (( age_sec >= max_age_sec - 3600 )); then
+        # Within the final hour before the limit.
+        add_soon "P2 Backup nearing threshold: ${age_hours}h old (${last_key})"
+        mark_p2
+      else
+        add_watch "P4 Backup fresh (${age_hours}h old, ${last_key})"
+      fi
+    else
+      add_now "P1 Could not parse backup timestamp from ${BACKUP_LOG}"
+      mark_p1
+      add_next "Inspect backup log format or backup script output"
+    fi
+  else
+    add_now "P1 No 'Backup complete' entry found in ${BACKUP_LOG}"
+    mark_p1
+    add_next "Run backup and confirm completion line is written"
+  fi
+else
+  add_now "P1 Backup log missing: ${BACKUP_LOG}"
+  mark_p1
+  add_next "Create backup log or fix backup script path"
+fi
+
+# 3) Key LAN service probes
+# Table-driven: entry i of each array describes one probe (name, URL, and the
+# regex its HTTP status must match). Probes run in the order listed.
+probe_names=("searxng" "whisper" "brave-mcp")
+probe_urls=("$SEARX_URL" "$WHISPER_URL" "$MCP_URL")
+probe_codes=('^200$' '^200$' '^(200|406)$')
+for probe_idx in "${!probe_names[@]}"; do
+  http_probe "${probe_names[$probe_idx]}" "${probe_urls[$probe_idx]}" "${probe_codes[$probe_idx]}"
+done
+
+# 4) Host pressure: disk + memory
+# Root filesystem usage, taken from the capacity column of `df -P /`.
+# A non-numeric reading is skipped without a finding.
+root_disk_pct="$(df -P / | awk 'NR==2 {gsub(/%/,"",$5); print $5}' 2>/dev/null || echo 0)"
+if [[ "$root_disk_pct" =~ ^[0-9]+$ ]]; then
+  if (( root_disk_pct < 95 && root_disk_pct < WARN_DISK_PCT )); then
+    add_watch "P4 Root disk normal: ${root_disk_pct}% used"
+  elif (( root_disk_pct < 95 )); then
+    add_soon "P2 Root disk high: ${root_disk_pct}% used"
+    mark_p2
+    add_next "Prune logs/artifacts and monitor growth"
+  else
+    add_now "P1 Root disk critical: ${root_disk_pct}% used"
+    mark_p1
+    add_next "Free disk space urgently"
+  fi
+fi
+
+# Memory usage derived from /proc/meminfo as (MemTotal - MemAvailable) / MemTotal.
+# Skipped when the file is unreadable or either value is not a plain number.
+if [[ -r /proc/meminfo ]]; then
+  mem_total_kb="$(awk '/MemTotal:/ {print $2}' /proc/meminfo)"
+  mem_avail_kb="$(awk '/MemAvailable:/ {print $2}' /proc/meminfo)"
+  if [[ "$mem_total_kb" =~ ^[0-9]+$ ]] && [[ "$mem_avail_kb" =~ ^[0-9]+$ ]] && (( mem_total_kb > 0 )); then
+    mem_used_pct=$(( (100 * (mem_total_kb - mem_avail_kb)) / mem_total_kb ))
+    if (( mem_used_pct < 95 && mem_used_pct < WARN_MEM_PCT )); then
+      add_watch "P4 Memory normal: ${mem_used_pct}% used"
+    elif (( mem_used_pct < 95 )); then
+      add_soon "P2 Memory pressure high: ${mem_used_pct}% used"
+      mark_p2
+      add_next "Check workload spikes and tune limits"
+    else
+      add_now "P1 Memory pressure critical: ${mem_used_pct}% used"
+      mark_p1
+      add_next "Inspect heavy processes / reduce workload"
+    fi
+  fi
+fi
+
+VERDICT="OK"
+EXIT_CODE=0
+if (( P1 > 0 )); then
+  VERDICT="NEEDS_ATTENTION"
+  EXIT_CODE=2
+elif (( P2 > 0 )); then
+  VERDICT="MONITOR"
+  EXIT_CODE=1
+fi
+
+{
+  echo "Verdict: ${VERDICT}"
+  echo "Counts: p1=${P1} p2=${P2}"
+  echo "Artifact path: ${ARTIFACT_DIR}"
+  echo
+  echo "Now:"
+  if (( ${#NOW[@]} == 0 )); then echo "- P4 Nothing urgent"; else for x in "${NOW[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Soon:"
+  if (( ${#SOON[@]} == 0 )); then echo "- P4 No near-term risks"; else for x in "${SOON[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Watch:"
+  if (( ${#WATCH[@]} == 0 )); then echo "- P4 No watch items"; else for x in "${WATCH[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Next actions:"
+  if (( ${#NEXT[@]} == 0 )); then
+    echo "- Keep current cadence"
+  else
+    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
+  fi
+} | tee "$ARTIFACT_DIR/summary.txt"
+
+exit "$EXIT_CODE"