From ca65f245a3378ff96203077a245f29328aa0b42d Mon Sep 17 00:00:00 2001
From: zap
Date: Thu, 5 Mar 2026 02:17:32 +0000
Subject: [PATCH] feat(scripts): add ops and mcp sentinel automation scripts

---
 scripts/mcp-smoke.sh    | 307 ++++++++++++++++++++++++++++++++++++++++
 scripts/ops-sentinel.sh | 225 ++++++++++++++++++++++++++++++
 2 files changed, 532 insertions(+)
 create mode 100755 scripts/mcp-smoke.sh
 create mode 100755 scripts/ops-sentinel.sh

diff --git a/scripts/mcp-smoke.sh b/scripts/mcp-smoke.sh
new file mode 100755
index 0000000..93237aa
--- /dev/null
+++ b/scripts/mcp-smoke.sh
@@ -0,0 +1,307 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Lightweight MCP smoke test for HTTP MCP servers.
+# Default target: local Brave MCP server.
+#
+# Flow: initialize -> notifications/initialized -> tools/list -> optional
+# tools/call probe -> tool-list drift check against a baseline file.
+# Exit codes: 0 = OK, 1 = MONITOR (P2 findings), 2 = NEEDS_ATTENTION (P1
+# findings) or usage/dependency error. Requires curl and jq.
+
+MCP_URL="${MCP_URL:-http://192.168.153.113:18802/mcp}"
+TIMEOUT_SEC="${TIMEOUT_SEC:-10}"
+BASELINE_FILE="${BASELINE_FILE:-memory/mcp-smoke-tools-baseline.txt}"
+PROBE_QUERY="${PROBE_QUERY:-openclaw}"
+UPDATE_BASELINE=0
+SKIP_TOOL_CALL=0
+
+# Abort with a usage error when option $1 needs a value but the remaining
+# argument count $2 shows none was given; "$2" would otherwise trip `set -u`.
+need_value() {
+  if (( $2 < 2 )); then
+    echo "Missing value for $1" >&2
+    exit 2
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --url)
+      need_value "$1" "$#"; MCP_URL="$2"; shift 2 ;;
+    --timeout)
+      need_value "$1" "$#"; TIMEOUT_SEC="$2"; shift 2 ;;
+    --baseline)
+      need_value "$1" "$#"; BASELINE_FILE="$2"; shift 2 ;;
+    --query)
+      need_value "$1" "$#"; PROBE_QUERY="$2"; shift 2 ;;
+    --update-baseline)
+      UPDATE_BASELINE=1; shift ;;
+    --skip-tool-call)
+      SKIP_TOOL_CALL=1; shift ;;
+    -h|--help)
+      cat <<EOF
+Usage: ${0##*/} [options]
+
+  --url <url>        MCP endpoint (default: ${MCP_URL})
+  --timeout <sec>    Curl timeout (default: ${TIMEOUT_SEC})
+  --baseline <file>  Baseline tool-name file (default: ${BASELINE_FILE})
+  --query <text>     Query used for brave_web_search probe (default: ${PROBE_QUERY})
+  --skip-tool-call   Skip tools/call probe
+  --update-baseline  Save current tool names as baseline
+EOF
+      exit 0 ;;
+    *)
+      echo "Unknown arg: $1" >&2
+      exit 2 ;;
+  esac
+done
+
+# Fail early with a clear message: a missing jq would otherwise make the
+# `.error` checks below read as "no error in the response".
+for dep in curl jq; do
+  if ! command -v "$dep" >/dev/null 2>&1; then
+    echo "Missing required command: ${dep}" >&2
+    exit 2
+  fi
+done
+
+TS_DAY="$(date -u +%F)"
+TS_STAMP="$(date -u +%H%M%S)"
+ARTIFACT_DIR="${MCP_SMOKE_OUTPUT_DIR:-/tmp/openclaw-mcp-smoke}/${TS_DAY}/${TS_STAMP}"
+mkdir -p "$ARTIFACT_DIR"
+
+NOW=()
+WATCH=()
+NEXT=()
+P1=0
+P2=0
+
+add_now(){ NOW+=("$1"); }
+add_watch(){ WATCH+=("$1"); }
+add_next(){ NEXT+=("$1"); }
+mark_p1(){ P1=$((P1+1)); }
+mark_p2(){ P2=$((P2+1)); }
+
+# Milliseconds since the epoch (%3N is a GNU date extension).
+ms_now() { date +%s%3N; }
+
+# Print the JSON-RPC payload found in response body file $1: the last SSE
+# "data:" line when the server streamed, otherwise the raw body, because the
+# Accept header sent below also allows a plain application/json reply.
+rpc_payload() {
+  local body="$1" line
+  [[ -s "$body" ]] || return 0
+  line="$(grep '^data:' "$body" | tail -n1 | sed -E 's/^data: ?//' || true)"
+  if [[ -n "$line" ]]; then
+    printf '%s\n' "$line"
+  else
+    cat "$body"
+  fi
+}
+
+# 1) initialize
+init_headers="$ARTIFACT_DIR/init.headers"
+init_body="$ARTIFACT_DIR/init.body"
+init_ok=1
+init_start="$(ms_now)"
+if ! curl -sS -m "$TIMEOUT_SEC" -D "$init_headers" -o "$init_body" \
+  -H 'Accept: text/event-stream, application/json' \
+  -H 'Content-Type: application/json' \
+  -X POST "$MCP_URL" \
+  --data '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"mcp-smoke","version":"1.0"}}}' \
+  2>"$ARTIFACT_DIR/init.err"; then
+  init_ok=0
+  add_now "P1 initialize request failed (${MCP_URL})"
+  mark_p1
+  add_next "Check MCP endpoint reachability and auth requirements"
+fi
+init_ms=$(( $(ms_now) - init_start ))
+
+# Header and body files may be missing when curl failed before any response.
+session_id=""
+if [[ -f "$init_headers" ]]; then
+  session_id="$(awk -F': ' 'tolower($1)=="mcp-session-id" {gsub(/\r/,"",$2); print $2}' "$init_headers" | tail -n1 || true)"
+fi
+init_json="$(rpc_payload "$init_body")"
+
+if (( init_ok == 1 )); then
+  # Only judge the session header when the request itself went through, so a
+  # transport failure is not reported twice.
+  if [[ -z "$session_id" ]]; then
+    add_now "P1 initialize succeeded without mcp-session-id header"
+    mark_p1
+    add_next "Confirm endpoint is MCP over HTTP (streamable)"
+  else
+    add_watch "P4 initialize OK (${init_ms}ms)"
+  fi
+fi
+
+if [[ -n "$init_json" ]] && jq -e '.error' >/dev/null 2>&1 <<<"$init_json"; then
+  init_err_msg="$(jq -r '.error.message // "unknown initialize error"' <<<"$init_json")"
+  add_now "P1 initialize error: ${init_err_msg}"
+  mark_p1
+  add_next "Verify MCP auth/API key configuration"
+fi
+
+# 2) notifications/initialized (best effort)
+if [[ -n "$session_id" ]]; then
+  curl -sS -m "$TIMEOUT_SEC" -D "$ARTIFACT_DIR/initialized.headers" -o "$ARTIFACT_DIR/initialized.body" \
+    -H "mcp-session-id: ${session_id}" \
+    -H 'Accept: text/event-stream, application/json' \
+    -H 'Content-Type: application/json' \
+    -X POST "$MCP_URL" \
+    --data '{"jsonrpc":"2.0","method":"notifications/initialized","params":{}}' \
+    > /dev/null 2>"$ARTIFACT_DIR/initialized.err" || true
+fi
+
+# 3) tools/list
+tools_names_file="$ARTIFACT_DIR/tools.current.txt"
+tools_ms=0
+if [[ -n "$session_id" ]]; then
+  tools_start="$(ms_now)"
+  if curl -sS -m "$TIMEOUT_SEC" -o "$ARTIFACT_DIR/tools.body" \
+    -H "mcp-session-id: ${session_id}" \
+    -H 'Accept: text/event-stream, application/json' \
+    -H 'Content-Type: application/json' \
+    -X POST "$MCP_URL" \
+    --data '{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}' \
+    2>"$ARTIFACT_DIR/tools.err"; then
+    tools_ms=$(( $(ms_now) - tools_start ))
+    rpc_payload "$ARTIFACT_DIR/tools.body" > "$ARTIFACT_DIR/tools.json"
+    if jq -e '.error' "$ARTIFACT_DIR/tools.json" >/dev/null 2>&1; then
+      msg="$(jq -r '.error.message // "tools/list failed"' "$ARTIFACT_DIR/tools.json")"
+      add_now "P1 tools/list error: ${msg}"
+      mark_p1
+      add_next "Check MCP upstream provider credentials"
+    elif jq -r '.result.tools[]?.name' "$ARTIFACT_DIR/tools.json" 2>"$ARTIFACT_DIR/tools.jq.err" | sort -u > "$tools_names_file"; then
+      tool_count="$(wc -l < "$tools_names_file" | tr -d ' ')"
+      add_watch "P4 tools/list OK (${tools_ms}ms, ${tool_count} tools)"
+    else
+      # Unparseable payload: report it rather than letting pipefail abort the
+      # run before the summary is written.
+      : > "$tools_names_file"
+      add_now "P1 tools/list returned an unparseable response"
+      mark_p1
+      add_next "Inspect ${ARTIFACT_DIR}/tools.body and MCP server logs"
+    fi
+  else
+    add_now "P1 tools/list request failed"
+    mark_p1
+    add_next "Inspect MCP server logs and network path"
+  fi
+fi
+
+# 4) optional tool probe (auth + runtime)
+if (( SKIP_TOOL_CALL == 0 )) && [[ -n "$session_id" ]] && [[ -s "$tools_names_file" ]]; then
+  if grep -qx 'brave_web_search' "$tools_names_file"; then
+    # Build the payload with jq so quotes or backslashes in the query cannot
+    # break or alter the JSON document.
+    call_payload="$(jq -cn --arg q "$PROBE_QUERY" \
+      '{jsonrpc:"2.0",id:3,method:"tools/call",params:{name:"brave_web_search",arguments:{query:$q,count:1}}}')"
+    call_start="$(ms_now)"
+    if curl -sS -m "$TIMEOUT_SEC" -o "$ARTIFACT_DIR/tool-call.body" \
+      -H "mcp-session-id: ${session_id}" \
+      -H 'Accept: text/event-stream, application/json' \
+      -H 'Content-Type: application/json' \
+      -X POST "$MCP_URL" \
+      --data "$call_payload" \
+      2>"$ARTIFACT_DIR/tool-call.err"; then
+      call_ms=$(( $(ms_now) - call_start ))
+      rpc_payload "$ARTIFACT_DIR/tool-call.body" > "$ARTIFACT_DIR/tool-call.json"
+      if jq -e '.error' "$ARTIFACT_DIR/tool-call.json" >/dev/null 2>&1; then
+        msg="$(jq -r '.error.message // "tools/call failed"' "$ARTIFACT_DIR/tool-call.json")"
+        add_now "P1 tools/call error: ${msg}"
+        mark_p1
+        add_next "Verify Brave API key/plan and outbound internet access"
+      elif jq -e '.result.isError == true' "$ARTIFACT_DIR/tool-call.json" >/dev/null 2>&1; then
+        # MCP reports tool execution failures inside the result, not as a
+        # JSON-RPC error.
+        add_now "P1 tools/call error: brave_web_search reported isError"
+        mark_p1
+        add_next "Verify Brave API key/plan and outbound internet access"
+      else
+        add_watch "P4 tools/call brave_web_search OK (${call_ms}ms)"
+      fi
+    else
+      add_now "P1 tools/call request failed"
+      mark_p1
+      add_next "Check MCP service health and external API reachability"
+    fi
+  else
+    add_watch "P3 brave_web_search not present; skipped tools/call probe"
+  fi
+fi
+
+# 5) tool-list drift
+if [[ -s "$tools_names_file" ]]; then
+  if [[ -f "$BASELINE_FILE" ]]; then
+    sort -u "$BASELINE_FILE" > "$ARTIFACT_DIR/tools.baseline.sorted.txt"
+    comm -13 "$ARTIFACT_DIR/tools.baseline.sorted.txt" "$tools_names_file" > "$ARTIFACT_DIR/tools.added.txt" || true
+    comm -23 "$ARTIFACT_DIR/tools.baseline.sorted.txt" "$tools_names_file" > "$ARTIFACT_DIR/tools.removed.txt" || true
+
+    added_n="$(wc -l < "$ARTIFACT_DIR/tools.added.txt" | tr -d ' ')"
+    removed_n="$(wc -l < "$ARTIFACT_DIR/tools.removed.txt" | tr -d ' ')"
+    if (( added_n > 0 || removed_n > 0 )); then
+      add_watch "P2 Tool-list drift detected (+${added_n}/-${removed_n})"
+      mark_p2
+      add_next "Review drift and update baseline if expected"
+    else
+      add_watch "P4 Tool list matches baseline"
+    fi
+  else
+    if (( UPDATE_BASELINE == 1 )); then
+      add_watch "P4 Baseline bootstrap mode (creating ${BASELINE_FILE})"
+    else
+      add_watch "P3 No baseline file yet (${BASELINE_FILE})"
+      add_next "Run with --update-baseline after confirming current tool list"
+    fi
+  fi
+fi
+
+if (( UPDATE_BASELINE == 1 )) && [[ -s "$tools_names_file" ]]; then
+  mkdir -p "$(dirname "$BASELINE_FILE")"
+  cp "$tools_names_file" "$BASELINE_FILE"
+  add_watch "P4 Baseline updated: ${BASELINE_FILE}"
+fi
+
+# 6) mcporter quick config signal (optional)
+if command -v mcporter >/dev/null 2>&1; then
+  if mcporter list --json >"$ARTIFACT_DIR/mcporter-list.json" 2>"$ARTIFACT_DIR/mcporter-list.err"; then
+    configured="$(jq -r '(.servers // []) | length' "$ARTIFACT_DIR/mcporter-list.json" 2>/dev/null || echo 0)"
+    add_watch "P4 mcporter configured servers: ${configured}"
+  fi
+fi
+
+VERDICT="OK"
+EXIT_CODE=0
+if (( P1 > 0 )); then
+  VERDICT="NEEDS_ATTENTION"
+  EXIT_CODE=2
+elif (( P2 > 0 )); then
+  VERDICT="MONITOR"
+  EXIT_CODE=1
+fi
+
+{
+  echo "Verdict: ${VERDICT}"
+  echo "Counts: p1=${P1} p2=${P2}"
+  echo "Endpoint: ${MCP_URL}"
+  echo "Session: ${session_id:-none}"
+  echo "Artifact path: ${ARTIFACT_DIR}"
+  echo
+  echo "Now:"
+  if (( ${#NOW[@]} == 0 )); then echo "- P4 Nothing urgent"; else for x in "${NOW[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Watch:"
+  if (( ${#WATCH[@]} == 0 )); then echo "- P4 No watch items"; else for x in "${WATCH[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Next actions:"
+  if (( ${#NEXT[@]} == 0 )); then
+    echo "- Keep current baseline and run periodically"
+  else
+    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
+  fi
+} | tee "$ARTIFACT_DIR/summary.txt"
+
+exit "$EXIT_CODE"
diff --git a/scripts/ops-sentinel.sh b/scripts/ops-sentinel.sh
new file mode 100755
index 0000000..a3b70d5
--- /dev/null
+++ b/scripts/ops-sentinel.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Lightweight operational snapshot for OpenClaw homelab.
+# Output frame: Now / Soon / Watch / Next actions
+# Exit codes: 0 = OK, 1 = MONITOR (P2 findings), 2 = NEEDS_ATTENTION (P1
+# findings) or a missing dependency. Requires curl and jq.
+
+OPENCLAW_BIN="${OPENCLAW_BIN:-openclaw}"
+BACKUP_LOG="${BACKUP_LOG:-memory/minio-backup.log}"
+BACKUP_MAX_AGE_HOURS="${BACKUP_MAX_AGE_HOURS:-8}"
+SEARX_URL="${SEARX_URL:-http://192.168.153.113:18803}"
+WHISPER_URL="${WHISPER_URL:-http://192.168.153.113:18801}"
+MCP_URL="${MCP_URL:-http://192.168.153.113:18802/mcp}"
+WARN_DISK_PCT="${WARN_DISK_PCT:-85}"
+WARN_MEM_PCT="${WARN_MEM_PCT:-85}"
+PROBE_TIMEOUT_SEC="${PROBE_TIMEOUT_SEC:-6}"
+
+# Fail early with a clear message: a missing jq would otherwise surface as
+# a misleading "health reported not-ok" finding.
+for dep in curl jq; do
+  if ! command -v "$dep" >/dev/null 2>&1; then
+    echo "Missing required command: ${dep}" >&2
+    exit 2
+  fi
+done
+
+TS_DAY="$(date -u +%F)"
+TS_STAMP="$(date -u +%H%M%S)"
+ARTIFACT_DIR="${HEALTHCHECK_OUTPUT_DIR:-/tmp/openclaw-healthcheck}/${TS_DAY}/${TS_STAMP}"
+mkdir -p "$ARTIFACT_DIR"
+
+NOW=()
+SOON=()
+WATCH=()
+NEXT=()
+P1=0
+P2=0
+
+add_now() { NOW+=("$1"); }
+add_soon() { SOON+=("$1"); }
+add_watch() { WATCH+=("$1"); }
+add_next() { NEXT+=("$1"); }
+
+mark_p1() { P1=$((P1 + 1)); }
+mark_p2() { P2=$((P2 + 1)); }
+
+# Probe one HTTP endpoint and file the result under Now/Watch.
+#   $1 - short service name (used in messages and artifact file names)
+#   $2 - URL to GET
+#   $3 - regex the HTTP status code must match to count as healthy
+# A transport failure is P1; an unexpected status code is P2.
+http_probe() {
+  local name="$1" url="$2" expected_regex="$3"
+  local out_file="$ARTIFACT_DIR/http-${name}.txt"
+  local result code ttotal
+  if ! result="$(curl -sS -m "$PROBE_TIMEOUT_SEC" -o "$out_file" -w '%{http_code} %{time_total}' "$url" 2>"$ARTIFACT_DIR/http-${name}.err")"; then
+    add_now "P1 ${name} unreachable (${url})"
+    mark_p1
+    add_next "Check ${name} service/container and LAN route"
+    return
+  fi
+  code="${result%% *}"
+  ttotal="${result##* }"
+  if [[ "$code" =~ $expected_regex ]]; then
+    add_watch "P4 ${name} OK (HTTP ${code}, ${ttotal}s)"
+  else
+    add_watch "P2 ${name} unexpected response (HTTP ${code}, ${ttotal}s)"
+    mark_p2
+    add_next "Validate ${name} endpoint/health semantics"
+  fi
+}
+
+# 1) OpenClaw health + security
+if "$OPENCLAW_BIN" health --json >"$ARTIFACT_DIR/openclaw-health.json" 2>"$ARTIFACT_DIR/openclaw-health.err"; then
+  if jq -e '.ok == true' "$ARTIFACT_DIR/openclaw-health.json" >/dev/null 2>&1; then
+    add_watch "P4 OpenClaw gateway health OK"
+  else
+    add_now "P1 OpenClaw health reported not-ok"
+    mark_p1
+    add_next "Run: openclaw health --json"
+  fi
+else
+  add_now "P1 Failed to run openclaw health"
+  mark_p1
+  add_next "Run: openclaw status && openclaw logs --follow"
+fi
+
+if "$OPENCLAW_BIN" security audit --json >"$ARTIFACT_DIR/openclaw-security-audit.json" 2>"$ARTIFACT_DIR/openclaw-security-audit.err"; then
+  sec_critical="$(jq -r '.summary.critical // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || echo 0)"
+  sec_warn="$(jq -r '.summary.warn // 0' "$ARTIFACT_DIR/openclaw-security-audit.json" 2>/dev/null || echo 0)"
+  if [[ "$sec_critical" =~ ^[0-9]+$ ]] && (( sec_critical > 0 )); then
+    add_now "P1 Security audit has ${sec_critical} critical finding(s)"
+    mark_p1
+    add_next "Run: openclaw security audit --deep"
+  fi
+  if [[ "$sec_warn" =~ ^[0-9]+$ ]] && (( sec_warn > 0 )); then
+    add_watch "P2 Security audit has ${sec_warn} warning(s)"
+    mark_p2
+    add_next "Review plugin/tool policy allowlists"
+  fi
+else
+  add_watch "P3 Security audit command failed"
+  add_next "Run: openclaw security audit --json"
+fi
+
+# 2) Backup freshness from minio backup log
+if [[ -f "$BACKUP_LOG" ]]; then
+  last_key="$(grep 'Backup complete:' "$BACKUP_LOG" | tail -n1 | sed -E 's#.*workspace-backups/([0-9]{8}T[0-9]{6}Z).*#\1#' || true)"
+  if [[ -n "$last_key" ]]; then
+    backup_epoch=0
+    # sed passes a non-matching line through unchanged, so validate the shape
+    # before slicing it into a date string.
+    if [[ "$last_key" =~ ^[0-9]{8}T[0-9]{6}Z$ ]]; then
+      backup_iso="${last_key:0:4}-${last_key:4:2}-${last_key:6:2} ${last_key:9:2}:${last_key:11:2}:${last_key:13:2} UTC"
+      backup_epoch="$(date -u -d "$backup_iso" +%s 2>/dev/null || echo 0)"
+    fi
+    now_epoch="$(date -u +%s)"
+    if (( backup_epoch > 0 )); then
+      age_hours=$(( (now_epoch - backup_epoch) / 3600 ))
+      if (( age_hours > BACKUP_MAX_AGE_HOURS )); then
+        add_now "P1 Backup stale: last success ${age_hours}h ago (${last_key})"
+        mark_p1
+        add_next "Run backup job now and verify new 'Backup complete' entry"
+      elif (( age_hours >= BACKUP_MAX_AGE_HOURS - 1 )); then
+        add_soon "P2 Backup nearing threshold: ${age_hours}h old (${last_key})"
+        mark_p2
+        add_next "Confirm the next scheduled backup run completes"
+      else
+        add_watch "P4 Backup fresh (${age_hours}h old, ${last_key})"
+      fi
+    else
+      add_now "P1 Could not parse backup timestamp from ${BACKUP_LOG}"
+      mark_p1
+      add_next "Inspect backup log format or backup script output"
+    fi
+  else
+    add_now "P1 No 'Backup complete' entry found in ${BACKUP_LOG}"
+    mark_p1
+    add_next "Run backup and confirm completion line is written"
+  fi
+else
+  add_now "P1 Backup log missing: ${BACKUP_LOG}"
+  mark_p1
+  add_next "Create backup log or fix backup script path"
+fi
+
+# 3) Key LAN service probes
+http_probe "searxng" "$SEARX_URL" '^200$'
+http_probe "whisper" "$WHISPER_URL" '^200$'
+http_probe "brave-mcp" "$MCP_URL" '^(200|406)$'
+
+# 4) Host pressure: disk + memory
+# A failed df leaves the value empty instead of defaulting to 0, so it is
+# reported as unknown rather than as a healthy "0% used".
+root_disk_pct="$(df -P / 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}' || true)"
+if [[ "$root_disk_pct" =~ ^[0-9]+$ ]]; then
+  if (( root_disk_pct >= 95 )); then
+    add_now "P1 Root disk critical: ${root_disk_pct}% used"
+    mark_p1
+    add_next "Free disk space urgently"
+  elif (( root_disk_pct >= WARN_DISK_PCT )); then
+    add_soon "P2 Root disk high: ${root_disk_pct}% used"
+    mark_p2
+    add_next "Prune logs/artifacts and monitor growth"
+  else
+    add_watch "P4 Root disk normal: ${root_disk_pct}% used"
+  fi
+else
+  add_watch "P3 Could not determine root disk usage"
+  add_next "Run: df -P /"
+fi
+
+if [[ -r /proc/meminfo ]]; then
+  mem_total_kb="$(awk '/MemTotal:/ {print $2}' /proc/meminfo)"
+  mem_avail_kb="$(awk '/MemAvailable:/ {print $2}' /proc/meminfo)"
+  if [[ "$mem_total_kb" =~ ^[0-9]+$ ]] && [[ "$mem_avail_kb" =~ ^[0-9]+$ ]] && (( mem_total_kb > 0 )); then
+    mem_used_pct=$(( (100 * (mem_total_kb - mem_avail_kb)) / mem_total_kb ))
+    if (( mem_used_pct >= 95 )); then
+      add_now "P1 Memory pressure critical: ${mem_used_pct}% used"
+      mark_p1
+      add_next "Inspect heavy processes / reduce workload"
+    elif (( mem_used_pct >= WARN_MEM_PCT )); then
+      add_soon "P2 Memory pressure high: ${mem_used_pct}% used"
+      mark_p2
+      add_next "Check workload spikes and tune limits"
+    else
+      add_watch "P4 Memory normal: ${mem_used_pct}% used"
+    fi
+  fi
+fi
+
+VERDICT="OK"
+EXIT_CODE=0
+if (( P1 > 0 )); then
+  VERDICT="NEEDS_ATTENTION"
+  EXIT_CODE=2
+elif (( P2 > 0 )); then
+  VERDICT="MONITOR"
+  EXIT_CODE=1
+fi
+
+{
+  echo "Verdict: ${VERDICT}"
+  echo "Counts: p1=${P1} p2=${P2}"
+  echo "Artifact path: ${ARTIFACT_DIR}"
+  echo
+  echo "Now:"
+  if (( ${#NOW[@]} == 0 )); then echo "- P4 Nothing urgent"; else for x in "${NOW[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Soon:"
+  if (( ${#SOON[@]} == 0 )); then echo "- P4 No near-term risks"; else for x in "${SOON[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Watch:"
+  if (( ${#WATCH[@]} == 0 )); then echo "- P4 No watch items"; else for x in "${WATCH[@]}"; do echo "- ${x}"; done; fi
+  echo
+  echo "Next actions:"
+  if (( ${#NEXT[@]} == 0 )); then
+    echo "- Keep current cadence"
+  else
+    printf '%s\n' "${NEXT[@]}" | awk '!seen[$0]++' | sed 's/^/- /'
+  fi
+} | tee "$ARTIFACT_DIR/summary.txt"
+
+exit "$EXIT_CODE"