diff --git a/scripts/ops-sentinel.sh b/scripts/ops-sentinel.sh
index a3b70d5..ec03b50 100755
--- a/scripts/ops-sentinel.sh
+++ b/scripts/ops-sentinel.sh
@@ -10,6 +10,17 @@ BACKUP_MAX_AGE_HOURS="${BACKUP_MAX_AGE_HOURS:-8}"
 SEARX_URL="${SEARX_URL:-http://192.168.153.113:18803}"
 WHISPER_URL="${WHISPER_URL:-http://192.168.153.113:18801}"
 MCP_URL="${MCP_URL:-http://192.168.153.113:18802/mcp}"
+# Ollama base URL and the embedding model expected to be present on it.
+OLLAMA_URL="${OLLAMA_URL:-http://127.0.0.1:11434}"
+OLLAMA_EMBED_MODEL="${OLLAMA_EMBED_MODEL:-nomic-embed-text:latest}"
+# "true": an unreachable Ollama is a P1; otherwise only a P3 watch item.
+OLLAMA_REQUIRED="${OLLAMA_REQUIRED:-false}"
+# MinIO base URL; liveness is probed at /minio/health/live.
+MINIO_URL="${MINIO_URL:-http://192.168.153.253:9000}"
+# Space-separated container names expected in local `docker ps` output.
+DOCKER_EXPECTED_CONTAINERS="${DOCKER_EXPECTED_CONTAINERS:-searxng whisper-server brave-search}"
+# "true": a missing expected container is a P1; otherwise a P3 watch item.
+DOCKER_REQUIRE_EXPECTED="${DOCKER_REQUIRE_EXPECTED:-false}"
 
 WARN_DISK_PCT="${WARN_DISK_PCT:-85}"
 WARN_MEM_PCT="${WARN_MEM_PCT:-85}"
@@ -122,12 +133,72 @@ else
   add_next "Create backup log or fix backup script path"
 fi
 
-# 3) Key LAN service probes
+# 3) Docker service health (containers + health state)
+# `docker ps` lists running containers only, one "name<TAB>status" row each.
+if command -v docker >/dev/null 2>&1; then
+  if docker ps --format '{{.Names}}\t{{.Status}}' >"$ARTIFACT_DIR/docker-ps.txt" 2>"$ARTIFACT_DIR/docker-ps.err"; then
+    # Intentionally unquoted: DOCKER_EXPECTED_CONTAINERS is a space-separated list.
+    for svc in $DOCKER_EXPECTED_CONTAINERS; do
+      # Compare the name column literally (no regex, so "." in a name cannot
+      # match other containers); awk exits non-zero when no row matches.
+      if svc_status="$(awk -F'\t' -v name="$svc" '$1 == name { print $2; found = 1; exit } END { exit !found }' "$ARTIFACT_DIR/docker-ps.txt")"; then
+        # Inspect the status column only, so a name such as "dead-letter"
+        # cannot trigger a false P1.
+        if grep -qiE 'unhealthy|dead|exited|restarting' <<<"$svc_status"; then
+          add_now "P1 Docker container ${svc} unhealthy (${svc_status})"
+          mark_p1
+          add_next "Check logs: docker logs --tail=200 ${svc}"
+        else
+          add_watch "P4 Docker container ${svc} running"
+        fi
+      else
+        if [[ "${DOCKER_REQUIRE_EXPECTED}" == "true" ]]; then
+          add_now "P1 Docker container missing: ${svc}"
+          mark_p1
+          add_next "Start or restore container: ${svc}"
+        else
+          add_watch "P3 Docker container not found locally: ${svc} (may run on another host)"
+        fi
+      fi
+    done
+  else
+    add_watch "P3 docker ps failed"
+    add_next "Check Docker daemon permissions/state"
+  fi
+else
+  add_watch "P3 docker CLI not available"
+fi
+
+# 4) Key LAN + local service probes
 http_probe "searxng" "$SEARX_URL" '^200$'
 http_probe "whisper" "$WHISPER_URL" '^200$'
 http_probe "brave-mcp" "$MCP_URL" '^(200|406)$'
+http_probe "minio-live" "${MINIO_URL%/}/minio/health/live" '^200$'
 
-# 4) Host pressure: disk + memory
+# 5) Ollama embeddings availability + target model
+# -f makes curl fail on HTTP >= 400, so an error page is reported as
+# "unreachable" rather than as "up but embedding model missing".
+if curl -fsS -m 6 "${OLLAMA_URL%/}/api/tags" >"$ARTIFACT_DIR/ollama-tags.json" 2>"$ARTIFACT_DIR/ollama-tags.err"; then
+  # Ollama lists untagged models as "<name>:latest", so accept that form too
+  # when OLLAMA_EMBED_MODEL carries no tag.
+  if jq -e --arg model "$OLLAMA_EMBED_MODEL" '.models[]? | select(.name == $model or .name == ($model + ":latest"))' "$ARTIFACT_DIR/ollama-tags.json" >/dev/null 2>&1; then
+    add_watch "P4 Ollama up; embedding model present (${OLLAMA_EMBED_MODEL})"
+  else
+    add_soon "P2 Ollama up but embedding model missing (${OLLAMA_EMBED_MODEL})"
+    mark_p2
+    add_next "Pull model: ollama pull ${OLLAMA_EMBED_MODEL}"
+  fi
+else
+  if [[ "${OLLAMA_REQUIRED}" == "true" ]]; then
+    add_now "P1 Ollama unreachable (${OLLAMA_URL})"
+    mark_p1
+    add_next "Check Ollama service and port (default 11434)"
+  else
+    add_watch "P3 Ollama unreachable at configured URL (${OLLAMA_URL}); set OLLAMA_URL if remote"
+  fi
+fi
+
+# 6) Host pressure: disk + memory
 root_disk_pct="$(df -P / | awk 'NR==2 {gsub(/%/,"",$5); print $5}' 2>/dev/null || echo 0)"
 if [[ "$root_disk_pct" =~ ^[0-9]+$ ]]; then
   if (( root_disk_pct >= 95 )); then