#!/usr/bin/env bash
# LiteLLM health check — runs periodically via cron.
#
# Checks:
#   1. Liveness: probes $LITELLM_URL/health/liveliness and restarts the
#      "litellm" container when the probe fails.
#   2. Duplicate model entries: logs a warning and, when DEDUP=1 (the
#      default), runs litellm-dedup.sh to remove them.
#
# Environment:
#   LITELLM_URL         Base URL of the proxy (default http://localhost:18804).
#   LITELLM_MASTER_KEY  Bearer token for the model-info request. When unset or
#                       empty it is read from $SWARM_DIR/.env.
#   DEDUP               1 (default) runs the dedup script when duplicates are
#                       found; any other value only logs the warning.
#
# Exits 0 on healthy, non-zero on unrecoverable failure.

set -euo pipefail

readonly SWARM_DIR="/home/will/lab/swarm"
readonly CONTAINER_NAME="litellm"
# Recovery budget after a restart: 6 probes, 5s apart (30s in total).
readonly RECOVERY_ATTEMPTS=6
readonly RECOVERY_INTERVAL=5

LITELLM_URL="${LITELLM_URL:-http://localhost:18804}"
# Resolved lazily in check_duplicates so that a missing .env cannot prevent
# the liveness check from running.
LITELLM_MASTER_KEY="${LITELLM_MASTER_KEY:-}"
DEDUP="${DEDUP:-1}"

# Print a timestamped log line to stdout.
log() {
  printf '[%s] litellm-health: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

#######################################
# Read one variable from a dotenv-style file.
# Arguments: $1 - path to the file
#            $2 - variable name
# Outputs:   the value on stdout; nothing when the file is unreadable or the
#            variable is absent.
# Returns:   always 0, so a missing file never trips `set -e`.
#######################################
read_env_value() {
  local file=$1 name=$2 line value=""

  [[ -r "$file" ]] || return 0

  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'} # tolerate CRLF line endings
    # Anchored match: skips comments and names that merely contain $name.
    [[ "$line" =~ ^[[:space:]]*(export[[:space:]]+)?"$name"=(.*)$ ]] || continue
    # Everything after the first "=" is kept, so values containing "="
    # survive. The last assignment wins, as it would if the file were sourced.
    value=${BASH_REMATCH[2]}
  done < "$file"

  # Drop one pair of matching surrounding quotes.
  if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
    value=${value:1:${#value}-2}
  fi

  printf '%s' "$value"
}

# Probe the liveness endpoint. $1 is the curl timeout in seconds.
is_alive() {
  curl -sf --max-time "$1" "$LITELLM_URL/health/liveliness" > /dev/null 2>&1
}

#######################################
# 1. Liveness: restart the container when the probe fails, then wait for it
# to answer again.
# Returns: 0 when the service is (or becomes) responsive, 1 otherwise.
#######################################
ensure_alive() {
  local attempt

  if is_alive 10; then
    log "OK: liveness check passed"
    return 0
  fi

  log "WARN: liveness check failed — restarting container"
  if ! docker restart "$CONTAINER_NAME"; then
    log "ERROR: docker restart $CONTAINER_NAME failed"
    return 1
  fi

  for ((attempt = 1; attempt <= RECOVERY_ATTEMPTS; attempt++)); do
    sleep "$RECOVERY_INTERVAL"
    if is_alive 5; then
      log "OK: container recovered after restart"
      return 0
    fi
  done

  log "ERROR: container did not recover after restart"
  return 1
}

#######################################
# Count duplicate model names in a /v2/model/info response.
# Only entries with a truthy model_info.db_model are considered; every
# repeat of an already-seen model_name counts as one duplicate.
# Inputs:  the JSON response on stdin
# Outputs: the duplicate count on stdout
# Returns: non-zero when the input cannot be parsed.
#######################################
count_duplicates() {
  python3 -c '
import json
import sys

data = json.load(sys.stdin)
seen = set()
dupes = 0
for m in data.get("data") or []:
    if not (m.get("model_info") or {}).get("db_model"):
        continue
    name = m.get("model_name", "")
    if name in seen:
        dupes += 1
    seen.add(name)
print(dupes)
'
}

#######################################
# 2. Duplicate detection, with optional automatic cleanup.
# Returns: 0 when there is nothing to do, the check had to be skipped, or the
#          dedup succeeded; non-zero when the response cannot be parsed or
#          the dedup script fails.
#######################################
check_duplicates() {
  local response dupe_count rc

  if [[ -z "$LITELLM_MASTER_KEY" ]]; then
    LITELLM_MASTER_KEY=$(read_env_value "$SWARM_DIR/.env" LITELLM_MASTER_KEY)
  fi
  if [[ -z "$LITELLM_MASTER_KEY" ]]; then
    log "WARN: LITELLM_MASTER_KEY is not set and was not found in $SWARM_DIR/.env — skipping duplicate check"
    return 0
  fi

  # The Authorization header is fed through stdin (-H @-) so the key never
  # appears in the process list. Failing to fetch is deliberately
  # best-effort: warn and report healthy.
  response=$(curl -sf --max-time 10 -H @- "$LITELLM_URL/v2/model/info" \
    2> /dev/null <<< "Authorization: Bearer $LITELLM_MASTER_KEY") || {
    log "WARN: could not fetch model info"
    return 0
  }

  dupe_count=$(count_duplicates <<< "$response") || {
    log "ERROR: could not parse model info response"
    return 1
  }

  if ((dupe_count == 0)); then
    log "OK: no duplicate model entries"
    return 0
  fi

  log "WARN: $dupe_count duplicate model entries detected"
  if [[ "$DEDUP" != "1" ]]; then
    log "Set DEDUP=1 or run litellm-dedup.sh manually to clean up"
    return 0
  fi

  log "Running dedup..."
  rc=0
  LITELLM_MASTER_KEY="$LITELLM_MASTER_KEY" LITELLM_URL="$LITELLM_URL" \
    bash "$SWARM_DIR/litellm-dedup.sh" || rc=$?
  if ((rc != 0)); then
    log "ERROR: litellm-dedup.sh exited with status $rc"
  fi
  return "$rc"
}

main() {
  ensure_alive || exit 1
  check_duplicates
}

main "$@"