Files
swarm-master/litellm-health-check.sh
William Valentin c94bbe5de8 Add LiteLLM maintenance scripts and systemd health-check timer
litellm-dedup.sh: removes duplicate model DB entries (idempotent, supports
--dry-run). Root cause of duplicates was litellm-init running multiple times
before the DB was populated, causing all entries to be inserted concurrently.

litellm-health-check.sh: runs every 6 hours via systemd user timer; checks
liveness (auto-restarts container if unresponsive) and duplicate entries
(auto-dedups when DEDUP=1). Logs to litellm-maintenance.log.

Systemd units: litellm-health-check.{service,timer} installed under
~/.config/systemd/user/.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-12 13:33:16 -07:00

70 lines
2.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# LiteLLM health check — runs periodically via a systemd user timer (litellm-health-check.timer).
# Checks:
# 1. Liveness (restart container if unresponsive)
# 2. Duplicate model entries (log warning, auto-dedup if DEDUP=1)
# Exits 0 on healthy, non-zero on unrecoverable failure.
set -euo pipefail

# Base URL of the LiteLLM proxy; override through the environment.
LITELLM_URL="${LITELLM_URL:-http://localhost:18804}"
# Directory that holds the .env file and litellm-dedup.sh.
SWARM_DIR="/home/will/lab/swarm"
# Set DEDUP=1 to automatically remove duplicates when found
DEDUP="${DEDUP:-1}"

#######################################
# Emit a timestamped log line.
# Arguments: the message words
# Outputs:   "[YYYY-MM-DD HH:MM:SS] litellm-health: <message>" on stdout
#######################################
log() {
  printf '[%s] litellm-health: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

#######################################
# Print the value assigned to a variable in a dotenv-style file.
# Only the first line beginning with "NAME=" is used, and everything after the
# first "=" is kept, so values that themselves contain "=" are not truncated
# and commented-out or similarly named variables are ignored.
# Arguments: $1 - variable name
#            $2 - path to the env file
# Outputs:   the raw value on stdout (nothing when not found)
# Returns:   0 if the variable was found, 1 otherwise
#######################################
read_env_value() {
  local key=$1 file=$2 line
  [[ -r "$file" ]] || return 1
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == "$key="* ]]; then
      printf '%s\n' "${line#*=}"
      return 0
    fi
  done < "$file"
  return 1
}

# An exported LITELLM_MASTER_KEY wins; otherwise fall back to the .env file.
if [[ -z "${LITELLM_MASTER_KEY:-}" ]]; then
  LITELLM_MASTER_KEY=$(read_env_value LITELLM_MASTER_KEY "$SWARM_DIR/.env") \
    || LITELLM_MASTER_KEY=""
fi
# A missing key is not fatal at this point: the liveness probe does not use it,
# and authenticated requests made later will fail and be reported on their own.
if [[ -z "$LITELLM_MASTER_KEY" ]]; then
  log "WARN: LITELLM_MASTER_KEY not set and not found in $SWARM_DIR/.env"
fi
# ── 1. Liveness ────────────────────────────────────────────────────────────────
# Probe the liveliness endpoint. On failure, restart the "litellm" container
# and poll up to 6 times, 5s apart (about 30s), before giving up with exit 1.
if curl -sf --max-time 10 "$LITELLM_URL/health/liveliness" > /dev/null 2>&1; then
  log "OK: liveness check passed"
else
  log "WARN: liveness check failed — restarting container"
  # Report a failed restart explicitly; left unchecked, `set -e` would abort
  # the script here without leaving any trace in the log.
  if ! docker restart litellm; then
    log "ERROR: docker restart litellm failed"
    exit 1
  fi

  recovered=0
  for (( attempt = 1; attempt <= 6; attempt++ )); do
    sleep 5
    if curl -sf --max-time 5 "$LITELLM_URL/health/liveliness" > /dev/null 2>&1; then
      recovered=1
      break
    fi
  done

  if (( recovered == 0 )); then
    log "ERROR: container did not recover after restart"
    exit 1
  fi
  log "OK: container recovered after restart"
fi
# ── 2. Duplicate detection ─────────────────────────────────────────────────────
# Fetch the model list from the proxy and count entries flagged as db_model
# whose model_name has already been seen. A fetch failure is only a warning
# (exit 0); an unparseable response or a failed dedup run is logged and exits
# non-zero.
if ! RESPONSE=$(curl -sf --max-time 10 \
    -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
    "$LITELLM_URL/v2/model/info" 2> /dev/null); then
  log "WARN: could not fetch model info"
  exit 0
fi

# The Python program is single-quoted so the shell never expands anything in
# it. Its stderr is left attached so a traceback is visible to the caller.
if ! DUPE_COUNT=$(printf '%s' "$RESPONSE" | python3 -c '
import json
import sys

payload = json.load(sys.stdin)
seen = set()
dupes = 0
for model in payload.get("data") or []:
    # Entries without a truthy model_info.db_model are skipped; a null
    # model_info is treated the same as a missing one.
    if not (model.get("model_info") or {}).get("db_model"):
        continue
    name = model.get("model_name", "")
    if name in seen:
        dupes += 1
    seen.add(name)
print(dupes)
'); then
  log "ERROR: could not parse model info response"
  exit 1
fi

if (( DUPE_COUNT > 0 )); then
  log "WARN: $DUPE_COUNT duplicate model entries detected"
  if [[ "$DEDUP" == "1" ]]; then
    log "Running dedup..."
    # Capture the status instead of letting `set -e` abort silently, then
    # propagate the dedup script's own exit code.
    dedup_rc=0
    LITELLM_MASTER_KEY="$LITELLM_MASTER_KEY" \
      LITELLM_URL="$LITELLM_URL" \
      bash "$SWARM_DIR/litellm-dedup.sh" || dedup_rc=$?
    if (( dedup_rc != 0 )); then
      log "ERROR: litellm-dedup.sh failed with exit code $dedup_rc"
      exit "$dedup_rc"
    fi
  else
    log "Set DEDUP=1 or run litellm-dedup.sh manually to clean up"
  fi
else
  log "OK: no duplicate model entries"
fi