Add LiteLLM maintenance scripts and systemd health-check timer
litellm-dedup.sh: removes duplicate model DB entries (idempotent, supports
--dry-run). Root cause of duplicates was litellm-init running multiple times
before the DB was populated, causing all entries to be inserted concurrently.
litellm-health-check.sh: runs every 6 hours via systemd user timer; checks
liveness (auto-restarts container if unresponsive) and duplicate entries
(auto-dedups when DEDUP=1). Logs to litellm-maintenance.log.
Systemd units: litellm-health-check.{service,timer} installed under
~/.config/systemd/user/.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
73
litellm-dedup.sh
Executable file
73
litellm-dedup.sh
Executable file
@@ -0,0 +1,73 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Removes duplicate model entries from the LiteLLM DB.
|
||||||
|
# Keeps the first entry the API lists for each model name; deletes the rest.
|
||||||
|
# Safe to run at any time — idempotent, no-op when no duplicates exist.
|
||||||
|
# Usage: ./litellm-dedup.sh [--dry-run]
|
||||||
|
|
||||||
|
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
|
||||||
|
|
||||||
|
# Base URL of the LiteLLM proxy; override through the environment.
LITELLM_URL="${LITELLM_URL:-http://localhost:18804}"
# dotenv file consulted only when LITELLM_MASTER_KEY is not already set.
LITELLM_ENV_FILE="${LITELLM_ENV_FILE:-/home/will/lab/swarm/.env}"

#######################################
# Print the value assigned to a variable in a dotenv-style file.
# Only real assignments are matched (optionally prefixed with `export`), so
# commented-out lines and variables that merely contain the name are ignored.
# Everything after the first `=` is kept, so values containing `=` survive.
# One pair of surrounding quotes and a trailing carriage return are removed.
# Arguments: $1 - variable name, $2 - path of the dotenv file
# Outputs:   the value on stdout; a diagnostic on stderr on failure
# Returns:   0 when a non-empty value was found, 1 otherwise
#######################################
read_env_value() {
  local name=$1 file=$2 line value
  if [ ! -r "$file" ]; then
    echo "ERROR: cannot read $file" >&2
    return 1
  fi
  line=$(grep -m1 -E "^[[:space:]]*(export[[:space:]]+)?${name}=" -- "$file") || {
    echo "ERROR: $name is not set in $file" >&2
    return 1
  }
  value=${line#*=}
  value=${value%$'\r'}
  case "$value" in
    \"*\") value=${value#\"}; value=${value%\"} ;;
    \'*\') value=${value#\'}; value=${value%\'} ;;
  esac
  if [ -z "$value" ]; then
    echo "ERROR: $name is empty in $file" >&2
    return 1
  fi
  printf '%s\n' "$value"
}

# A failed lookup makes this assignment return non-zero, which aborts the
# script under `set -e` right after the diagnostic above is printed.
LITELLM_MASTER_KEY="${LITELLM_MASTER_KEY:-$(read_env_value LITELLM_MASTER_KEY "$LITELLM_ENV_FILE")}"

# Accept only the documented flag: a mistyped option must not fall through
# to a real, destructive run.
DRY_RUN=0
case "${1:-}" in
  "") ;;
  --dry-run) DRY_RUN=1 ;;
  *)
    echo "Usage: ${0##*/} [--dry-run]" >&2
    exit 2
    ;;
esac
|
||||||
|
|
||||||
|
# Write a message to stdout, prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# All arguments are joined with single spaces into one line.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}
|
||||||
|
|
||||||
|
# Fetch all model entries from the proxy.
# --max-time bounds the request so a stalled proxy cannot hang the script
# (override with LITELLM_TIMEOUT, in seconds). With -f an HTTP error status
# counts as a failure too, so both cases end in the error branch.
if ! RESPONSE=$(curl -sf --max-time "${LITELLM_TIMEOUT:-10}" \
    -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
    "$LITELLM_URL/v2/model/info"); then
  log "ERROR: failed to reach LiteLLM at $LITELLM_URL"
  exit 1
fi
|
||||||
|
|
||||||
|
#######################################
# List the IDs of duplicate DB-backed model entries.
# Reads a /v2/model/info JSON document on stdin. Only entries whose
# model_info.db_model is truthy are considered. The first entry seen for each
# model name is kept; the ID of every later entry with the same name is
# printed, one per line, in response order.
# Entries without a model name or without an ID are skipped: unnamed entries
# are not duplicates of one another, and an entry with no ID cannot be deleted.
# Outputs:   duplicate IDs on stdout
# Returns:   non-zero when the input is not valid JSON
#######################################
find_duplicate_ids() {
  python3 -c '
import json
import sys

payload = json.load(sys.stdin)
seen = set()
for entry in payload.get("data") or []:
    # model_info may be absent or null; treat both as an empty mapping.
    info = entry.get("model_info") or {}
    if not info.get("db_model"):
        continue
    name = entry.get("model_name")
    model_id = info.get("id")
    if not name or not model_id:
        continue
    if name in seen:
        print(model_id)
    else:
        seen.add(name)
'
}

# Duplicate IDs, newline-separated (empty when there is nothing to delete).
DUPES=$(printf '%s\n' "$RESPONSE" | find_duplicate_ids)
|
||||||
|
|
||||||
|
# Number of non-empty lines in DUPES, i.e. how many IDs are slated for deletion.
TOTAL=$(echo "$DUPES" | awk 'length > 0 { n++ } END { print n + 0 }')

# Nothing to do: report and finish successfully.
case "$TOTAL" in
  0)
    log "No duplicates found."
    exit 0
    ;;
esac

log "Found $TOTAL duplicate entries."

# In dry-run mode only list the IDs; never touch the proxy.
case "$DRY_RUN" in
  1)
    log "Dry run — would delete:"
    echo "$DUPES"
    exit 0
    ;;
esac
|
||||||
|
|
||||||
|
#######################################
# Build the JSON request body for /model/delete.
# Backslashes and double quotes in the ID are escaped so an unusual ID cannot
# break out of the JSON string.
# Arguments: $1 - model ID
# Outputs:   the JSON object on stdout (no trailing newline)
#######################################
delete_payload() {
  local id=$1
  id=${id//\\/\\\\}
  id=${id//\"/\\\"}
  printf '{"id": "%s"}' "$id"
}

# Delete every listed ID. A failed deletion is logged and counted but does not
# stop the loop, so one bad entry cannot block the rest.
ok=0
fail=0
while IFS= read -r id; do
  [ -n "$id" ] || continue
  # curl -f suppresses the body on HTTP errors; "error" stands in for it so
  # the warning below always has something to show.
  result=$(curl -sf --max-time "${LITELLM_TIMEOUT:-10}" -X POST \
    -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
    -H "Content-Type: application/json" \
    -d "$(delete_payload "$id")" \
    "$LITELLM_URL/model/delete" 2>/dev/null || echo "error")
  if grep -q "deleted successfully" <<<"$result"; then
    ok=$((ok + 1))
  else
    log "WARN: failed to delete $id: $result"
    fail=$((fail + 1))
  fi
done <<<"${DUPES:-}"

log "Deleted: $ok Failed: $fail"

# Surface partial failure to the caller instead of reporting success.
if [ "$fail" -gt 0 ]; then
  exit 1
fi
|
||||||
69
litellm-health-check.sh
Executable file
69
litellm-health-check.sh
Executable file
@@ -0,0 +1,69 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# LiteLLM health check — runs periodically via a systemd user timer (every 6 hours).
|
||||||
|
# Checks:
|
||||||
|
# 1. Liveness (restart container if unresponsive)
|
||||||
|
# 2. Duplicate model entries (log warning, auto-dedup if DEDUP=1)
|
||||||
|
# Exits 0 on healthy, non-zero on unrecoverable failure.
|
||||||
|
|
||||||
|
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
|
||||||
|
|
||||||
|
# Base URL of the LiteLLM proxy; override through the environment.
LITELLM_URL="${LITELLM_URL:-http://localhost:18804}"
# Directory that holds litellm-dedup.sh and, by default, the .env file.
SWARM_DIR="${SWARM_DIR:-/home/will/lab/swarm}"
# dotenv file consulted only when LITELLM_MASTER_KEY is not already set.
LITELLM_ENV_FILE="${LITELLM_ENV_FILE:-$SWARM_DIR/.env}"

#######################################
# Print the value assigned to a variable in a dotenv-style file.
# Only real assignments are matched (optionally prefixed with `export`), so
# commented-out lines and variables that merely contain the name are ignored.
# Everything after the first `=` is kept, so values containing `=` survive.
# One pair of surrounding quotes and a trailing carriage return are removed.
# Arguments: $1 - variable name, $2 - path of the dotenv file
# Outputs:   the value on stdout; a diagnostic on stderr on failure
# Returns:   0 when a non-empty value was found, 1 otherwise
#######################################
read_env_value() {
  local name=$1 file=$2 line value
  if [ ! -r "$file" ]; then
    echo "ERROR: cannot read $file" >&2
    return 1
  fi
  line=$(grep -m1 -E "^[[:space:]]*(export[[:space:]]+)?${name}=" -- "$file") || {
    echo "ERROR: $name is not set in $file" >&2
    return 1
  }
  value=${line#*=}
  value=${value%$'\r'}
  case "$value" in
    \"*\") value=${value#\"}; value=${value%\"} ;;
    \'*\') value=${value#\'}; value=${value%\'} ;;
  esac
  if [ -z "$value" ]; then
    echo "ERROR: $name is empty in $file" >&2
    return 1
  fi
  printf '%s\n' "$value"
}

# A failed lookup makes this assignment return non-zero, which aborts the
# script under `set -e` right after the diagnostic above is printed.
LITELLM_MASTER_KEY="${LITELLM_MASTER_KEY:-$(read_env_value LITELLM_MASTER_KEY "$LITELLM_ENV_FILE")}"

# Automatic duplicate removal is ON by default; set DEDUP to anything other
# than 1 (e.g. DEDUP=0) to only log a warning when duplicates are found.
DEDUP="${DEDUP:-1}"
|
||||||
|
|
||||||
|
# Write a message to stdout, prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp
# and the "litellm-health:" tag. All arguments are joined with single spaces.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] litellm-health: %s\n' "$stamp" "$*"
}
|
||||||
|
|
||||||
|
# ── 1. Liveness ────────────────────────────────────────────────────────────────
# Probe the liveliness endpoint. If it does not answer, restart the container
# (name overridable via LITELLM_CONTAINER) and poll until it responds again.
if curl -sf --max-time 10 "$LITELLM_URL/health/liveliness" > /dev/null 2>&1; then
  log "OK: liveness check passed"
else
  log "WARN: liveness check failed — restarting container"
  # Handle the failure explicitly: otherwise `set -e` would end the script
  # here without leaving any trace in the log.
  if ! docker restart "${LITELLM_CONTAINER:-litellm}"; then
    log "ERROR: docker restart failed"
    exit 1
  fi
  # Up to 6 probes, each preceded by a 5s pause. A probe may itself take up to
  # 5s, so the worst-case wait is about 60s.
  recovered=0
  for _ in 1 2 3 4 5 6; do
    sleep 5
    if curl -sf --max-time 5 "$LITELLM_URL/health/liveliness" > /dev/null 2>&1; then
      recovered=1
      break
    fi
  done
  if [ "$recovered" -ne 1 ]; then
    log "ERROR: container did not recover after restart"
    exit 1
  fi
  log "OK: container recovered after restart"
fi
|
||||||
|
|
||||||
|
# ── 2. Duplicate detection ─────────────────────────────────────────────────────
# Pull the model list from the proxy. A failed request is treated as
# non-fatal: it is logged as a warning and the script ends with status 0.
if ! RESPONSE=$(curl --silent --fail --max-time 10 \
    --header "Authorization: Bearer $LITELLM_MASTER_KEY" \
    "$LITELLM_URL/v2/model/info" 2>/dev/null); then
  log "WARN: could not fetch model info"
  exit 0
fi
|
||||||
|
|
||||||
|
#######################################
# Count duplicate DB-backed model entries.
# Reads a /v2/model/info JSON document on stdin. Only entries whose
# model_info.db_model is truthy are considered; every entry after the first
# with the same model name counts as one duplicate.
# Entries without a model name or without an ID are skipped: unnamed entries
# are not duplicates of one another, and an entry with no ID cannot be deleted.
# Outputs:   the duplicate count on stdout
# Returns:   non-zero when the input is not valid JSON
#######################################
count_duplicate_models() {
  python3 -c '
import json
import sys

payload = json.load(sys.stdin)
seen = set()
dupes = 0
for entry in payload.get("data") or []:
    # model_info may be absent or null; treat both as an empty mapping.
    info = entry.get("model_info") or {}
    if not info.get("db_model"):
        continue
    name = entry.get("model_name")
    if not name or not info.get("id"):
        continue
    if name in seen:
        dupes += 1
    else:
        seen.add(name)
print(dupes)
'
}

DUPE_COUNT=$(printf '%s\n' "$RESPONSE" | count_duplicate_models)
|
||||||
|
|
||||||
|
# Act on the duplicate count: either run the dedup script (DEDUP=1) or tell
# the operator how to clean up by hand.
if [ "$DUPE_COUNT" -gt 0 ]; then
  log "WARN: $DUPE_COUNT duplicate model entries detected"
  case "$DEDUP" in
    1)
      log "Running dedup..."
      # Pass the URL and key explicitly so the child script uses the same
      # values as this run.
      LITELLM_MASTER_KEY="$LITELLM_MASTER_KEY" LITELLM_URL="$LITELLM_URL" \
        bash "$SWARM_DIR/litellm-dedup.sh"
      ;;
    *)
      log "Set DEDUP=1 or run litellm-dedup.sh manually to clean up"
      ;;
  esac
else
  log "OK: no duplicate model entries"
fi
|
||||||
Reference in New Issue
Block a user