chore(swarm): wire local AI health checks
This commit is contained in:
@@ -0,0 +1,343 @@
|
||||
# Run every recipe under bash in strict mode: abort on the first failing
# command (-e), treat unset shell variables as errors (-u), and make a pipeline
# fail when any stage of it fails (pipefail).
SHELL := /usr/bin/env bash
.SHELLFLAGS := -eu -o pipefail -c

# --- Docker Compose knobs (override on the command line, e.g. SERVICE=litellm) ---
COMPOSE ?= docker compose
COMPOSE_FILE ?= docker-compose.yaml
COMMON_COMPOSE_FILE ?= swarm-common/docker-compose.yaml
SERVICE ?=
PROFILE ?=
LOGS_TAIL ?= 200

# --- Ansible knobs ---
ANSIBLE_DIR ?= ansible
INVENTORY ?= inventory.yml

# --- OpenClaw / libvirt knobs ---
# HOST selects an instance by name; CONFIRM must be "yes" for guarded targets.
HOST ?= zap
CONFIRM ?= no
OPENCLAW_REGISTRY ?= $(HOME)/.claude/state/openclaw-instances.json
OPENCLAW_PORT ?= 18789
QEMU_URI ?= qemu:///system

# --- Host-side local AI endpoints ---
LLAMA_CPP_URL ?= http://127.0.0.1:18806
OLLAMA_URL ?= http://127.0.0.1:18807
OLLAMA_EMBED_MODEL ?= nomic-embed-text

# --- Derived command prefixes (expanded once, after the knobs above) ---
DC := $(COMPOSE) -f $(COMPOSE_FILE)
COMMON_DC := $(COMPOSE) -f $(COMMON_COMPOSE_FILE)
# Recipes using this prefix run ansible-playbook from inside ANSIBLE_DIR.
ANSIBLE_PLAYBOOK := cd $(ANSIBLE_DIR) && ansible-playbook -i $(INVENTORY)
|
||||
# Registry lookups for the instance named by HOST.
# These stay recursive (=) on purpose: jq runs only when a recipe actually
# expands one of them, so targets that never touch the registry do not need jq
# or the registry file. Each expansion re-runs jq.
# HOST is handed to jq with --arg (bound to $name inside the filter) instead of
# being spliced into the filter text, so quotes or backslashes in HOST cannot
# change the jq program. "$$name" is make's escape for a literal "$name".
# Errors (missing jq, missing or invalid registry) are discarded and yield an
# empty value, which REQUIRE_INSTANCE then rejects.
OPENCLAW_HOST = $(shell jq -r --arg name '$(HOST)' '.instances[] | select(.name == $$name) | .host // empty' $(OPENCLAW_REGISTRY) 2>/dev/null)
# Falls back to "openclaw" when the matching registry entry has no user.
OPENCLAW_USER = $(shell jq -r --arg name '$(HOST)' '.instances[] | select(.name == $$name) | .user // "openclaw"' $(OPENCLAW_REGISTRY) 2>/dev/null)
OPENCLAW_DOMAIN = $(shell jq -r --arg name '$(HOST)' '.instances[] | select(.name == $$name) | .domain // empty' $(OPENCLAW_REGISTRY) 2>/dev/null)

# Guard for state-changing targets: exits 2 unless CONFIRM=yes was given.
REQUIRE_CONFIRM = test "$(CONFIRM)" = "yes" || { echo "This target changes VM/gateway state. Re-run with CONFIRM=yes"; exit 2; }

# Guard for per-instance targets: exits 2 unless both host and domain resolved.
# Two separate tests joined with && replace the obsolescent "test ... -a ...",
# whose parsing is ambiguous for some operand values.
REQUIRE_INSTANCE = { test -n "$(OPENCLAW_HOST)" && test -n "$(OPENCLAW_DOMAIN)"; } || { echo "Unknown OpenClaw HOST=$(HOST) in $(OPENCLAW_REGISTRY)"; exit 2; }
|
||||
|
||||
# Running plain `make` prints the help text.
.DEFAULT_GOAL := help

# Every target in this file is a command rather than a file to build, so all of
# them are declared phony. Multiple .PHONY lines accumulate; they are grouped by
# target family to keep additions easy to place.
.PHONY: help config ps status local-ai-health ollama-embed-health
.PHONY: up down restart pull build logs shell clean
.PHONY: api-up api-down api-restart api-init api-init-force api-health api-dedup api-logs
.PHONY: voice-up voice-gpu voice-cpu voice-down voice-build voice-logs
.PHONY: search-up search-down automation-up automation-down n8n-logs
.PHONY: common-config common-ps common-up common-down common-logs
.PHONY: openclaw-instances openclaw-info openclaw-status openclaw-health openclaw-logs
.PHONY: openclaw-version openclaw-config openclaw-ssh openclaw-root-ssh
.PHONY: gateway-status gateway-health gateway-logs gateway-restart
.PHONY: vm-list vm-autostart-list vm-info vm-mem vm-disks vm-ifaces vm-dhcp vm-start vm-shutdown vm-reboot
.PHONY: vm-autostart-enable vm-autostart-disable vm-autostart-zap-only vm-snapshot-list vm-snapshot
.PHONY: provision install customize deploy restore backup timers
.PHONY: kube-status
|
||||
|
||||
# Self-documenting help. awk scans the parsed makefiles for rule headers of the
# form "target [alias ...]: [prerequisites] ## description".
# The pattern accepts several space-separated target names before the colon, so
# a shared rule such as "openclaw-logs gateway-logs:" is listed too, with one
# output row per name. The name column is 22 wide so the longest target name
# still lines up with its description.
help: ## Show available targets.
	@awk 'BEGIN {FS = ":.*## "; printf "Usage: make <target> [VAR=value]\n\nTargets:\n"} /^[a-zA-Z0-9_.-]+( [a-zA-Z0-9_.-]+)*:.*## / {n = split($$1, names, " "); for (i = 1; i <= n; i++) printf " %-22s %s\n", names[i], $$2}' $(MAKEFILE_LIST)
	@printf "\nCommon vars: HOST=%s SERVICE=<service> PROFILE=<profile> LOGS_TAIL=%s CONFIRM=%s\n" "$(HOST)" "$(LOGS_TAIL)" "$(CONFIRM)"
|
||||
|
||||
# Render the resolved compose configuration with every profile enabled ("*").
# BRAVE_API_KEY is set to the placeholder "dummy" for this command only when it
# is unset or empty in the caller's environment.
config: ## Validate and render all root Docker Compose profiles.
	BRAVE_API_KEY="$${BRAVE_API_KEY:-dummy}" $(DC) --profile "*" config

# Container status for the root compose project.
ps: ## Show root Docker Compose service status.
	$(DC) ps

# Aggregate target: it has no recipe of its own and only runs its prerequisites.
status: ps local-ai-health ## Show Docker service status plus host-side local AI endpoints.
|
||||
|
||||
# Best-effort report on the host-side AI endpoints. Every probe prints OK or
# FAILED (or the systemd unit state) and the target itself does not fail when
# an endpoint is down.
# The llama.cpp model list is captured in a shell variable rather than a fixed
# file under /tmp, so concurrent runs cannot clobber each other and no stale
# file is left behind. The status line always ends with a newline, even when
# jq cannot parse the response.
local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoints.
	@printf "\nHost-side local AI endpoints:\n"
	@printf "llama.cpp (%s): " "$(LLAMA_CPP_URL)"; \
	if models="$$(curl -fsS --max-time 3 "$(LLAMA_CPP_URL)/v1/models" 2>/dev/null)"; then \
		ids="$$(printf '%s' "$$models" | jq -r '[.data[].id] | join(", ")' 2>/dev/null || true)"; \
		printf "OK %s\n" "$$ids"; \
	else \
		printf "FAILED\n"; \
	fi
	@printf "ollama.service: "; systemctl --user is-active ollama.service 2>/dev/null || true
	@printf "Ollama API (%s): " "$(OLLAMA_URL)"; \
	curl -fsS --max-time 3 "$(OLLAMA_URL)/api/version" 2>/dev/null | jq -r '"OK version=" + .version' || printf "FAILED\n"
|
||||
|
||||
# POST a one-string embedding request to OLLAMA_URL and print how many
# embeddings came back plus the length of the first one. A curl failure fails
# the target because recipes run with pipefail.
ollama-embed-health: ## Smoke-test Ollama embeddings using OLLAMA_EMBED_MODEL=nomic-embed-text.
	@curl --fail --silent --show-error --max-time 20 \
		--header 'Content-Type: application/json' \
		--data '{"model":"$(OLLAMA_EMBED_MODEL)","input":"socket check"}' \
		"$(OLLAMA_URL)/api/embed" \
	| jq -r '"embeddings=" + ((.embeddings // []) | length | tostring) + " dim=" + (((.embeddings // [[]])[0] // []) | length | tostring)'
|
||||
|
||||
# comma holds a literal "," because a comma cannot be written directly as text
# inside a make function call (it would be read as an argument separator).
comma := ,
# One `--profile "<name>"` flag per comma-separated name in PROFILE; expands to
# nothing when PROFILE is empty. Passing the whole comma list to a single
# --profile flag would name one profile literally called "a,b", so
# PROFILE=api,voice is split into repeated flags here.
PROFILE_FLAGS = $(foreach name,$(subst $(comma), ,$(PROFILE)),--profile "$(name)")

# Start services in the background. SERVICE (optional) limits the set.
up: ## Start root compose services. Use PROFILE=api,voice,search,automation or SERVICE=name.
	@$(DC) $(PROFILE_FLAGS) up -d $(SERVICE)

# Stop and remove the project's containers; PROFILE adds profiled services.
down: ## Stop root compose services. Use PROFILE=api,voice,search,automation to include profiled services.
	@$(DC) $(PROFILE_FLAGS) down

# SERVICE is mandatory here; the target exits 2 with a hint when it is empty.
restart: ## Restart a compose service. Use SERVICE=name.
	@test -n "$(SERVICE)" || { echo "SERVICE is required, e.g. make restart SERVICE=litellm"; exit 2; }
	$(DC) restart $(SERVICE)

pull: ## Pull compose images. Use PROFILE=api,voice,search,automation or SERVICE=name.
	@$(DC) $(PROFILE_FLAGS) pull $(SERVICE)

build: ## Build compose images. Use SERVICE=whisper-server-gpu or PROFILE=voice.
	@$(DC) $(PROFILE_FLAGS) build $(SERVICE)
|
||||
|
||||
# Stream logs, starting from the last LOGS_TAIL lines. With an empty SERVICE no
# service name is passed to compose.
logs: ## Follow compose logs. Use SERVICE=name and LOGS_TAIL=n.
	$(DC) logs --follow --tail="$(LOGS_TAIL)" $(SERVICE)

# Run `sh` inside the SERVICE container through `compose exec`.
# SERVICE is mandatory: the guard exits 2 with a hint when it is empty.
shell: ## Open a shell in a running compose service. Use SERVICE=name.
	@if [ -z "$(SERVICE)" ]; then echo "SERVICE is required, e.g. make shell SERVICE=litellm"; exit 2; fi
	$(DC) exec $(SERVICE) sh
|
||||
|
||||
# WARNING: destructive. `down --volumes` deletes the named volumes declared in
# the compose file as well as anonymous volumes, so any data kept in those
# volumes is lost. The help text states this explicitly.
clean: ## Stop root compose services and remove orphans plus all compose volumes (named and anonymous; deletes their data).
	$(DC) down --remove-orphans --volumes
|
||||
|
||||
# --- LiteLLM ("api" compose profile) ---

api-up: ## Start LiteLLM and its Postgres/init services.
	$(DC) --profile api up --detach

api-down: ## Stop LiteLLM profile services.
	$(DC) --profile api down

# Restarts only the litellm service, not the rest of the profile.
api-restart: ## Restart LiteLLM proxy container.
	$(DC) restart litellm

# One-off run of the litellm-init service; --rm removes the container afterwards.
api-init: ## Run LiteLLM credential/model initialization once.
	$(DC) --profile api run --rm litellm-init

# Same one-off run with FORCE=1 set in the container environment.
api-init-force: ## Force LiteLLM credential/model initialization.
	$(DC) --profile api run --rm --env FORCE=1 litellm-init

# The two helpers below are scripts expected next to this Makefile.
api-health: ## Run LiteLLM health check and auto-dedup script.
	./litellm-health-check.sh

api-dedup: ## Remove duplicate LiteLLM model DB entries.
	./litellm-dedup.sh

api-logs: ## Follow LiteLLM logs.
	$(DC) logs --follow --tail="$(LOGS_TAIL)" litellm litellm-db litellm-init
|
||||
|
||||
# --- Voice services ("voice" compose profile) ---

voice-up: ## Start all voice services.
	$(DC) --profile voice up --detach

# GPU variant: starts whisper-server-gpu and kokoro-tts only.
voice-gpu: ## Start GPU whisper server and Kokoro TTS.
	$(DC) --profile voice up --detach whisper-server-gpu kokoro-tts

# CPU variant: starts whisper-server and kokoro-tts only.
voice-cpu: ## Start CPU whisper server and Kokoro TTS.
	$(DC) --profile voice up --detach whisper-server kokoro-tts

voice-down: ## Stop voice profile services.
	$(DC) --profile voice down

voice-build: ## Build the custom Blackwell CUDA whisper image.
	$(DC) --profile voice build whisper-server-gpu

voice-logs: ## Follow voice service logs.
	$(DC) logs --follow --tail="$(LOGS_TAIL)" whisper-server-gpu whisper-server kokoro-tts
|
||||
|
||||
# --- Search ("search" profile) and automation ("automation" profile) ---

search-up: ## Start Brave Search MCP and SearXNG.
	$(DC) --profile search up --detach

search-down: ## Stop search profile services.
	$(DC) --profile search down

automation-up: ## Start n8n automation service.
	$(DC) --profile automation up --detach

automation-down: ## Stop automation profile services.
	$(DC) --profile automation down

# Follows the n8n-agent service's logs.
n8n-logs: ## Follow n8n automation logs.
	$(DC) logs --follow --tail="$(LOGS_TAIL)" n8n-agent
|
||||
|
||||
# --- swarm-common compose project (uses COMMON_COMPOSE_FILE) ---

# comma holds a literal "," because a comma cannot be written directly as text
# inside a make function call (it would be read as an argument separator).
comma := ,
# One `--profile "<name>"` flag per comma-separated name in PROFILE; expands to
# nothing when PROFILE is empty. A comma list handed to a single --profile flag
# would name one profile literally called "a,b", so it is split into repeated
# flags here.
PROFILE_FLAGS = $(foreach name,$(subst $(comma), ,$(PROFILE)),--profile "$(name)")

# Render the resolved configuration with every profile enabled ("*").
# BRAVE_API_KEY is set to the placeholder "dummy" for this command only when it
# is unset or empty in the caller's environment.
common-config: ## Validate and render all swarm-common compose profiles.
	BRAVE_API_KEY="$${BRAVE_API_KEY:-dummy}" $(COMMON_DC) --profile "*" config

common-ps: ## Show swarm-common compose service status.
	$(COMMON_DC) ps

# Start services in the background. SERVICE (optional) limits the set.
common-up: ## Start swarm-common compose services. Use PROFILE=... or SERVICE=name.
	@$(COMMON_DC) $(PROFILE_FLAGS) up -d $(SERVICE)

common-down: ## Stop swarm-common compose services. Use PROFILE=... to include profiled services.
	@$(COMMON_DC) $(PROFILE_FLAGS) down

common-logs: ## Follow swarm-common compose logs. Use SERVICE=name.
	$(COMMON_DC) logs -f --tail="$(LOGS_TAIL)" $(SERVICE)
|
||||
|
||||
# One tab-separated row per registry entry:
# name, status, domain, user@host, vCPU count / memory in MiB.
openclaw-instances: ## List OpenClaw instances from the registry.
	@jq -r '.instances[] | "\(.name)\t\(.status)\t\(.domain)\t\(.user)@\(.host)\t\(.vcpus)vCPU/\(.memory_mib)MiB"' $(OPENCLAW_REGISTRY)

# Print the full registry object whose .name equals HOST.
# HOST is passed with --arg (bound to $name in the filter) rather than pasted
# into the filter text, so quotes or backslashes in HOST cannot alter the jq
# program. "$$name" is make's escape for a literal "$name".
# Prints nothing when no entry matches.
openclaw-info: ## Show registry details for one OpenClaw instance. Use HOST=zap.
	@jq --arg name '$(HOST)' '.instances[] | select(.name == $$name)' $(OPENCLAW_REGISTRY)
|
||||
|
||||
# --- OpenClaw guest access (over SSH to the instance selected by HOST) ---

# Alias: runs openclaw-health and nothing else.
openclaw-status: openclaw-health ## Show VM and guest gateway health. Use HOST=zap.

# Aggregate target: libvirt domain info followed by the in-guest gateway checks.
openclaw-health: vm-info gateway-health ## Show VM and guest gateway health. Use HOST=zap.

# Two names for the same recipe: the last LOGS_TAIL journal lines of the
# gateway's systemd user unit.
openclaw-logs gateway-logs: ## Show recent OpenClaw gateway logs. Use HOST=zap LOGS_TAIL=200.
	$(REQUIRE_INSTANCE)
	ssh $(OPENCLAW_USER)@$(OPENCLAW_HOST) "journalctl --user -u openclaw-gateway.service --no-pager -n $(LOGS_TAIL)"

# Both greps are best-effort ("|| true"): a missing file or a non-match does
# not fail the target.
openclaw-version: ## Show OpenClaw service and CLI version hints. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	ssh $(OPENCLAW_USER)@$(OPENCLAW_HOST) "grep OPENCLAW_SERVICE_VERSION ~/.config/systemd/user/openclaw-gateway.service || true; \
		grep 'openclaw@' ~/.local/bin/openclaw | head -1 || true"

# Lists at most 200 regular files, searching ~/.openclaw to depth 2.
openclaw-config: ## Show guest OpenClaw config file list. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	ssh $(OPENCLAW_USER)@$(OPENCLAW_HOST) "find ~/.openclaw -maxdepth 2 -type f | sort | head -200"

# Interactive sessions.
openclaw-ssh: ## Open SSH as the OpenClaw application user. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	ssh $(OPENCLAW_USER)@$(OPENCLAW_HOST)

openclaw-root-ssh: ## Open SSH as root. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	ssh root@$(OPENCLAW_HOST)
|
||||
|
||||
# --- OpenClaw gateway (systemd user unit openclaw-gateway.service in the guest) ---

gateway-status: ## Show the OpenClaw gateway systemd user service status. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	ssh $(OPENCLAW_USER)@$(OPENCLAW_HOST) "systemctl --user status openclaw-gateway.service --no-pager"

# Runs a fixed sequence of probes in one SSH session. The commands are joined
# with ";", so the remote exit status is that of the final `uptime`.
gateway-health: ## Check gateway service, listener, HTTP status, memory, disk, and uptime. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	ssh $(OPENCLAW_USER)@$(OPENCLAW_HOST) "systemctl --user is-active openclaw-gateway.service; \
		ps aux | grep openclaw | grep -v grep || true; \
		ss -tlnp | grep -E '(openclaw|$(OPENCLAW_PORT))' || true; \
		curl -s -o /dev/null -w 'gateway_http=%{http_code}\n' http://127.0.0.1:$(OPENCLAW_PORT)/; \
		free -h; \
		df -h /; \
		uptime"

# Guarded by CONFIRM=yes. Connects as root and switches to the application user
# with `su -` before restarting the user unit, then re-runs the health checks
# in a sub-make for the same HOST.
gateway-restart: ## Restart the OpenClaw gateway user service. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	ssh root@$(OPENCLAW_HOST) "su - $(OPENCLAW_USER) -c 'systemctl --user restart openclaw-gateway.service'"
	$(MAKE) --no-print-directory gateway-health HOST=$(HOST)
|
||||
|
||||
# --- libvirt inspection (read-only; all commands connect to QEMU_URI) ---

# Lists every domain on the connection, running or not.
vm-list: ## List OpenClaw libvirt VMs.
	virsh --connect $(QEMU_URI) list --all

vm-autostart-list: ## List libvirt VMs configured to start at host boot.
	virsh --connect $(QEMU_URI) list --all --autostart

# The per-instance targets below resolve the libvirt domain name from the
# registry entry selected by HOST and exit 2 when it cannot be resolved.
vm-info: ## Show libvirt domain info. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) dominfo "$(OPENCLAW_DOMAIN)"

vm-mem: ## Show libvirt memory stats. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) dommemstat "$(OPENCLAW_DOMAIN)"

vm-disks: ## Show libvirt disk devices. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) domblklist "$(OPENCLAW_DOMAIN)"

vm-ifaces: ## Show libvirt network interfaces. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) domiflist "$(OPENCLAW_DOMAIN)"

# Leases of the libvirt network named "default"; not tied to HOST.
vm-dhcp: ## Show libvirt default network DHCP leases.
	virsh --connect $(QEMU_URI) net-dhcp-leases default
|
||||
|
||||
# --- libvirt state changes (every target here requires CONFIRM=yes) ---

# Starts the domain, then shows its info through a sub-make for the same HOST.
vm-start: ## Start an OpenClaw VM. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) start "$(OPENCLAW_DOMAIN)"
	$(MAKE) --no-print-directory vm-info HOST=$(HOST)

vm-shutdown: ## Gracefully shut down an OpenClaw VM. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) shutdown "$(OPENCLAW_DOMAIN)"

vm-reboot: ## Reboot an OpenClaw VM. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) reboot "$(OPENCLAW_DOMAIN)"

vm-autostart-enable: ## Enable host-boot autostart for one OpenClaw VM. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) autostart "$(OPENCLAW_DOMAIN)"
	$(MAKE) --no-print-directory vm-info HOST=$(HOST)

vm-autostart-disable: ## Disable host-boot autostart for one OpenClaw VM. Use HOST=orb CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) autostart --disable "$(OPENCLAW_DOMAIN)"
	$(MAKE) --no-print-directory vm-info HOST=$(HOST)

# Uses three hard-coded libvirt domain names instead of the registry: enables
# autostart for "zap [claw]" and disables it for "orb [claw]" and "sun [claw]",
# then prints the resulting autostart list.
vm-autostart-zap-only: ## Configure only zap to start at host boot. Use CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	virsh --connect $(QEMU_URI) autostart "zap [claw]"
	virsh --connect $(QEMU_URI) autostart --disable "orb [claw]"
	virsh --connect $(QEMU_URI) autostart --disable "sun [claw]"
	$(MAKE) --no-print-directory vm-autostart-list
|
||||
|
||||
# --- libvirt snapshots ---

vm-snapshot-list: ## List libvirt snapshots. Use HOST=zap.
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) snapshot-list "$(OPENCLAW_DOMAIN)"

# The snapshot name comes from the SNAPSHOT environment variable; when that is
# unset or empty it defaults to "pre-change-" plus a timestamp taken when the
# recipe runs. Guarded by CONFIRM=yes.
vm-snapshot: ## Create a libvirt snapshot. Use HOST=zap CONFIRM=yes SNAPSHOT=name.
	$(REQUIRE_CONFIRM)
	$(REQUIRE_INSTANCE)
	virsh --connect $(QEMU_URI) snapshot-create-as "$(OPENCLAW_DOMAIN)" --name "$${SNAPSHOT:-pre-change-$$(date +%Y%m%d-%H%M%S)}"
|
||||
|
||||
# --- Ansible playbooks ---
# Each target is guarded by CONFIRM=yes and runs one playbook through the
# ANSIBLE_PLAYBOOK command prefix, limited to the inventory host named by HOST.

provision: ## Provision the KVM/libvirt VM. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(ANSIBLE_PLAYBOOK) playbooks/provision-vm.yml --limit $(HOST)

install: ## Install OpenClaw in the VM. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(ANSIBLE_PLAYBOOK) playbooks/install.yml --limit $(HOST)

customize: ## Apply post-provision VM customizations. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(ANSIBLE_PLAYBOOK) playbooks/customize.yml --limit $(HOST)

deploy: ## Run the deploy playbook. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	$(ANSIBLE_PLAYBOOK) playbooks/deploy.yml --limit $(HOST)
|
||||
|
||||
# --- Backup / restore helper scripts (expected next to this Makefile) ---

# IP is read from the environment. When it is set and non-empty it is appended
# as a second, quoted argument; otherwise the script receives HOST only.
restore: ## Restore OpenClaw VM config. Use HOST=zap CONFIRM=yes and optional IP=<new-ip>.
	$(REQUIRE_CONFIRM)
	@./restore-openclaw-vm.sh "$(HOST)" $${IP:+"$${IP}"}

backup: ## Back up OpenClaw VM config. Use HOST=zap CONFIRM=yes.
	$(REQUIRE_CONFIRM)
	./backup-openclaw-vm.sh "$(HOST)"
|
||||
|
||||
# Lists the two named systemd user timers on the local machine.
timers: ## Show local user timers related to OpenClaw and LiteLLM.
	systemctl --user list-timers 'openclaw-backup.timer' 'litellm-health-check.timer'

# Each kubectl call points at swarm-kubeconfig.yaml, resolved relative to the
# directory make is run from.
kube-status: ## Show Kubernetes context, nodes, and pods using swarm-kubeconfig.yaml.
	kubectl --kubeconfig=swarm-kubeconfig.yaml config current-context
	kubectl --kubeconfig=swarm-kubeconfig.yaml get nodes -o wide
	kubectl --kubeconfig=swarm-kubeconfig.yaml get pods -A
|
||||
Reference in New Issue
Block a user