diff --git a/docker-compose.yaml b/docker-compose.yaml
index 1c6dcc6..e51bc21 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -45,7 +45,7 @@ services:
     command:
       - |
         set -e
-        for m in medium small; do
+        for m in medium small base; do
           if [ -f /app/models/ggml-$$m.bin ]; then
             echo "Model ggml-$$m.bin already present, skipping download."
           else
@@ -55,9 +55,10 @@ services:
         done
 
   # Primary whisper.cpp server: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120).
-  # Uses ggml-small.bin (~850 MiB VRAM) — fits alongside gemma 3 12b which runs
-  # with `--parallel 1` (frees ~900 MiB of VRAM). Benchmarked at ~150 ms per
-  # short clip, ~93x faster than the CPU server below with identical WER.
+  # Uses ggml-base.bin to keep the service alive while llama-server owns most of
+  # the laptop GPU VRAM. The previous ggml-small.bin profile needed ~465 MiB
+  # contiguous CUDA memory and restarted when only ~560 MiB fragmented VRAM was
+  # free. CPU whisper-server below remains the higher-accuracy fallback.
   #
   # The official `ghcr.io/ggml-org/whisper.cpp:main-cuda` ships kernels only
   # for sm_75/80/86/90 and fails to init CUDA on Blackwell. We build a custom
@@ -79,7 +80,7 @@ services:
     entrypoint: ["whisper-server"]
     command:
       - --model
-      - /app/models/ggml-small.bin
+      - /app/models/ggml-base.bin
       - --host
       - 0.0.0.0
       - --port
@@ -224,7 +225,7 @@ services:
   # Optional local dependency: liteLLM proxy for unified LLM API.
   # Start with: docker compose --profile api up -d litellm
   litellm:
-    image: litellm/litellm:v1.82.3-stable.patch.2
+    image: litellm/litellm:v1.83.7-stable
     container_name: litellm
     restart: unless-stopped
     profiles: ["api"]
@@ -309,7 +310,7 @@ services:
   # Dedicated local n8n instance for agent-oriented workflows.
   # Start with: docker compose --profile automation up -d n8n-agent
   n8n-agent:
-    image: docker.n8n.io/n8nio/n8n:2.11.3
+    image: docker.n8n.io/n8nio/n8n:2.22.1
     container_name: n8n-agent
     restart: unless-stopped
     profiles: ["automation"]