diff --git a/docker-compose.yaml b/docker-compose.yaml
index 28a7e08..e27db25 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -30,31 +30,123 @@ services:
     #   start_period: 15s
     #   retries: 3
 
-  # Optional local dependency: whisper.cpp server for audio transcription.
-  # Start with: docker compose --profile voice up -d whisper-server
-  whisper-server:
-    image: ghcr.io/ggml-org/whisper.cpp@sha256:3a39e86d5a0e911086b5cbebc9029cac71b02fbd08e217b775857de1490f55bf
-    container_name: whisper-server
+  # One-shot init: download whisper models into the shared volume if missing.
+  # The base image only ships ggml-base.en.bin; the servers below require:
+  #   - ggml-medium.bin for the CPU server
+  #   - ggml-small.bin for the GPU server (small fits in the limited VRAM left after gemma)
+  whisper-init:
+    image: ghcr.io/ggml-org/whisper.cpp:main  # NOTE(review): floating tag (was digest-pinned); re-pin once verified
+    container_name: whisper-init
+    profiles: ["voice"]
+    restart: "no"
+    volumes:
+      - whisper-models:/app/models
+    entrypoint: ["sh", "-c"]
+    command:
+      - |
+        set -e
+        for m in medium small; do
+          if [ -f /app/models/ggml-$$m.bin ]; then
+            echo "Model ggml-$$m.bin already present, skipping download."
+          else
+            echo "Downloading ggml-$$m.bin..."
+            # Remove a partial file on failure so the next run retries it.
+            sh /app/models/download-ggml-model.sh $$m /app/models \
+              || { rm -f /app/models/ggml-$$m.bin; exit 1; }
+          fi
+        done
+
+  # Primary whisper.cpp server: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120).
+  # Uses ggml-small.bin (~850 MiB VRAM) — fits alongside gemma 3 12b which runs
+  # with `--parallel 1` (frees ~900 MiB of VRAM). Benchmarked at ~150 ms per
+  # short clip, ~93x faster than the CPU server below with identical WER.
+  #
+  # The official `ghcr.io/ggml-org/whisper.cpp:main-cuda` ships kernels only
+  # for sm_75/80/86/90 and fails to init CUDA on Blackwell. We build a custom
+  # image with `CMAKE_CUDA_ARCHITECTURES=120` from the local Dockerfile.
+  # Build manually with: docker build -t whisper.cpp:cuda-blackwell ./whisper-cuda-blackwell
+  # Or `docker compose --profile voice build whisper-server-gpu`.
+  whisper-server-gpu:
+    image: whisper.cpp:cuda-blackwell
+    build:
+      context: ./whisper-cuda-blackwell
+      dockerfile: Dockerfile
+    container_name: whisper-server-gpu
     restart: unless-stopped
     profiles: ["voice"]
     ports:
       - "18801:8080"
     volumes:
       - whisper-models:/app/models
-    # Override image entrypoint so args are passed directly to whisper-server.
     entrypoint: ["whisper-server"]
     command:
       - --model
-      - /app/models/ggml-base.en.bin
+      - /app/models/ggml-small.bin
       - --host
       - 0.0.0.0
       - --port
       - "8080"
       - --convert
       - --language
-      - en
+      - auto
       - --inference-path
       - /v1/audio/transcriptions
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    depends_on:
+      whisper-init:
+        condition: service_completed_successfully
+    healthcheck:
+      test:
+        [
+          "CMD-SHELL",
+          "curl -f http://localhost:8080/ >/dev/null 2>&1 || exit 1",
+        ]
+      interval: 30s
+      timeout: 5s
+      start_period: 30s
+      retries: 3
+    labels:
+      agentmon.monitor: "true"
+      agentmon.role: "voice"
+      agentmon.port: "18801"
+
+  # Fallback whisper.cpp server: CPU-only, medium model.
+  # Kept around for resilience — runs if the GPU server is down (driver issue,
+  # gemma takes all VRAM, custom image broken, etc.). Uses no GPU resources.
+  # ~14 s per short clip (medium-on-CPU is ~93x slower than small-on-GPU above).
+  # Start with: docker compose --profile voice up -d whisper-server
+  whisper-server:
+    image: ghcr.io/ggml-org/whisper.cpp:main  # NOTE(review): floating tag (was digest-pinned); re-pin once verified
+    container_name: whisper-server
+    restart: unless-stopped
+    profiles: ["voice"]
+    ports:
+      - "18811:8080"
+    volumes:
+      - whisper-models:/app/models
+    # Override image entrypoint so args are passed directly to whisper-server.
+    entrypoint: ["whisper-server"]
+    command:
+      - --model
+      - /app/models/ggml-medium.bin
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8080"
+      - --convert
+      - --language
+      - auto
+      - --inference-path
+      - /v1/audio/transcriptions
+    depends_on:
+      whisper-init:
+        condition: service_completed_successfully
     healthcheck:
       test:
         [
@@ -68,7 +160,7 @@ services:
     labels:
       agentmon.monitor: "true"
       agentmon.role: "voice"
-      agentmon.port: "18801"
+      agentmon.port: "18811"
 
   # kokoro TTS
   kokoro-tts:
diff --git a/whisper-cuda-blackwell/Dockerfile b/whisper-cuda-blackwell/Dockerfile
new file mode 100644
index 0000000..ca06472
--- /dev/null
+++ b/whisper-cuda-blackwell/Dockerfile
@@ -0,0 +1,92 @@
+# whisper.cpp built for NVIDIA Blackwell (compute capability 12.0 / sm_120).
+#
+# Why this exists: the official `ghcr.io/ggml-org/whisper.cpp:main-cuda` image
+# only ships CUDA kernels for sm_75/80/86/90 (Turing -> Hopper) and includes
+# no PTX, so it fails to initialize CUDA on RTX 50-series Blackwell GPUs with
+# `ggml_cuda_init: failed to initialize CUDA: system has unsupported display
+# driver / cuda driver combination`.
+#
+# Build:
+#   docker build -t whisper.cpp:cuda-blackwell ./whisper-cuda-blackwell
+#
+# Override args if needed:
+#   --build-arg CUDA_ARCH=120         # set to your GPU's compute capability
+#   --build-arg WHISPER_REF=v1.7.6    # pin to a specific whisper.cpp tag
+
+ARG CUDA_VERSION=12.9.1
+ARG UBUNTU_VERSION=24.04
+
+# ---------- Build stage ----------
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder
+
+ARG CUDA_ARCH=120
+ARG WHISPER_REF=master
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    git \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+# Fail the build on a bad WHISPER_REF instead of silently cloning the default branch.
+RUN git clone --depth 1 --branch "${WHISPER_REF}" \
+    https://github.com/ggml-org/whisper.cpp.git .
+
+# Build with CUDA enabled and explicit Blackwell architecture.
+# A bare "${CUDA_ARCH}" already emits SASS + PTX under CMake's rules; the extra
+# "${CUDA_ARCH}-virtual" entry only makes the PTX (forward-JIT) intent explicit.
+#
+# Why the linker dance:
+#   `libggml-cuda.so` calls into the CUDA driver API (`cuGetErrorString`,
+#   `cuMemMap`, etc.) but doesn't declare a `DT_NEEDED` for libcuda. When the
+#   whisper-server executable links against libggml-cuda.so, ld then sees those
+#   unresolved symbols and refuses. We fix it two ways at once:
+#     1. Symlink the driver stub to /usr/local/cuda/lib64/stubs/libcuda.so.1
+#        and register the dir with ldconfig so `-lcuda` resolves.
+#     2. Force `-lcuda` onto the link line via CMAKE_*_LINKER_FLAGS, with
+#        `--no-as-needed` so ld keeps it even when ordering would normally drop it.
+RUN ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \
+ && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/cuda-stubs.conf \
+ && ldconfig \
+ && cmake -B build \
+      -DGGML_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH};${CUDA_ARCH}-virtual" \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs -Wl,--no-as-needed -lcuda -Wl,--as-needed" \
+      -DCMAKE_SHARED_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs -Wl,--no-as-needed -lcuda -Wl,--as-needed" \
+ && cmake --build build --config Release -j "$(nproc)" \
+      --target whisper-server whisper-cli whisper-bench
+
+# ---------- Runtime stage ----------
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION}
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ffmpeg \
+    curl \
+    ca-certificates \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Binaries
+COPY --from=builder /src/build/bin/whisper-server /usr/local/bin/whisper-server
+COPY --from=builder /src/build/bin/whisper-cli /usr/local/bin/whisper-cli
+COPY --from=builder /src/build/bin/whisper-bench /usr/local/bin/whisper-bench
+
+# whisper.cpp builds dynamic libs by default; copy them all.
+COPY --from=builder /src/build/src/libwhisper.so* /usr/local/lib/
+COPY --from=builder /src/build/ggml/src/libggml.so* /usr/local/lib/
+COPY --from=builder /src/build/ggml/src/libggml-base.so* /usr/local/lib/
+COPY --from=builder /src/build/ggml/src/libggml-cpu.so* /usr/local/lib/
+COPY --from=builder /src/build/ggml/src/ggml-cuda/libggml-cuda.so* /usr/local/lib/
+
+# Helpful extras: the model-download script and the JFK sample.
+COPY --from=builder /src/models/download-ggml-model.sh /app/models/download-ggml-model.sh
+COPY --from=builder /src/samples /app/samples
+
+RUN ldconfig
+
+WORKDIR /app
+EXPOSE 8080
+ENTRYPOINT ["whisper-server"]