diff --git a/Makefile b/Makefile index f724f86..6ace92e 100644 --- a/Makefile +++ b/Makefile @@ -137,23 +137,23 @@ api-dedup: ## Remove duplicate LiteLLM model DB entries. api-logs: ## Follow LiteLLM logs. $(DC) logs -f --tail="$(LOGS_TAIL)" litellm litellm-db litellm-init -voice-up: ## Start all voice services. +voice-up: ## Start default voice services: NPU Whisper and Kokoro TTS. $(DC) --profile voice up -d -voice-gpu: ## Start GPU whisper server and Kokoro TTS. - $(DC) --profile voice up -d whisper-server-gpu kokoro-tts +voice-gpu: ## Start manual GPU whisper fallback and Kokoro TTS. + $(DC) --profile voice-gpu --profile voice up -d whisper-server-gpu kokoro-tts voice-cpu: ## Start CPU whisper server and Kokoro TTS. - $(DC) --profile voice up -d whisper-server kokoro-tts + $(DC) --profile voice-cpu-backup --profile voice up -d whisper-server kokoro-tts voice-down: ## Stop voice profile services. - $(DC) --profile voice down + $(DC) --profile voice --profile voice-gpu --profile voice-cpu-backup down voice-build: ## Build the custom Blackwell CUDA whisper image. - $(DC) --profile voice build whisper-server-gpu + $(DC) --profile voice-gpu build whisper-server-gpu -voice-logs: ## Follow voice service logs. - $(DC) logs -f --tail="$(LOGS_TAIL)" whisper-server-gpu whisper-server kokoro-tts +voice-logs: ## Follow default voice service logs. + $(DC) logs -f --tail="$(LOGS_TAIL)" whisper-server-npu kokoro-tts search-up: ## Start Brave Search MCP and SearXNG. 
$(DC) --profile search up -d diff --git a/docker-compose.yaml b/docker-compose.yaml index e51bc21..0a92e16 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -37,7 +37,7 @@ services: whisper-init: image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09 container_name: whisper-init - profiles: ["voice"] + profiles: ["voice", "voice-cpu-backup"] restart: "no" volumes: - whisper-models:/app/models @@ -54,17 +54,15 @@ services: fi done - # Primary whisper.cpp server: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120). - # Uses ggml-base.bin to keep the service alive while llama-server owns most of - # the laptop GPU VRAM. The previous ggml-small.bin profile needed ~465 MiB - # contiguous CUDA memory and restarted when only ~560 MiB fragmented VRAM was - # free. CPU whisper-server below remains the higher-accuracy fallback. + # Manual GPU whisper.cpp fallback: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120). + # Kept out of the normal `voice` profile because the OpenVINO NPU Whisper + # service is the default and this container consumes GPU resources. # # The official `ghcr.io/ggml-org/whisper.cpp:main-cuda` ships kernels only # for sm_75/80/86/90 and fails to init CUDA on Blackwell. We build a custom # image with `CMAKE_CUDA_ARCHITECTURES=120` from the local Dockerfile. # Build manually with: docker build -t whisper.cpp:cuda-blackwell ./whisper-cuda-blackwell - # Or `docker compose --profile voice build whisper-server-gpu`. + # Or `docker compose --profile voice-gpu build whisper-server-gpu`. whisper-server-gpu: image: whisper.cpp:cuda-blackwell build: @@ -72,7 +70,7 @@ services: dockerfile: Dockerfile container_name: whisper-server-gpu restart: unless-stopped - profiles: ["voice"] + profiles: ["voice-gpu"] ports: - "18801:8080" volumes: @@ -115,16 +113,62 @@ services: agentmon.role: "voice" agentmon.port: "18801" - # Fallback whisper.cpp server: CPU-only, medium model. 
- # Kept around for resilience — runs if the GPU server is down (driver issue, - # gemma takes all VRAM, custom image broken, etc.). Uses no GPU resources. - # ~14 s per short clip (medium-on-CPU is 90x slower than small-on-GPU above). - # Start with: docker compose --profile voice up -d whisper-server + # Experimental OpenVINO GenAI Whisper server using the Intel NPU. + # This is not whisper.cpp; it implements the same OpenAI-style + # /v1/audio/transcriptions route using OpenVINO WhisperPipeline on NPU. + # Host requirements: intel-npu-driver-bin installed, /dev/accel/accel0 present, + # and the host NPU Level Zero driver/compiler libraries mounted below. + whisper-server-npu: + image: whisper-openvino-npu:local + build: + context: ./whisper-openvino-npu + dockerfile: Dockerfile + container_name: whisper-server-npu + restart: unless-stopped + profiles: ["voice"] + ports: + - "18816:8080" + devices: + - /dev/accel/accel0:/dev/accel/accel0 + group_add: + - "987" # host render group gid on willlaptop + environment: + - WHISPER_DEVICE=NPU + - WHISPER_MODEL_DIR=/models/whisper-tiny-fp16-ov + - LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu + - ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1 + volumes: + - /home/will/.cache/openvino-models/whisper-tiny-fp16-ov:/models/whisper-tiny-fp16-ov:ro + - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:ro + - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:ro + - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so:ro + - /usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:/usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:ro + healthcheck: + test: + [ + "CMD-SHELL", + "curl -f http://localhost:8080/health >/dev/null 2>&1 || exit 1", + ] + interval: 30s + timeout: 5s + start_period: 30s + retries: 3 + labels: + agentmon.monitor: "true" + agentmon.role: "voice" + 
agentmon.port: "18816" + + # Manual fallback whisper.cpp server: CPU-only, medium model. + # Kept around for resilience — runs if the NPU/GPU servers are down. Uses no + # accelerator resources, but is slow (~14 s per short clip). + # Disabled from the normal `voice` profile now that `whisper-server-npu` is + # the trial default. Start manually with: + # docker compose --profile voice-cpu-backup up -d whisper-server whisper-server: image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09 container_name: whisper-server restart: unless-stopped - profiles: ["voice"] + profiles: ["voice-cpu-backup"] ports: - "18811:8080" volumes: diff --git a/docs/swarm-infrastructure.html b/docs/swarm-infrastructure.html index a5ca953..f01e4b4 100644 --- a/docs/swarm-infrastructure.html +++ b/docs/swarm-infrastructure.html @@ -83,7 +83,7 @@ LiteLLMLLM router + DB:18804 SearchSearXNG + Brave MCP:18803 / :18802 - VoiceKokoro + Whisper:18805 / :18811 + VoiceKokoro + Whisper:18805 / :18816 Docker servicesagentmon.monitor=trueswarm/service snapshots OpenClaw VMscurrently dormantopenclaw.snapshot Obsidian / RAG:27123/:27124 + ChromaDB diff --git a/docs/swarm-infrastructure.md b/docs/swarm-infrastructure.md index 12fb01b..0ca8ab4 100644 --- a/docs/swarm-infrastructure.md +++ b/docs/swarm-infrastructure.md @@ -34,7 +34,7 @@ local AI/search/voice services +--> llama.cpp :18806 +--> Ollama embeddings :18807 +--> Kokoro TTS :18805 - +--> Whisper :18811 + +--> Whisper NPU :18816 ``` See also: @@ -115,7 +115,7 @@ Docker services: - `searxng` — `:18803`, local metasearch - `brave-search` — `:18802`, Brave Search MCP server - `kokoro-tts` — `:18805`, local TTS -- `whisper-server` — `:18811`, local transcription +- `whisper-server-npu` — `:18816`, OpenVINO NPU local transcription - `n8n-agent` — `:18808`, automation Host/user services: diff --git a/scripts/docker-health-server.py b/scripts/docker-health-server.py index 62c2c88..b235dd7 100644 --- 
a/scripts/docker-health-server.py +++ b/scripts/docker-health-server.py @@ -24,7 +24,7 @@ CONTAINERS = [ "litellm-db", "n8n-agent", "searxng", - "whisper-server", + "whisper-server-npu", ] diff --git a/scripts/voice-memo-processor.py b/scripts/voice-memo-processor.py index 33fbf5d..ab6ed02 100644 --- a/scripts/voice-memo-processor.py +++ b/scripts/voice-memo-processor.py @@ -32,7 +32,7 @@ AUDIO_DIR = os.path.join(tempfile.gettempdir(), "voice-memo-audio") os.makedirs(AUDIO_DIR, exist_ok=True) # Service endpoints (from host perspective) -WHISPER_URL = os.environ.get("WHISPER_URL", "http://127.0.0.1:18811/v1/audio/transcriptions") +WHISPER_URL = os.environ.get("WHISPER_URL", "http://127.0.0.1:18816/v1/audio/transcriptions") LLM_URL = os.environ.get("LLM_URL", "http://127.0.0.1:18806/v1/chat/completions") KOKORO_URL = os.environ.get("KOKORO_URL", "http://127.0.0.1:18805/v1/audio/speech") diff --git a/scripts/voice-memo-service.py b/scripts/voice-memo-service.py index ccdaed0..b4599c6 100644 --- a/scripts/voice-memo-service.py +++ b/scripts/voice-memo-service.py @@ -7,7 +7,7 @@ from http.server import HTTPServer, BaseHTTPRequestHandler from pathlib import Path PORT = int(os.environ.get("VOICE_MEMO_PORT", "18813")) -WHISPER_URL = os.environ.get("WHISPER_BASE_URL", "http://127.0.0.1:18811") +WHISPER_URL = os.environ.get("WHISPER_BASE_URL", "http://127.0.0.1:18816") LLM_URL = os.environ.get("LLAMA_CPP_BASE_URL", "http://127.0.0.1:18806") KOKORO_URL = os.environ.get("KOKORO_BASE_URL", "http://127.0.0.1:18805") TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "") diff --git a/swarm-common/n8n-workflows/swarm-health-watchdog.json b/swarm-common/n8n-workflows/swarm-health-watchdog.json index 8f0fdf0..097db4b 100644 --- a/swarm-common/n8n-workflows/swarm-health-watchdog.json +++ b/swarm-common/n8n-workflows/swarm-health-watchdog.json @@ -41,7 +41,7 @@ { "parameters": { "mode": "runOnceForAllItems", - "jsCode": "const staticData = 
$getWorkflowStaticData('global');\nconst CONFIG = {\n timeoutMs: 60000,\n failureThreshold: 2,\n reminderEveryFailedRuns: 6,\n};\nconst services = [\n { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n { key: 'whisper', name: 'Whisper', port: 18811, url: 'http://172.19.0.1:18811/', ok: [200, 404], docker: 'whisper-server' },\n];\n\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\nfunction responseLike(response) {\n const status = response.statusCode || response.status;\n const body = response.body === undefined || response.body === null ? '' : response.body;\n return {\n status,\n ok: status >= 200 && status < 300,\n async text() {\n return typeof body === 'string' ? 
body : JSON.stringify(body);\n },\n async json() {\n if (typeof body === 'string') return JSON.parse(body);\n return body;\n },\n };\n}\n\nasync function fetchWithTimeout(url, options = {}, timeoutMs = CONFIG.timeoutMs) {\n const method = options.method || 'GET';\n try {\n const response = await httpRequest({\n method,\n url,\n timeout: timeoutMs,\n json: false,\n simple: false,\n resolveWithFullResponse: true,\n returnFullResponse: true,\n ignoreHttpStatusErrors: true,\n });\n return responseLike(response);\n } catch (error) {\n if (error.response) return responseLike(error.response);\n throw error;\n }\n}\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n const dhRes = await fetchWithTimeout('http://172.19.0.1:18809/health', { method: 'GET' }, 3000);\n if (dhRes.ok) {\n const dhData = await dhRes.json();\n for (const c of (dhData.containers || [])) {\n dockerHealth[c.name] = c;\n }\n }\n} catch (_) {\n // Docker health endpoint unavailable - continue without it\n}\n\nasync function check(svc) {\n const started = Date.now();\n try {\n const res = await fetchWithTimeout(svc.url, { method: 'GET' }, CONFIG.timeoutMs);\n const ms = Date.now() - started;\n const body = await res.text().catch(() => '');\n return {\n ...svc,\n healthy: svc.ok.includes(res.status),\n status: res.status,\n ms,\n detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } catch (error) {\n return {\n ...svc,\n healthy: false,\n status: 'error',\n ms: Date.now() - started,\n detail: error.message,\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n }\n}\n\nfunction suggestedFix(r) {\n const dh = r.docker;\n const dockerInfo = dh ? 
` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n if (r.healthy) {\n if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n } else {\n const failedRuns = (prev.failedRuns || 0) + 1;\n const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n }\n}\n\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n lines.push('\\u{1F6A8} Swarm Health Watchdog');\n for (const a of alerts) {\n const dh = a.docker;\n const dockerStr = dh ? 
` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n }\n}\nif (recoveries.length) {\n lines.push('\\u2705 Swarm service recovered');\n for (const r of recoveries) {\n const dh = r.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];" + "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n timeoutMs: 60000,\n failureThreshold: 2,\n reminderEveryFailedRuns: 6,\n};\nconst services = [\n { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n { key: 'whisper', name: 'Whisper NPU', port: 18816, url: 'http://172.19.0.1:18816/', ok: [200, 404], docker: 'whisper-server-npu' },\n];\n\nconst httpRequest = this.helpers.httpRequest.bind(this.helpers);\n\nfunction responseLike(response) {\n const status = 
response.statusCode || response.status;\n const body = response.body === undefined || response.body === null ? '' : response.body;\n return {\n status,\n ok: status >= 200 && status < 300,\n async text() {\n return typeof body === 'string' ? body : JSON.stringify(body);\n },\n async json() {\n if (typeof body === 'string') return JSON.parse(body);\n return body;\n },\n };\n}\n\nasync function fetchWithTimeout(url, options = {}, timeoutMs = CONFIG.timeoutMs) {\n const method = options.method || 'GET';\n try {\n const response = await httpRequest({\n method,\n url,\n timeout: timeoutMs,\n json: false,\n simple: false,\n resolveWithFullResponse: true,\n returnFullResponse: true,\n ignoreHttpStatusErrors: true,\n });\n return responseLike(response);\n } catch (error) {\n if (error.response) return responseLike(error.response);\n throw error;\n }\n}\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n const dhRes = await fetchWithTimeout('http://172.19.0.1:18809/health', { method: 'GET' }, 3000);\n if (dhRes.ok) {\n const dhData = await dhRes.json();\n for (const c of (dhData.containers || [])) {\n dockerHealth[c.name] = c;\n }\n }\n} catch (_) {\n // Docker health endpoint unavailable - continue without it\n}\n\nasync function check(svc) {\n const started = Date.now();\n try {\n const res = await fetchWithTimeout(svc.url, { method: 'GET' }, CONFIG.timeoutMs);\n const ms = Date.now() - started;\n const body = await res.text().catch(() => '');\n return {\n ...svc,\n healthy: svc.ok.includes(res.status),\n status: res.status,\n ms,\n detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } catch (error) {\n return {\n ...svc,\n healthy: false,\n status: 'error',\n ms: Date.now() - started,\n detail: error.message,\n docker: svc.docker ? 
(dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n }\n}\n\nfunction suggestedFix(r) {\n const dh = r.docker;\n const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n if (r.healthy) {\n if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n } else {\n const failedRuns = (prev.failedRuns || 0) + 1;\n const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n }\n}\n\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n lines.push('\\u{1F6A8} Swarm Health Watchdog');\n for 
(const a of alerts) {\n const dh = a.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n }\n}\nif (recoveries.length) {\n lines.push('\\u2705 Swarm service recovered');\n for (const r of recoveries) {\n const dh = r.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];" }, "type": "n8n-nodes-base.code", "typeVersion": 2, @@ -280,7 +280,7 @@ { "parameters": { "mode": "runOnceForAllItems", - "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n timeoutMs: 5000,\n failureThreshold: 2,\n reminderEveryFailedRuns: 6,\n};\nconst services = [\n { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n { key: 'whisper', name: 'Whisper', port: 18811, url: 
'http://172.19.0.1:18811/', ok: [200, 404], docker: 'whisper-server' },\n];\n\nasync function fetchWithTimeout(url, options = {}, timeoutMs = CONFIG.timeoutMs) {\n let timer;\n const timeoutPromise = new Promise((_, reject) => {\n timer = setTimeout(() => reject(new Error(`Request timed out after ${timeoutMs}ms`)), timeoutMs);\n });\n try {\n return await Promise.race([fetch(url, options), timeoutPromise]);\n } finally {\n if (timer) clearTimeout(timer);\n }\n}\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n const dhRes = await fetchWithTimeout('http://172.19.0.1:18809/health', { method: 'GET' }, 3000);\n if (dhRes.ok) {\n const dhData = await dhRes.json();\n for (const c of (dhData.containers || [])) {\n dockerHealth[c.name] = c;\n }\n }\n} catch (_) {\n // Docker health endpoint unavailable - continue without it\n}\n\nasync function check(svc) {\n const started = Date.now();\n try {\n const res = await fetchWithTimeout(svc.url, { method: 'GET' }, CONFIG.timeoutMs);\n const ms = Date.now() - started;\n const body = await res.text().catch(() => '');\n return {\n ...svc,\n healthy: svc.ok.includes(res.status),\n status: res.status,\n ms,\n detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } catch (error) {\n return {\n ...svc,\n healthy: false,\n status: 'error',\n ms: Date.now() - started,\n detail: error.message,\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n }\n}\n\nfunction suggestedFix(r) {\n const dh = r.docker;\n const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' 
+ dockerInfo;\n if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' + dockerInfo;\n if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n if (r.healthy) {\n if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n } else {\n const failedRuns = (prev.failedRuns || 0) + 1;\n const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n }\n}\n\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n lines.push('\\u{1F6A8} Swarm Health Watchdog');\n for (const a of alerts) {\n const dh = a.docker;\n const dockerStr = dh ? 
` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n }\n}\nif (recoveries.length) {\n lines.push('\\u2705 Swarm service recovered');\n for (const r of recoveries) {\n const dh = r.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}` : '';\n lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];" + "jsCode": "const staticData = $getWorkflowStaticData('global');\nconst CONFIG = {\n timeoutMs: 5000,\n failureThreshold: 2,\n reminderEveryFailedRuns: 6,\n};\nconst services = [\n { key: 'brave', name: 'Brave MCP', port: 18802, url: 'http://172.19.0.1:18802/mcp', ok: [200, 400, 404, 405, 406], docker: 'brave-search' },\n { key: 'searxng', name: 'SearXNG', port: 18803, url: 'http://172.19.0.1:18803/search?q=health&format=json', ok: [200], docker: 'searxng' },\n { key: 'litellm', name: 'LiteLLM', port: 18804, url: 'http://172.19.0.1:18804/health/liveliness', ok: [200], docker: 'litellm' },\n { key: 'kokoro', name: 'Kokoro TTS', port: 18805, url: 'http://172.19.0.1:18805/health', ok: [200], docker: 'kokoro-tts' },\n { key: 'llamacpp', name: 'llama.cpp', port: 18806, url: 'http://172.19.0.1:18806/health', ok: [200] },\n { key: 'ollama', name: 'Ollama embeddings', port: 18807, url: 'http://172.19.0.1:18807/api/version', ok: [200] },\n { key: 'n8n', name: 'n8n', port: 18808, url: 'http://127.0.0.1:5678/healthz', ok: [200], docker: 'n8n-agent' },\n { key: 'whisper', name: 'Whisper NPU', port: 18816, url: 'http://172.19.0.1:18816/', ok: [200, 404], docker: 'whisper-server-npu' },\n];\n\nasync function fetchWithTimeout(url, options = {}, timeoutMs = CONFIG.timeoutMs) {\n let timer;\n const timeoutPromise 
= new Promise((_, reject) => {\n timer = setTimeout(() => reject(new Error(`Request timed out after ${timeoutMs}ms`)), timeoutMs);\n });\n try {\n return await Promise.race([fetch(url, options), timeoutPromise]);\n } finally {\n if (timer) clearTimeout(timer);\n }\n}\n\n// Fetch Docker container health from host-side endpoint\nlet dockerHealth = {};\ntry {\n const dhRes = await fetchWithTimeout('http://172.19.0.1:18809/health', { method: 'GET' }, 3000);\n if (dhRes.ok) {\n const dhData = await dhRes.json();\n for (const c of (dhData.containers || [])) {\n dockerHealth[c.name] = c;\n }\n }\n} catch (_) {\n // Docker health endpoint unavailable - continue without it\n}\n\nasync function check(svc) {\n const started = Date.now();\n try {\n const res = await fetchWithTimeout(svc.url, { method: 'GET' }, CONFIG.timeoutMs);\n const ms = Date.now() - started;\n const body = await res.text().catch(() => '');\n return {\n ...svc,\n healthy: svc.ok.includes(res.status),\n status: res.status,\n ms,\n detail: body.slice(0, 160).replace(/\\s+/g, ' ').trim(),\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n } catch (error) {\n return {\n ...svc,\n healthy: false,\n status: 'error',\n ms: Date.now() - started,\n detail: error.message,\n docker: svc.docker ? (dockerHealth[svc.docker] || { name: svc.docker, status: 'unknown', health: 'unknown', restarts: -1 }) : null,\n };\n }\n}\n\nfunction suggestedFix(r) {\n const dh = r.docker;\n const dockerInfo = dh ? ` [docker: ${dh.status}/${dh.health} restarts=${dh.restarts}]` : '';\n if (r.key === 'llamacpp') return 'systemctl status llama-server.service; restart only if it is down.' + dockerInfo;\n if (r.key === 'ollama') return 'systemctl --user status ollama.service; verify port 18807 and nomic-embed-text.' + dockerInfo;\n if (r.key === 'n8n') return 'docker logs n8n-agent --tail 100; check database/API health.' 
+ dockerInfo;\n if (['searxng','litellm','kokoro','whisper','brave'].includes(r.key)) return `cd ~/lab/swarm && docker compose ps; inspect ${r.name} logs.${dockerInfo}`;\n return 'Check service logs and port listener.' + dockerInfo;\n}\n\nconst results = await Promise.all(services.map(check));\nconst now = new Date().toISOString();\nstaticData.services = staticData.services || {};\nconst alerts = [];\nconst recoveries = [];\nfor (const r of results) {\n const prev = staticData.services[r.key] || { failedRuns: 0, alerted: false };\n if (r.healthy) {\n if (prev.alerted) recoveries.push({ ...r, previousFailedRuns: prev.failedRuns });\n staticData.services[r.key] = { failedRuns: 0, alerted: false, lastOk: now, lastStatus: r.status, lastDetail: r.detail };\n } else {\n const failedRuns = (prev.failedRuns || 0) + 1;\n const shouldAlert = failedRuns >= CONFIG.failureThreshold && (!prev.alerted || (CONFIG.reminderEveryFailedRuns > 0 && failedRuns % CONFIG.reminderEveryFailedRuns === 0));\n staticData.services[r.key] = { failedRuns, alerted: prev.alerted || shouldAlert, lastFailure: now, lastStatus: r.status, lastDetail: r.detail };\n if (shouldAlert) alerts.push({ ...r, failedRuns, suggestedFix: suggestedFix(r) });\n }\n}\n\nif (!alerts.length && !recoveries.length) return [];\nlet lines = [];\nif (alerts.length) {\n lines.push('\\u{1F6A8} Swarm Health Watchdog');\n for (const a of alerts) {\n const dh = a.docker;\n const dockerStr = dh ? ` | docker:${dh.status}/${dh.health}/restarts=${dh.restarts}` : '';\n lines.push(`- ${a.name} :${a.port} failed ${a.failedRuns} checks; status=${a.status}${dockerStr}; detail=${a.detail || 'n/a'}; fix=${a.suggestedFix}`);\n }\n}\nif (recoveries.length) {\n lines.push('\\u2705 Swarm service recovered');\n for (const r of recoveries) {\n const dh = r.docker;\n const dockerStr = dh ? 
` | docker:${dh.status}/${dh.health}` : '';\n lines.push(`- ${r.name} :${r.port} healthy again; status=${r.status}; latency=${r.ms}ms${dockerStr}`);\n }\n}\nlines.push(`checked=${now}`);\nreturn [{ json: { text: lines.join('\\n'), alerts, recoveries, results, dockerHealth, checkedAt: now } }];" }, "type": "n8n-nodes-base.code", "typeVersion": 2, diff --git a/whisper-openvino-npu/Dockerfile b/whisper-openvino-npu/Dockerfile new file mode 100644 index 0000000..24759ae --- /dev/null +++ b/whisper-openvino-npu/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.14-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \ + ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1 + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ffmpeg libze1 ca-certificates curl \ + && rm -rf /var/lib/apt/lists/* + +RUN python -m pip install --upgrade pip \ + && python -m pip install \ + fastapi==0.126.0 \ + uvicorn[standard]==0.38.0 \ + python-multipart==0.0.22 \ + openvino==2026.2.0 \ + openvino-genai==2026.2.0.0 \ + soundfile==0.13.1 \ + numpy==2.4.6 + +WORKDIR /app +COPY server.py /app/server.py + +EXPOSE 8080 +HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ + CMD curl -fsS http://localhost:8080/health >/dev/null || exit 1 + +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/whisper-openvino-npu/server.py b/whisper-openvino-npu/server.py new file mode 100644 index 0000000..f7af663 --- /dev/null +++ b/whisper-openvino-npu/server.py @@ -0,0 +1,147 @@ +import os +import subprocess +import tempfile +import threading +import time +from pathlib import Path +from typing import Optional + +import numpy as np +import openvino as ov +import openvino_genai as ov_genai +import soundfile as sf +from fastapi import FastAPI, File, Form, UploadFile +from fastapi.responses import JSONResponse, PlainTextResponse 
# Runtime configuration, overridable through the environment.
MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
# sysfs counter sampled around each request.
# NOTE(review): presumably cumulative NPU busy time in microseconds - confirm
# against the accel driver documentation.
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")

app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")
_lock = threading.Lock()  # serializes pipeline construction + generation
_pipe = None  # lazily created by get_pipe()
_core = None  # lazily created by get_core()


def busy_us() -> Optional[int]:
    """Return the integer stored in ``BUSY_PATH``, or ``None`` if unavailable.

    The counter is best-effort telemetry: a missing/unreadable file or
    non-numeric content yields ``None`` instead of raising.
    """
    try:
        return int(BUSY_PATH.read_text().strip())
    except (OSError, ValueError):
        # OSError: file missing or unreadable; ValueError: content not an int.
        return None


def get_core():
    """Return the process-wide ``ov.Core``, creating it on first use."""
    global _core
    if _core is None:
        _core = ov.Core()
    return _core


def get_pipe():
    """Return a cached ``WhisperPipeline`` for ``MODEL_DIR`` on ``DEVICE``.

    The pipeline is built on the first call and reused afterwards.
    """
    global _pipe
    if _pipe is None:
        _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
    return _pipe


def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
    """Decode arbitrary uploaded audio to mono 16 kHz float32 using ffmpeg + soundfile.

    Args:
        upload_path: Path of the uploaded file in any format ffmpeg can read.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is a 1-D float32 array.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    # Reserve a scratch .wav path; ffmpeg overwrites it because of "-y".
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = Path(wav.name)
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-nostdin",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",
                "-i",
                str(upload_path),
                "-ac",
                "1",
                "-ar",
                "16000",
                "-f",
                "wav",
                str(wav_path),
            ],
            check=True,
        )
        audio, sr = sf.read(wav_path, dtype="float32")
        if audio.ndim > 1:
            # Defensive down-mix in case the decoded file is still multi-channel.
            audio = audio.mean(axis=1)
        return audio, int(sr)
    finally:
        wav_path.unlink(missing_ok=True)


@app.get("/")
def root():
    """Return a plain-text banner identifying the service."""
    return PlainTextResponse("OpenVINO NPU Whisper server\n")


@app.get("/health")
def health():
    """Report device and model status.

    Returns the status payload with HTTP 200 when an NPU device is
    enumerated, the same payload with HTTP 503 when it is not, and HTTP 500
    with an error description if querying OpenVINO raises.

    Status-code based probes (for example ``curl -f``) cannot see an
    ``"ok": false`` field inside a 200 response, so the unhealthy state must
    be expressed through the status code as well.
    """
    try:
        core = get_core()
        devices = core.available_devices
        has_npu = "NPU" in devices
        npu_name = core.get_property("NPU", "FULL_DEVICE_NAME") if has_npu else None
        payload = {
            "ok": has_npu,
            "device": DEVICE,
            "devices": devices,
            "npu": npu_name,
            "model_dir": str(MODEL_DIR),
            "model_exists": MODEL_DIR.exists(),
            "npu_busy_time_us": busy_us(),
        }
    except Exception as e:
        # Top-level boundary: surface any OpenVINO failure as an unhealthy response.
        return JSONResponse(status_code=500, content={"ok": False, "error": f"{type(e).__name__}: {e}"})
    if not has_npu:
        return JSONResponse(status_code=503, content=payload)
    return payload


@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: Optional[str] = Form(default=None),
    language: Optional[str] = Form(default=None),
    response_format: Optional[str] = Form(default="json"),
):
    """Transcribe an uploaded audio file with the OpenVINO Whisper pipeline.

    Args:
        file: Uploaded audio in any container/codec ffmpeg can decode.
        model: Optional label echoed back in the response; it does not select
            a model (the pipeline always loads ``MODEL_DIR``).
        language: Accepted as a form field but not forwarded to the pipeline.
        response_format: ``"text"`` returns the bare transcript as plain
            text; any other value returns the JSON payload.

    Returns:
        A plain-text transcript, a JSON object with the transcript and timing
        metadata, or a 400 JSON error when ffmpeg cannot decode the upload.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    import asyncio

    def _transcribe(path: Path) -> tuple[str, int]:
        """Decode and run inference; blocking, so it runs in a worker thread."""
        audio, sr = load_audio(path)
        # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
        # this stack: reusing one pipeline produced unstable language detection on
        # repeated short clips. Recreate per request for correctness; OpenVINO's
        # compiled-cache path keeps warm init reasonably fast.
        with _lock:
            pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
            result = pipe.generate(audio)
        return str(result).strip(), sr

    suffix = Path(file.filename or "audio").suffix or ".audio"
    # Read the upload before reserving the temp path so a failed read cannot
    # leave a stray file behind.
    data = await file.read()
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        upload_path = Path(tmp.name)

    try:
        upload_path.write_bytes(data)
        before = busy_us()
        t0 = time.perf_counter()
        try:
            # ffmpeg and NPU inference are blocking calls; running them on the
            # event loop would stall every other request (including /health)
            # for the duration of the transcription.
            text, sr = await asyncio.to_thread(_transcribe, upload_path)
        except subprocess.CalledProcessError as exc:
            return JSONResponse(
                status_code=400,
                content={"error": f"could not decode audio (ffmpeg exit code {exc.returncode})"},
            )
        elapsed = time.perf_counter() - t0
        after = busy_us()
        if response_format == "text":
            return PlainTextResponse(text)
        return {
            "text": text,
            "duration_seconds": round(elapsed, 4),
            "sample_rate": sr,
            "device": DEVICE,
            "model": model or MODEL_DIR.name,
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }
    finally:
        upload_path.unlink(missing_ok=True)