4 Commits

Author SHA1 Message Date
William Valentin 1a674e854e feat(rag): switch Obsidian endpoint to NPU embeddings 2026-06-03 20:04:43 -07:00
William Valentin bcc652e5aa fix(rag): distinguish query and document embeddings 2026-06-03 19:51:55 -07:00
William Valentin fe4dea0f07 feat(rag): add OpenVINO NPU embeddings service 2026-06-03 18:28:16 -07:00
William Valentin 7745648a13 chore(voice): make NPU Whisper the default 2026-06-03 17:24:49 -07:00
16 changed files with 572 additions and 52 deletions
+20 -16
View File
@@ -16,7 +16,8 @@ OPENCLAW_PORT ?= 18789
QEMU_URI ?= qemu:///system
LLAMA_CPP_URL ?= http://127.0.0.1:18806
OLLAMA_URL ?= http://127.0.0.1:18807
OLLAMA_EMBED_MODEL ?= nomic-embed-text
OPENVINO_EMBED_URL ?= http://127.0.0.1:18817
OPENVINO_EMBED_MODEL ?= bge-base-en-v1.5-int8-ov
DC := $(COMPOSE) -f $(COMPOSE_FILE)
COMMON_DC := $(COMPOSE) -f $(COMMON_COMPOSE_FILE)
@@ -28,7 +29,7 @@ REQUIRE_CONFIRM = test "$(CONFIRM)" = "yes" || { echo "This target changes VM/ga
REQUIRE_INSTANCE = test -n "$(OPENCLAW_HOST)" -a -n "$(OPENCLAW_DOMAIN)" || { echo "Unknown OpenClaw HOST=$(HOST) in $(OPENCLAW_REGISTRY)"; exit 2; }
.DEFAULT_GOAL := help
.PHONY: help config ps status local-ai-health ollama-embed-health up down restart pull build logs shell clean \
.PHONY: help config ps status local-ai-health openvino-embed-health up down restart pull build logs shell clean \
api-up api-down api-restart api-init api-init-force api-health api-dedup api-logs \
voice-up voice-gpu voice-cpu voice-down voice-build voice-logs \
search-up search-down automation-up automation-down n8n-logs \
@@ -53,7 +54,7 @@ ps: ## Show root Docker Compose service status.
status: ps local-ai-health ## Show Docker service status plus host-side local AI endpoints.
local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoints.
local-ai-health: ## Check host-side llama.cpp LLM, Ollama fallback, and OpenVINO NPU embeddings endpoints.
@printf "\nHost-side local AI endpoints:\n"
@printf "llama.cpp (%s): " "$(LLAMA_CPP_URL)"; \
if curl -fsS --max-time 3 "$(LLAMA_CPP_URL)/v1/models" >/tmp/swarm-llama-models.json 2>/dev/null; then \
@@ -62,14 +63,17 @@ local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoint
printf "FAILED\n"; \
fi
@printf "ollama.service: "; systemctl --user is-active ollama.service 2>/dev/null || true
@printf "Ollama API (%s): " "$(OLLAMA_URL)"; \
@printf "Ollama fallback API (%s): " "$(OLLAMA_URL)"; \
curl -fsS --max-time 3 "$(OLLAMA_URL)/api/version" 2>/dev/null | jq -r '"OK version=" + .version' || printf "FAILED\n"
@printf "openvino-embeddings.service: "; systemctl --user is-active openvino-embeddings.service 2>/dev/null || true
@printf "OpenVINO NPU embeddings (%s): " "$(OPENVINO_EMBED_URL)"; \
curl -fsS --max-time 3 "$(OPENVINO_EMBED_URL)/healthz" 2>/dev/null | jq -r '"OK model=" + .model + " device=" + .device' || printf "FAILED\n"
ollama-embed-health: ## Smoke-test Ollama embeddings using OLLAMA_EMBED_MODEL=nomic-embed-text.
@curl -fsS --max-time 20 "$(OLLAMA_URL)/api/embed" \
openvino-embed-health: ## Smoke-test OpenVINO NPU embeddings using OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov.
@curl -fsS --max-time 20 "$(OPENVINO_EMBED_URL)/v1/embeddings" \
-H 'Content-Type: application/json' \
-d '{"model":"$(OLLAMA_EMBED_MODEL)","input":"socket check"}' \
| jq -r '"embeddings=" + ((.embeddings // []) | length | tostring) + " dim=" + (((.embeddings // [[]])[0] // []) | length | tostring)'
-d '{"model":"$(OPENVINO_EMBED_MODEL)","input":"socket check"}' \
| jq -r '"embeddings=" + ((.data // []) | length | tostring) + " dim=" + (((.data // [{embedding: []}])[0].embedding // []) | length | tostring) + " npu_busy_delta_us=" + ((.npu_busy_delta_us // 0) | tostring)'
up: ## Start root compose services. Use PROFILE=api,voice,search,automation or SERVICE=name.
@if [ -n "$(PROFILE)" ]; then \
@@ -137,23 +141,23 @@ api-dedup: ## Remove duplicate LiteLLM model DB entries.
api-logs: ## Follow LiteLLM logs.
$(DC) logs -f --tail="$(LOGS_TAIL)" litellm litellm-db litellm-init
voice-up: ## Start all voice services.
voice-up: ## Start default voice services: NPU Whisper and Kokoro TTS.
$(DC) --profile voice up -d
voice-gpu: ## Start GPU whisper server and Kokoro TTS.
$(DC) --profile voice up -d whisper-server-gpu kokoro-tts
voice-gpu: ## Start manual GPU whisper fallback and Kokoro TTS.
$(DC) --profile voice-gpu --profile voice up -d whisper-server-gpu kokoro-tts
voice-cpu: ## Start CPU whisper server and Kokoro TTS.
$(DC) --profile voice up -d whisper-server kokoro-tts
$(DC) --profile voice-cpu-backup --profile voice up -d whisper-server kokoro-tts
voice-down: ## Stop voice profile services.
$(DC) --profile voice down
$(DC) --profile voice --profile voice-gpu --profile voice-cpu-backup down
voice-build: ## Build the custom Blackwell CUDA whisper image.
$(DC) --profile voice build whisper-server-gpu
$(DC) --profile voice-gpu build whisper-server-gpu
voice-logs: ## Follow voice service logs.
$(DC) logs -f --tail="$(LOGS_TAIL)" whisper-server-gpu whisper-server kokoro-tts
voice-logs: ## Follow default voice service logs.
$(DC) logs -f --tail="$(LOGS_TAIL)" whisper-server-npu kokoro-tts
search-up: ## Start Brave Search MCP and SearXNG.
$(DC) --profile search up -d
+58 -14
View File
@@ -37,7 +37,7 @@ services:
whisper-init:
image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09
container_name: whisper-init
profiles: ["voice"]
profiles: ["voice", "voice-cpu-backup"]
restart: "no"
volumes:
- whisper-models:/app/models
@@ -54,17 +54,15 @@ services:
fi
done
# Primary whisper.cpp server: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120).
# Uses ggml-base.bin to keep the service alive while llama-server owns most of
# the laptop GPU VRAM. The previous ggml-small.bin profile needed ~465 MiB
# contiguous CUDA memory and restarted when only ~560 MiB fragmented VRAM was
# free. CPU whisper-server below remains the higher-accuracy fallback.
# Manual GPU whisper.cpp fallback: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120).
# Kept out of the normal `voice` profile because the OpenVINO NPU Whisper
# service is the default and this container consumes GPU resources.
#
# The official `ghcr.io/ggml-org/whisper.cpp:main-cuda` ships kernels only
# for sm_75/80/86/90 and fails to init CUDA on Blackwell. We build a custom
# image with `CMAKE_CUDA_ARCHITECTURES=120` from the local Dockerfile.
# Build manually with: docker build -t whisper.cpp:cuda-blackwell ./whisper-cuda-blackwell
# Or `docker compose --profile voice build whisper-server-gpu`.
# Or `docker compose --profile voice-gpu build whisper-server-gpu`.
whisper-server-gpu:
image: whisper.cpp:cuda-blackwell
build:
@@ -72,7 +70,7 @@ services:
dockerfile: Dockerfile
container_name: whisper-server-gpu
restart: unless-stopped
profiles: ["voice"]
profiles: ["voice-gpu"]
ports:
- "18801:8080"
volumes:
@@ -115,16 +113,62 @@ services:
agentmon.role: "voice"
agentmon.port: "18801"
# Fallback whisper.cpp server: CPU-only, medium model.
# Kept around for resilience — runs if the GPU server is down (driver issue,
# gemma takes all VRAM, custom image broken, etc.). Uses no GPU resources.
# ~14 s per short clip (medium-on-CPU is 90x slower than small-on-GPU above).
# Start with: docker compose --profile voice up -d whisper-server
# Experimental OpenVINO GenAI Whisper server using the Intel NPU.
# This is not whisper.cpp; it implements the same OpenAI-style
# /v1/audio/transcriptions route using OpenVINO WhisperPipeline on NPU.
# Host requirements: intel-npu-driver-bin installed, /dev/accel/accel0 present,
# and the host NPU Level Zero driver/compiler libraries mounted below.
whisper-server-npu:
image: whisper-openvino-npu:local
build:
context: ./whisper-openvino-npu
dockerfile: Dockerfile
container_name: whisper-server-npu
restart: unless-stopped
profiles: ["voice"]
ports:
- "18816:8080"
devices:
- /dev/accel/accel0:/dev/accel/accel0
group_add:
- "987" # host render group gid on willlaptop
environment:
- WHISPER_DEVICE=NPU
- WHISPER_MODEL_DIR=/models/whisper-tiny-fp16-ov
- LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
- ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1
volumes:
- /home/will/.cache/openvino-models/whisper-tiny-fp16-ov:/models/whisper-tiny-fp16-ov:ro
- /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:ro
- /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:ro
- /usr/lib/x86_64-linux-gnu/libze_intel_npu.so:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so:ro
- /usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:/usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:ro
healthcheck:
test:
[
"CMD-SHELL",
"curl -f http://localhost:8080/health >/dev/null 2>&1 || exit 1",
]
interval: 30s
timeout: 5s
start_period: 30s
retries: 3
labels:
agentmon.monitor: "true"
agentmon.role: "voice"
agentmon.port: "18816"
# Manual fallback whisper.cpp server: CPU-only, medium model.
# Kept around for resilience — runs if the NPU/GPU servers are down. Uses no
# accelerator resources, but is slow (~14 s per short clip).
# Disabled from the normal `voice` profile now that `whisper-server-npu` is
# the trial default. Start manually with:
# docker compose --profile voice-cpu-backup up -d whisper-server
whisper-server:
image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09
container_name: whisper-server
restart: unless-stopped
profiles: ["voice"]
profiles: ["voice-cpu-backup"]
ports:
- "18811:8080"
volumes:
+3 -3
View File
@@ -83,13 +83,13 @@
<!-- Local services -->
<g><rect x="965" y="165" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="165" width="210" height="80" rx="9" fill="rgba(6,78,59,.4)" stroke="#34d399" stroke-width="1.6"/><text x="1070" y="195" text-anchor="middle" class="title">LiteLLM</text><text x="1070" y="216" text-anchor="middle" class="tiny">LLM router + DB</text><text x="1070" y="234" text-anchor="middle" class="port">:18804</text></g>
<g><rect x="965" y="275" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="275" width="210" height="80" rx="9" fill="rgba(8,51,68,.4)" stroke="#22d3ee" stroke-width="1.6"/><text x="1070" y="305" text-anchor="middle" class="title">Search</text><text x="1070" y="326" text-anchor="middle" class="tiny">SearXNG + Brave MCP</text><text x="1070" y="344" text-anchor="middle" class="port">:18803 / :18802</text></g>
<g><rect x="965" y="385" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="385" width="210" height="80" rx="9" fill="rgba(8,51,68,.4)" stroke="#22d3ee" stroke-width="1.6"/><text x="1070" y="415" text-anchor="middle" class="title">Voice</text><text x="1070" y="436" text-anchor="middle" class="tiny">Kokoro + Whisper</text><text x="1070" y="454" text-anchor="middle" class="port">:18805 / :18811</text></g>
<g><rect x="965" y="385" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="385" width="210" height="80" rx="9" fill="rgba(8,51,68,.4)" stroke="#22d3ee" stroke-width="1.6"/><text x="1070" y="415" text-anchor="middle" class="title">Voice</text><text x="1070" y="436" text-anchor="middle" class="tiny">Kokoro + Whisper</text><text x="1070" y="454" text-anchor="middle" class="port">:18805 / :18816</text></g>
<g><rect x="965" y="555" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="555" width="210" height="80" rx="9" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.6"/><text x="1070" y="585" text-anchor="middle" class="title">Docker services</text><text x="1070" y="606" text-anchor="middle" class="tiny">agentmon.monitor=true</text><text x="1070" y="624" text-anchor="middle" class="port">swarm/service snapshots</text></g>
<g><rect x="965" y="665" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="665" width="210" height="80" rx="9" fill="rgba(120,53,15,.3)" stroke="#fbbf24" stroke-width="1.6"/><text x="1070" y="695" text-anchor="middle" class="title">OpenClaw VMs</text><text x="1070" y="716" text-anchor="middle" class="tiny">currently dormant</text><text x="1070" y="734" text-anchor="middle" class="port">openclaw.snapshot</text></g>
<g><rect x="965" y="775" width="210" height="60" rx="9" fill="#0f172a"/><rect x="965" y="775" width="210" height="60" rx="9" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.6"/><text x="1070" y="802" text-anchor="middle" class="title">Obsidian / RAG</text><text x="1070" y="822" text-anchor="middle" class="port">:27123/:27124 + ChromaDB</text></g>
<!-- host local ai box -->
<g><rect x="280" y="675" width="190" height="100" rx="10" fill="#0f172a"/><rect x="280" y="675" width="190" height="100" rx="10" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.8"/><text x="375" y="706" text-anchor="middle" class="title">host local AI</text><text x="375" y="730" text-anchor="middle" class="tiny">llama.cpp :18806</text><text x="375" y="752" text-anchor="middle" class="tiny">Ollama embed :18807</text></g>
<g><rect x="280" y="675" width="210" height="120" rx="10" fill="#0f172a"/><rect x="280" y="675" width="210" height="120" rx="10" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.8"/><text x="385" y="706" text-anchor="middle" class="title">host local AI</text><text x="385" y="730" text-anchor="middle" class="tiny">llama.cpp :18806</text><text x="385" y="752" text-anchor="middle" class="tiny">Ollama fallback :18807</text><text x="385" y="774" text-anchor="middle" class="tiny">OpenVINO NPU embed :18817</text></g>
<!-- legend -->
<g transform="translate(40,820)">
@@ -104,7 +104,7 @@
</div>
<div class="cards">
<div class="info"><h3>Monitoring model</h3><ul><li>• n8n direct probes critical ports</li><li>• agentmon aggregates Docker/OpenClaw snapshots</li><li>• n8n polls agentmon for stale/degraded state</li></ul></div>
<div class="info"><h3>Operational endpoints</h3><ul><li>• n8n: 127.0.0.1:18808</li><li>• agentmon query/UI: 8081 / 8082</li><li>• local LLM/embed: 18806 / 18807</li></ul></div>
<div class="info"><h3>Operational endpoints</h3><ul><li>• n8n: 127.0.0.1:18808</li><li>• agentmon query/UI: 8081 / 8082</li><li>• local LLM/embed: 18806 / 18817</li><li>• Ollama fallback: 18807</li></ul></div>
<div class="info"><h3>Source paths</h3><ul><li>• Swarm repo: ~/lab/swarm</li><li>• Agentmon repo: ~/lab/agentmon</li><li>• Workflows: swarm-common/n8n-workflows</li></ul></div>
</div>
<div class="footer">Generated as repo documentation. Open locally in a browser; no JavaScript, all SVG inline.</div>
+10 -7
View File
@@ -32,9 +32,10 @@ local AI/search/voice services
+--> SearXNG :18803
+--> Brave MCP :18802
+--> llama.cpp :18806
+--> Ollama embeddings :18807
+--> Ollama embeddings :18807 (legacy/CPU fallback)
+--> OpenVINO NPU embeddings :18817
+--> Kokoro TTS :18805
+--> Whisper :18811
+--> Whisper NPU :18816
```
See also:
@@ -115,15 +116,16 @@ Docker services:
- `searxng``:18803`, local metasearch
- `brave-search``:18802`, Brave Search MCP server
- `kokoro-tts``:18805`, local TTS
- `whisper-server``:18811`, local transcription
- `whisper-server-npu``:18816`, OpenVINO NPU local transcription
- `n8n-agent``:18808`, automation
Host/user services:
- `llama-server.service``:18806`, local llama.cpp OpenAI-compatible LLM
- `ollama.service``:18807`, embeddings API
- `ollama.service``:18807`, legacy/CPU embeddings API fallback
- `openvino-embeddings.service``:18817`, OpenVINO NPU embeddings API (`/v1/embeddings`, `/api/embed`, `/api/embeddings`)
- `docker-health-endpoint.service``:18809`, read-only container health for n8n
- `obsidian-reindex-endpoint.service``:18810`, Obsidian/RAG reindex trigger
- `obsidian-reindex-endpoint.service``:18810`, Obsidian/RAG reindex trigger; default collection `obsidian_bge_npu` using OpenVINO NPU embeddings
- `url-content-extractor.service``:18812`, YouTube/PDF/web extraction
- `voice-memo-processor.service``:18813`, voice memo processing
- `rag-embedding-health.service``:18814`, RAG/embedding health wrapper
@@ -142,8 +144,9 @@ Local REST API:
RAG/vector store:
- ChromaDB path: `~/.hermes/data/rag-search/chroma/`
- Reindex state/progress: `~/.hermes/data/rag-search/obsidian_index_state.json` and `obsidian_reindex_progress.json`
- Embeddings backend: Ollama on `:18807`, normally `nomic-embed-text`
- Reindex state/progress: active BGE/NPU state in `~/.hermes/data/rag-search/obsidian_bge_npu_index_state.json` and `obsidian_bge_npu_reindex_progress.json`; legacy Ollama state in `obsidian_index_state.json` remains for comparison/fallback.
- Active RAG query/reindex embedding backend: OpenVINO NPU embeddings service on `:18817`, currently `bge-base-en-v1.5-int8-ov`, collection `obsidian_bge_npu`.
- Legacy comparison/fallback collection: `obsidian`, built with Ollama on `:18807` using `nomic-embed-text`.
- Reindex endpoint: `POST :18810/reindex` for incremental updates, `POST :18810/reindex?full=true` for full semantic rebuilds, `GET :18810/semantic-health` to verify vectors plus a search smoke test.
## Monitoring model
+1 -1
View File
@@ -24,7 +24,7 @@ CONTAINERS = [
"litellm-db",
"n8n-agent",
"searxng",
"whisper-server",
"whisper-server-npu",
]
+23 -6
View File
@@ -26,14 +26,20 @@ from urllib.parse import parse_qs, urlparse
PORT = int(os.environ.get("PORT", 18810))
REINDEX_TIMEOUT = int(os.environ.get("REINDEX_TIMEOUT", "1800"))
RAG_COLLECTION = os.environ.get("RAG_COLLECTION", "obsidian").strip() or "obsidian"
RAG_EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "nomic-embed-text").strip() or "nomic-embed-text"
OLLAMA_BASE_URL = (os.environ.get("OLLAMA_BASE_URL") or "http://127.0.0.1:18807").rstrip("/")
REINDEX_SCRIPT = str(
Path.home()
/ ".hermes/skills/note-taking/rag-search/scripts/reindex_obsidian.sh"
)
STATE_FILE = (
Path.home() / ".hermes/data/rag-search/obsidian_index_state.json"
)
STATE_FILE = Path(
os.environ.get("RAG_STATE_FILE")
or Path.home() / ".hermes/data/rag-search" / (
"obsidian_index_state.json" if RAG_COLLECTION == "obsidian" else f"{RAG_COLLECTION}_index_state.json"
)
).expanduser()
SEARCH_SCRIPT = str(Path.home() / ".hermes/skills/note-taking/rag-search/scripts/search.py")
VENV_PYTHON = str(Path.home() / ".hermes/skills/note-taking/rag-search/venv/bin/python")
@@ -50,11 +56,16 @@ def run_reindex(full: bool = False) -> dict:
cmd = [REINDEX_SCRIPT]
if full:
cmd.append("--full")
env = os.environ.copy()
env.setdefault("RAG_COLLECTION", RAG_COLLECTION)
env.setdefault("RAG_EMBED_MODEL", RAG_EMBED_MODEL)
env.setdefault("OLLAMA_BASE_URL", OLLAMA_BASE_URL)
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=REINDEX_TIMEOUT,
env=env,
)
if result.returncode != 0:
return {
@@ -97,12 +108,16 @@ def run_semantic_search(query: str, top_k: int = 5) -> dict:
if not query:
return {"ok": False, "error": "query is required", "results": []}
top_k = max(1, min(int(top_k or 5), 20))
env = os.environ.copy()
env.setdefault("RAG_COLLECTION", RAG_COLLECTION)
env.setdefault("RAG_EMBED_MODEL", RAG_EMBED_MODEL)
env.setdefault("OLLAMA_BASE_URL", OLLAMA_BASE_URL)
result = subprocess.run(
[
VENV_PYTHON if Path(VENV_PYTHON).exists() else sys.executable,
SEARCH_SCRIPT,
"--index",
"obsidian",
RAG_COLLECTION,
"--top-k",
str(top_k),
"--raw",
@@ -111,6 +126,7 @@ def run_semantic_search(query: str, top_k: int = 5) -> dict:
capture_output=True,
text=True,
timeout=90,
env=env,
)
if result.returncode != 0:
return {
@@ -125,7 +141,7 @@ def run_semantic_search(query: str, top_k: int = 5) -> dict:
return {
"ok": True,
"query": query,
"index": payload.get("index", "obsidian"),
"index": payload.get("index", RAG_COLLECTION),
"top_k": top_k,
"result_count": len(results),
"results": results,
@@ -144,7 +160,8 @@ def semantic_health() -> dict:
"note_count",
"vector_count",
"collection",
"chroma_path",
"embedding_backend",
"embedding_model",
"last_full_index",
"last_incremental_index",
)
+236
View File
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""OpenVINO GenAI embedding HTTP service for Will's local swarm stack.
Default port: 18817
Default model: OpenVINO/bge-base-en-v1.5-int8-ov, cached under ~/.cache/openvino-models/
Default device: NPU
Exposes a deliberately small compatibility surface:
GET /healthz # also served at / and /readyz
GET /api/tags # Ollama-ish model listing for health scripts
GET /v1/models # OpenAI-ish model listing
POST /api/embed # Ollama-ish batched embeddings
POST /api/embeddings # Ollama-ish single embedding
POST /v1/embeddings # OpenAI-compatible embeddings response
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
from typing import Any
import openvino as ov
import openvino_genai as ovg
DEFAULT_MODEL_NAME = "bge-base-en-v1.5-int8-ov"
DEFAULT_MODEL_DIR = Path.home() / ".cache/openvino-models" / DEFAULT_MODEL_NAME
DEFAULT_PORT = 18817
NPU_BUSY_FILE = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
def npu_busy_time_us() -> int | None:
try:
return int(NPU_BUSY_FILE.read_text().strip())
except Exception:
return None
class EmbeddingService:
    """Own one OpenVINO GenAI ``TextEmbeddingPipeline`` and serialize calls to it.

    The pipeline is configured with a static shape (``batch_size=1``, padded to
    ``max_length``), so every embedding request is served one text at a time.
    """

    def __init__(self, model_dir: Path, model_name: str, device: str, max_length: int) -> None:
        """Validate the model directory and device, then build the pipeline.

        Raises:
            FileNotFoundError: ``model_dir`` does not exist.
            RuntimeError: ``device`` is not among OpenVINO's available devices.
        """
        self.model_dir = model_dir
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        self.loaded_at = time.time()
        self.lock = threading.Lock()
        # Unknown until the first embedding has been produced.
        self.embedding_dim: int | None = None

        if not self.model_dir.exists():
            raise FileNotFoundError(f"model directory not found: {self.model_dir}")

        core = ov.Core()
        self.available_devices = list(core.available_devices)
        if self.device not in self.available_devices:
            raise RuntimeError(f"OpenVINO device {self.device!r} unavailable; available={self.available_devices}")

        # Intel NPU currently needs static shape for this embedding pipeline.
        # batch_size=1 is intentional: multi-input requests are served by looping
        # one text at a time, keeping the model shape acceptable to NPUW.
        cfg = ovg.TextEmbeddingPipeline.Config()
        cfg.max_length = int(max_length)
        cfg.pad_to_max_length = True
        cfg.batch_size = 1
        self.pipeline = ovg.TextEmbeddingPipeline(self.model_dir, self.device, cfg)

    def embed_one(self, text: str, *, purpose: str = "query") -> dict[str, Any]:
        """Embed a single text as either a query or a document.

        Args:
            text: Text to embed; must contain non-whitespace characters.
            purpose: ``"query"`` or ``"document"``; selects the pipeline call.

        Returns:
            A dict with ``embedding``, ``dim``, ``purpose``, ``duration_ms``
            (wall time including any wait for the lock) and
            ``npu_busy_delta_us`` (``None`` when the counter is unavailable).

        Raises:
            ValueError: ``text`` is empty/blank or ``purpose`` is unknown.
        """
        text = str(text or "")
        if not text.strip():
            raise ValueError("embedding input text is empty")
        if purpose not in {"query", "document"}:
            raise ValueError("embedding purpose must be 'query' or 'document'")

        started = time.perf_counter()
        # TextEmbeddingPipeline is a native object; serialize calls until proven
        # safe under concurrent NPU use.
        with self.lock:
            # Sample the busy counter while holding the lock so the delta is
            # not inflated by other requests' inference on this service.
            before = npu_busy_time_us()
            if purpose == "document":
                # batch_size=1 means embed_documents must receive exactly one doc.
                vec = self.pipeline.embed_documents([text])[0]
            else:
                vec = self.pipeline.embed_query(text)
            after = npu_busy_time_us()

        vector = [float(x) for x in vec]
        self.embedding_dim = len(vector)
        return {
            "embedding": vector,
            "dim": len(vector),
            "purpose": purpose,
            "duration_ms": round((time.perf_counter() - started) * 1000, 3),
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }

    def health(self) -> dict[str, Any]:
        """Return a JSON-serializable status snapshot of the service."""
        return {
            "status": "ok",
            "service": "openvino-embeddings",
            "model": self.model_name,
            "model_dir": str(self.model_dir),
            "device": self.device,
            "available_devices": self.available_devices,
            "embedding_dim": self.embedding_dim,
            "max_length": self.max_length,
            "uptime_s": round(time.time() - self.loaded_at, 3),
            "npu_busy_time_us": npu_busy_time_us(),
        }
def normalize_input(value: Any) -> list[str]:
    """Coerce an embeddings ``input`` field into a non-empty list of strings.

    Args:
        value: Either a single string or a list of strings.

    Returns:
        ``[value]`` for a string, or a copy of the list when every item is a
        string.

    Raises:
        ValueError: ``value`` is neither a string nor a non-empty list of
            strings. Non-string items are rejected rather than stringified so
            that e.g. ``None`` is never embedded as the text ``"None"``.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
        return list(value)
    raise ValueError("input must be a non-empty string or list of strings")
class Handler(BaseHTTPRequestHandler):
    """Serve health, model-listing and embedding routes as JSON over HTTP.

    The embedding service is read from ``self.server.embedding_service``.
    While handling a POST, ``ValueError`` becomes a 400 response and any other
    exception becomes a 500 response.
    """

    server_version = "OpenVINOEmbeddings/0.1"

    @property
    def svc(self) -> "EmbeddingService":
        """Return the service object attached to the HTTP server."""
        return self.server.embedding_service  # type: ignore[attr-defined]

    def _route(self) -> str:
        """Return the request path without query string or trailing slash."""
        return self.path.split("?", 1)[0].rstrip("/") or "/"

    def _embed_batch(self, payload: dict[str, Any], default_purpose: str) -> tuple[str, list[dict[str, Any]]]:
        """Embed every text in ``payload["input"]``; return (purpose, results)."""
        texts = normalize_input(payload.get("input"))
        purpose = str(payload.get("purpose") or payload.get("task") or default_purpose)
        results = [self.svc.embed_one(text, purpose=purpose) for text in texts]
        return purpose, results

    def do_GET(self) -> None:
        """Answer health and model-listing requests; 404 for anything else."""
        path = self._route()
        if path in {"/", "/healthz", "/readyz"}:
            self.write_json(self.svc.health())
        elif path == "/api/tags":
            self.write_json({"models": [{"name": self.svc.model_name, "model": self.svc.model_name}]})
        elif path == "/v1/models":
            self.write_json({"object": "list", "data": [{"id": self.svc.model_name, "object": "model", "owned_by": "local"}]})
        else:
            self.write_json({"error": "not found"}, status=404)

    def do_POST(self) -> None:
        """Dispatch the three embedding routes.

        ``/api/embed`` defaults to document embeddings, ``/v1/embeddings`` to
        query embeddings, and ``/api/embeddings`` always embeds as a query.
        """
        path = self._route()
        try:
            payload = self.read_json()
            if path == "/api/embed":
                purpose, results = self._embed_batch(payload, "document")
                self.write_json({
                    "model": payload.get("model") or self.svc.model_name,
                    "embeddings": [item["embedding"] for item in results],
                    "embedding_dim": results[0]["dim"] if results else None,
                    "purpose": purpose,
                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
                    "durations_ms": [item["duration_ms"] for item in results],
                })
            elif path == "/api/embeddings":
                text = payload.get("prompt") or payload.get("input")
                result = self.svc.embed_one(str(text or ""), purpose="query")
                self.write_json({
                    "model": payload.get("model") or self.svc.model_name,
                    "embedding": result["embedding"],
                    "embedding_dim": result["dim"],
                    "npu_busy_delta_us": result["npu_busy_delta_us"],
                    "duration_ms": result["duration_ms"],
                })
            elif path == "/v1/embeddings":
                purpose, results = self._embed_batch(payload, "query")
                self.write_json({
                    "object": "list",
                    "model": payload.get("model") or self.svc.model_name,
                    "data": [
                        {"object": "embedding", "index": idx, "embedding": item["embedding"]}
                        for idx, item in enumerate(results)
                    ],
                    "usage": {"prompt_tokens": 0, "total_tokens": 0},
                    "embedding_dim": results[0]["dim"] if results else None,
                    "purpose": purpose,
                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
                    "durations_ms": [item["duration_ms"] for item in results],
                })
            else:
                self.write_json({"error": "not found"}, status=404)
        except ValueError as exc:
            # Includes json.JSONDecodeError and input-validation failures.
            self.write_json({"error": str(exc)}, status=400)
        except Exception as exc:
            # Top-level request boundary: report instead of dropping the connection.
            self.write_json({"error": f"{type(exc).__name__}: {exc}"}, status=500)

    def read_json(self) -> dict[str, Any]:
        """Read the request body and parse it as a JSON object.

        A missing or zero ``Content-Length`` is treated as an empty object.

        Raises:
            ValueError: ``Content-Length`` is not a non-negative integer, the
                body is not valid JSON, or the JSON value is not an object.
        """
        length = int(self.headers.get("Content-Length") or 0)
        if length < 0:
            # read(-1) would block until the client closes the connection.
            raise ValueError("Content-Length must not be negative")
        body = self.rfile.read(length).decode("utf-8", "replace") if length else "{}"
        payload = json.loads(body or "{}")
        if not isinstance(payload, dict):
            raise ValueError("JSON body must be an object")
        return payload

    def write_json(self, payload: dict[str, Any], status: int = 200) -> None:
        """Send ``payload`` as a UTF-8 JSON response with the given status."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002 - stdlib override name
        """Write access-log lines to stderr, flushed immediately."""
        print(f"{self.address_string()} - {format % args}", file=sys.stderr, flush=True)
def main() -> int:
    """Parse CLI/environment settings, load the model and serve until interrupted.

    Every option defaults to its ``OPENVINO_EMBED_*`` environment variable and
    then to a built-in default.

    Returns:
        ``0`` after the server stops on ``KeyboardInterrupt``.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the default bind address is all interfaces and the handler
    # has no authentication; confirm this exposure is intended.
    parser.add_argument("--host", default=os.environ.get("OPENVINO_EMBED_HOST", "0.0.0.0"))
    parser.add_argument("--port", type=int, default=int(os.environ.get("OPENVINO_EMBED_PORT", DEFAULT_PORT)))
    parser.add_argument("--model-dir", default=os.environ.get("OPENVINO_EMBED_MODEL_DIR", str(DEFAULT_MODEL_DIR)))
    parser.add_argument("--model-name", default=os.environ.get("OPENVINO_EMBED_MODEL", DEFAULT_MODEL_NAME))
    parser.add_argument("--device", default=os.environ.get("OPENVINO_EMBED_DEVICE", "NPU"))
    parser.add_argument("--max-length", type=int, default=int(os.environ.get("OPENVINO_EMBED_MAX_LENGTH", "512")))
    args = parser.parse_args()

    # Build the service first so a missing model/device fails before binding.
    service = EmbeddingService(Path(args.model_dir).expanduser(), args.model_name, args.device, args.max_length)

    # The context manager closes the listening socket on every exit path.
    with ThreadingHTTPServer((args.host, args.port), Handler) as httpd:
        httpd.embedding_service = service  # type: ignore[attr-defined]
        print(
            f"openvino-embeddings listening on {args.host}:{args.port} "
            f"model={args.model_name} device={args.device}",
            flush=True,
        )
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            pass
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+2 -1
View File
@@ -51,7 +51,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
env = os.environ.copy()
env.setdefault("HERMES_HOME", "/home/will/.hermes")
env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807")
env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18817")
env.setdefault("RAG_EMBED_MODEL", "bge-base-en-v1.5-int8-ov")
env.setdefault("N8N_URL", "http://127.0.0.1:18808")
env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810")
+1 -1
View File
@@ -32,7 +32,7 @@ AUDIO_DIR = os.path.join(tempfile.gettempdir(), "voice-memo-audio")
os.makedirs(AUDIO_DIR, exist_ok=True)
# Service endpoints (from host perspective)
WHISPER_URL = os.environ.get("WHISPER_URL", "http://127.0.0.1:18811/v1/audio/transcriptions")
WHISPER_URL = os.environ.get("WHISPER_URL", "http://127.0.0.1:18816/v1/audio/transcriptions")
LLM_URL = os.environ.get("LLM_URL", "http://127.0.0.1:18806/v1/chat/completions")
KOKORO_URL = os.environ.get("KOKORO_URL", "http://127.0.0.1:18805/v1/audio/speech")
+1 -1
View File
@@ -7,7 +7,7 @@ from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path
PORT = int(os.environ.get("VOICE_MEMO_PORT", "18813"))
WHISPER_URL = os.environ.get("WHISPER_BASE_URL", "http://127.0.0.1:18811")
WHISPER_URL = os.environ.get("WHISPER_BASE_URL", "http://127.0.0.1:18816")
LLM_URL = os.environ.get("LLAMA_CPP_BASE_URL", "http://127.0.0.1:18806")
KOKORO_URL = os.environ.get("KOKORO_BASE_URL", "http://127.0.0.1:18805")
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
File diff suppressed because one or more lines are too long
@@ -0,0 +1,16 @@
[Unit]
Description=Obsidian Vault Reindex Endpoint
After=network.target
[Service]
Type=simple
ExecStart=/usr/bin/python3 /home/will/lab/swarm/scripts/obsidian-reindex-server.py
Restart=on-failure
RestartSec=5
Environment=PORT=18810
Environment=RAG_COLLECTION=obsidian_bge_npu
Environment=RAG_EMBED_MODEL=bge-base-en-v1.5-int8-ov
Environment=OLLAMA_BASE_URL=http://127.0.0.1:18817
[Install]
WantedBy=default.target
+19
View File
@@ -0,0 +1,19 @@
[Unit]
Description=OpenVINO NPU Embeddings HTTP Service (port 18817)
After=network.target
[Service]
Type=simple
WorkingDirectory=/home/will/lab/swarm
ExecStart=/home/will/.venvs/npu/bin/python /home/will/lab/swarm/scripts/openvino-embeddings-server.py
Restart=on-failure
RestartSec=5
Environment=OPENVINO_EMBED_PORT=18817
Environment=OPENVINO_EMBED_HOST=0.0.0.0
Environment=OPENVINO_EMBED_DEVICE=NPU
Environment=OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov
Environment=OPENVINO_EMBED_MODEL_DIR=/home/will/.cache/openvino-models/bge-base-en-v1.5-int8-ov
Environment=OPENVINO_EMBED_MAX_LENGTH=512
[Install]
WantedBy=default.target
@@ -9,6 +9,8 @@ Restart=on-failure
RestartSec=5
Environment=PORT=18814
Environment=RAG_HEALTH_TIMEOUT=180
Environment=OLLAMA_BASE_URL=http://127.0.0.1:18817
Environment=RAG_EMBED_MODEL=bge-base-en-v1.5-int8-ov
[Install]
WantedBy=default.target
+31
View File
@@ -0,0 +1,31 @@
# Image for the OpenVINO Whisper HTTP server (server.py), served by uvicorn on port 8080.
FROM python:3.14-slim
# LD_LIBRARY_PATH / ZE_ENABLE_ALT_DRIVERS point at the Intel NPU Level Zero
# driver library path; the library itself is not installed by this image
# (presumably bind-mounted from the host at runtime — confirm against compose).
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1
# ffmpeg: server.py shells out to it to decode uploads. curl: used by HEALTHCHECK below.
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ffmpeg libze1 ca-certificates curl \
&& rm -rf /var/lib/apt/lists/*
# Python dependencies are pinned to exact versions.
RUN python -m pip install --upgrade pip \
&& python -m pip install \
fastapi==0.126.0 \
uvicorn[standard]==0.38.0 \
python-multipart==0.0.22 \
openvino==2026.2.0 \
openvino-genai==2026.2.0.0 \
soundfile==0.13.1 \
numpy==2.4.6
WORKDIR /app
COPY server.py /app/server.py
EXPOSE 8080
# curl -f fails only on HTTP status >= 400 from /health.
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
CMD curl -fsS http://localhost:8080/health >/dev/null || exit 1
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"]
+147
View File
@@ -0,0 +1,147 @@
import os
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from typing import Optional
import numpy as np
import openvino as ov
import openvino_genai as ov_genai
import soundfile as sf
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse, PlainTextResponse
MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")
_lock = threading.Lock()
_pipe = None
_core = None
def busy_us(path: Optional[Path] = None) -> Optional[int]:
    """Read the NPU busy-time counter (microseconds, going by the sysfs file name).

    Args:
        path: Counter file to read. Defaults to ``BUSY_PATH``.

    Returns:
        The integer stored in the file, or ``None`` when the file cannot be
        read or does not contain an integer.
    """
    counter_file = BUSY_PATH if path is None else path
    try:
        return int(counter_file.read_text().strip())
    except (OSError, ValueError):
        # Missing/unreadable file or non-numeric content: counter unavailable.
        return None
def get_core():
    """Return the module-wide ``ov.Core``, creating it on first use."""
    global _core
    if _core is not None:
        return _core
    _core = ov.Core()
    return _core
def get_pipe():
    """Return the module-wide ``WhisperPipeline``, building it on first use."""
    global _pipe
    if _pipe is not None:
        return _pipe
    _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
    return _pipe
def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
    """Decode arbitrary uploaded audio to mono 16 kHz float32 using ffmpeg + soundfile.

    Args:
        upload_path: Path of the uploaded audio file to decode.

    Returns:
        ``(samples, sample_rate)`` as read back from the WAV file that ffmpeg
        wrote.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    # Reserve a temp filename for ffmpeg's output; ffmpeg overwrites it (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = Path(wav.name)
    ffmpeg_cmd = [
        "ffmpeg",
        "-nostdin",
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        str(upload_path),
        "-ac",
        "1",
        "-ar",
        "16000",
        "-f",
        "wav",
        str(wav_path),
    ]
    try:
        subprocess.run(ffmpeg_cmd, check=True)
        audio, sr = sf.read(wav_path, dtype="float32")
        if audio.ndim > 1:
            # Defensive downmix in case the decoded data is still multi-channel.
            audio = audio.mean(axis=1)
        return audio, int(sr)
    finally:
        # Always remove the intermediate WAV, whether or not decoding succeeded.
        wav_path.unlink(missing_ok=True)
@app.get("/")
def root():
    """Return a plain-text banner identifying this server."""
    banner = "OpenVINO NPU Whisper server\n"
    return PlainTextResponse(banner)
@app.get("/health")
def health():
    """Report NPU and model availability.

    Returns:
        The status dict with HTTP 200 when the NPU is listed by OpenVINO and
        the model directory exists; the same dict with HTTP 503 otherwise, so
        status-code based probes (``curl -f``) can detect an unusable server.
        Any exception while probing yields HTTP 500 with an error message.
    """
    try:
        core = get_core()
        devices = list(core.available_devices)
        npu_present = "NPU" in devices
        npu_name = core.get_property("NPU", "FULL_DEVICE_NAME") if npu_present else None
        model_exists = MODEL_DIR.exists()
        healthy = npu_present and model_exists
        status = {
            "ok": healthy,
            "device": DEVICE,
            "devices": devices,
            "npu": npu_name,
            "model_dir": str(MODEL_DIR),
            "model_exists": model_exists,
            "npu_busy_time_us": busy_us(),
        }
        if healthy:
            return status
        return JSONResponse(status_code=503, content=status)
    except Exception as e:
        # Health boundary: surface the failure to the probe instead of crashing.
        return JSONResponse(status_code=500, content={"ok": False, "error": f"{type(e).__name__}: {e}"})
def _transcribe_blocking(upload_path: Path) -> tuple[str, int, Optional[int]]:
    """Decode and transcribe one file; return (text, sample_rate, npu_busy_delta_us).

    This function blocks (ffmpeg subprocess and Whisper inference) and is meant
    to run on a worker thread.
    """
    audio, sr = load_audio(upload_path)
    # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
    # this stack: reusing one pipeline produced unstable language detection on
    # repeated short clips. Recreate per request for correctness; OpenVINO's
    # compiled-cache path keeps warm init reasonably fast.
    with _lock:
        # Sample the counter under the lock so the delta is not inflated by
        # other requests' inference on this server.
        before = busy_us()
        pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
        result = pipe.generate(audio)
        after = busy_us()
    delta = None if before is None or after is None else after - before
    return str(result).strip(), sr, delta


@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: Optional[str] = Form(default=None),
    language: Optional[str] = Form(default=None),
    response_format: Optional[str] = Form(default="json"),
):
    """Transcribe an uploaded audio file (OpenAI-style transcription route).

    Args:
        file: Uploaded audio in any format ffmpeg can decode.
        model: Echoed back in the response; defaults to the model directory name.
        language: Accepted but not used by this implementation.
        response_format: ``"text"`` returns plain text; anything else returns JSON.

    Returns:
        Plain text, or a dict with ``text``, ``duration_seconds``,
        ``sample_rate``, ``device``, ``model`` and ``npu_busy_delta_us``.
    """
    import asyncio  # function-scope so the file's import block stays untouched

    suffix = Path(file.filename or "audio").suffix or ".audio"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        upload_path = Path(tmp.name)
    try:
        # Written inside the try so a failed upload read/write cannot leak the temp file.
        upload_path.write_bytes(await file.read())
        t0 = time.perf_counter()
        # Run the blocking decode + inference off the event loop so other
        # requests (notably /health) stay responsive during a transcription.
        text, sr, busy_delta = await asyncio.to_thread(_transcribe_blocking, upload_path)
        elapsed = time.perf_counter() - t0
        if response_format == "text":
            return PlainTextResponse(text)
        return {
            "text": text,
            "duration_seconds": round(elapsed, 4),
            "sample_rate": sr,
            "device": DEVICE,
            "model": model or MODEL_DIR.name,
            "npu_busy_delta_us": busy_delta,
        }
    finally:
        upload_path.unlink(missing_ok=True)