feat(rag): switch Obsidian endpoint to NPU embeddings

fix(rag): distinguish query and document embeddings
feat(rag): add OpenVINO NPU embeddings service
2026-06-03 20:04:43 -07:00 · 2026-06-03 19:51:55 -07:00 · 2026-06-03 18:28:16 -07:00 · 2026-06-03 17:24:49 -07:00
16 changed files with 572 additions and 52 deletions
@@ -16,7 +16,8 @@ OPENCLAW_PORT ?= 18789
 QEMU_URI ?= qemu:///system
 LLAMA_CPP_URL ?= http://127.0.0.1:18806
 OLLAMA_URL ?= http://127.0.0.1:18807
-OLLAMA_EMBED_MODEL ?= nomic-embed-text
+OPENVINO_EMBED_URL ?= http://127.0.0.1:18817
 OPENVINO_EMBED_MODEL ?= bge-base-en-v1.5-int8-ov
 DC := $(COMPOSE) -f $(COMPOSE_FILE)
 COMMON_DC := $(COMPOSE) -f $(COMMON_COMPOSE_FILE)
@@ -28,7 +29,7 @@ REQUIRE_CONFIRM = test "$(CONFIRM)" = "yes" || { echo "This target changes VM/ga
 REQUIRE_INSTANCE = test -n "$(OPENCLAW_HOST)" -a -n "$(OPENCLAW_DOMAIN)" || { echo "Unknown OpenClaw HOST=$(HOST) in $(OPENCLAW_REGISTRY)"; exit 2; }
 .DEFAULT_GOAL := help
-.PHONY: help config ps status local-ai-health ollama-embed-health up down restart pull build logs shell clean \
+.PHONY: help config ps status local-ai-health openvino-embed-health up down restart pull build logs shell clean \
 	api-up api-down api-restart api-init api-init-force api-health api-dedup api-logs \
 	voice-up voice-gpu voice-cpu voice-down voice-build voice-logs \
 	search-up search-down automation-up automation-down n8n-logs \
@@ -53,7 +54,7 @@ ps: ## Show root Docker Compose service status.
 status: ps local-ai-health ## Show Docker service status plus host-side local AI endpoints.
-local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoints.
+local-ai-health: ## Check host-side llama.cpp LLM, Ollama fallback, and OpenVINO NPU embeddings endpoints.
 	@printf "\nHost-side local AI endpoints:\n"
 	@printf "llama.cpp (%s): " "$(LLAMA_CPP_URL)"; \
 		if curl -fsS --max-time 3 "$(LLAMA_CPP_URL)/v1/models" >/tmp/swarm-llama-models.json 2>/dev/null; then \
@@ -62,14 +63,17 @@ local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoint
 			printf "FAILED\n"; \
 		fi
 	@printf "ollama.service: "; systemctl --user is-active ollama.service 2>/dev/null || true
-	@printf "Ollama API (%s): " "$(OLLAMA_URL)"; \
+	@printf "Ollama fallback API (%s): " "$(OLLAMA_URL)"; \
 		curl -fsS --max-time 3 "$(OLLAMA_URL)/api/version" 2>/dev/null | jq -r '"OK version=" + .version' || printf "FAILED\n"
 	@printf "openvino-embeddings.service: "; systemctl --user is-active openvino-embeddings.service 2>/dev/null || true
 	@printf "OpenVINO NPU embeddings (%s): " "$(OPENVINO_EMBED_URL)"; \
 		curl -fsS --max-time 3 "$(OPENVINO_EMBED_URL)/healthz" 2>/dev/null | jq -r '"OK model=" + .model + " device=" + .device' || printf "FAILED\n"
-ollama-embed-health: ## Smoke-test Ollama embeddings using OLLAMA_EMBED_MODEL=nomic-embed-text.
+openvino-embed-health: ## Smoke-test OpenVINO NPU embeddings using OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov.
-	@curl -fsS --max-time 20 "$(OLLAMA_URL)/api/embed" \
+	@curl -fsS --max-time 20 "$(OPENVINO_EMBED_URL)/v1/embeddings" \
 		-H 'Content-Type: application/json' \
-		-d '{"model":"$(OLLAMA_EMBED_MODEL)","input":"socket check"}' \
+		-d '{"model":"$(OPENVINO_EMBED_MODEL)","input":"socket check"}' \
-		| jq -r '"embeddings=" + ((.embeddings // []) | length | tostring) + " dim=" + (((.embeddings // [[]])[0] // []) | length | tostring)'
+		| jq -r '"embeddings=" + ((.data // []) | length | tostring) + " dim=" + (((.data // [{embedding: []}])[0].embedding // []) | length | tostring) + " npu_busy_delta_us=" + ((.npu_busy_delta_us // 0) | tostring)'
 up: ## Start root compose services. Use PROFILE=api,voice,search,automation or SERVICE=name.
 	@if [ -n "$(PROFILE)" ]; then \
@@ -137,23 +141,23 @@ api-dedup: ## Remove duplicate LiteLLM model DB entries.
 api-logs: ## Follow LiteLLM logs.
 	$(DC) logs -f --tail="$(LOGS_TAIL)" litellm litellm-db litellm-init
-voice-up: ## Start all voice services.
+voice-up: ## Start default voice services: NPU Whisper and Kokoro TTS.
 	$(DC) --profile voice up -d
-voice-gpu: ## Start GPU whisper server and Kokoro TTS.
+voice-gpu: ## Start manual GPU whisper fallback and Kokoro TTS.
-	$(DC) --profile voice up -d whisper-server-gpu kokoro-tts
+	$(DC) --profile voice-gpu --profile voice up -d whisper-server-gpu kokoro-tts
 voice-cpu: ## Start CPU whisper server and Kokoro TTS.
-	$(DC) --profile voice up -d whisper-server kokoro-tts
+	$(DC) --profile voice-cpu-backup --profile voice up -d whisper-server kokoro-tts
 voice-down: ## Stop voice profile services.
-	$(DC) --profile voice down
+	$(DC) --profile voice --profile voice-gpu --profile voice-cpu-backup down
 voice-build: ## Build the custom Blackwell CUDA whisper image.
-	$(DC) --profile voice build whisper-server-gpu
+	$(DC) --profile voice-gpu build whisper-server-gpu
-voice-logs: ## Follow voice service logs.
+voice-logs: ## Follow default voice service logs.
-	$(DC) logs -f --tail="$(LOGS_TAIL)" whisper-server-gpu whisper-server kokoro-tts
+	$(DC) logs -f --tail="$(LOGS_TAIL)" whisper-server-npu kokoro-tts
 search-up: ## Start Brave Search MCP and SearXNG.
 	$(DC) --profile search up -d
@@ -37,7 +37,7 @@ services:
  whisper-init:
    image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09
    container_name: whisper-init
-    profiles: ["voice"]
+    profiles: ["voice", "voice-cpu-backup"]
    restart: "no"
    volumes:
      - whisper-models:/app/models
@@ -54,17 +54,15 @@ services:
          fi
        done
-  # Primary whisper.cpp server: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120).
+  # Manual GPU whisper.cpp fallback: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120).
-  # Uses ggml-base.bin to keep the service alive while llama-server owns most of
+  # Kept out of the normal `voice` profile because the OpenVINO NPU Whisper
-  # the laptop GPU VRAM. The previous ggml-small.bin profile needed ~465 MiB
+  # service is the default and this container consumes GPU resources.
  # The previous ggml-small.bin profile needed ~465 MiB of contiguous CUDA
  # memory and restarted when only ~560 MiB fragmented VRAM was free. The CPU
  # whisper-server below remains the higher-accuracy fallback.
  #
  # The official `ghcr.io/ggml-org/whisper.cpp:main-cuda` ships kernels only
  # for sm_75/80/86/90 and fails to init CUDA on Blackwell. We build a custom
  # image with `CMAKE_CUDA_ARCHITECTURES=120` from the local Dockerfile.
  # Build manually with: docker build -t whisper.cpp:cuda-blackwell ./whisper-cuda-blackwell
-  # Or `docker compose --profile voice build whisper-server-gpu`.
+  # Or `docker compose --profile voice-gpu build whisper-server-gpu`.
  whisper-server-gpu:
    image: whisper.cpp:cuda-blackwell
    build:
@@ -72,7 +70,7 @@ services:
      dockerfile: Dockerfile
    container_name: whisper-server-gpu
    restart: unless-stopped
-    profiles: ["voice"]
+    profiles: ["voice-gpu"]
    ports:
      - "18801:8080"
    volumes:
@@ -115,16 +113,62 @@ services:
      agentmon.role: "voice"
      agentmon.port: "18801"
-  # Fallback whisper.cpp server: CPU-only, medium model.
+  # Experimental OpenVINO GenAI Whisper server using the Intel NPU.
-  # Kept around for resilience — runs if the GPU server is down (driver issue,
+  # This is not whisper.cpp; it implements the same OpenAI-style
-  # gemma takes all VRAM, custom image broken, etc.). Uses no GPU resources.
+  # /v1/audio/transcriptions route using OpenVINO WhisperPipeline on NPU.
-  # ~14 s per short clip (medium-on-CPU is 90x slower than small-on-GPU above).
+  # Host requirements: intel-npu-driver-bin installed, /dev/accel/accel0 present,
-  # Start with: docker compose --profile voice up -d whisper-server
+  # and the host NPU Level Zero driver/compiler libraries mounted below.
  whisper-server-npu:
    image: whisper-openvino-npu:local
    build:
      context: ./whisper-openvino-npu
      dockerfile: Dockerfile
    container_name: whisper-server-npu
    restart: unless-stopped
    profiles: ["voice"]
    ports:
      - "18816:8080"
    devices:
      - /dev/accel/accel0:/dev/accel/accel0
    group_add:
      - "987" # host render group gid on willlaptop
    environment:
      - WHISPER_DEVICE=NPU
      - WHISPER_MODEL_DIR=/models/whisper-tiny-fp16-ov
      - LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
      - ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1
    volumes:
      - /home/will/.cache/openvino-models/whisper-tiny-fp16-ov:/models/whisper-tiny-fp16-ov:ro
      - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:ro
      - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:ro
      - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so:ro
      - /usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:/usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:ro
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -f http://localhost:8080/health >/dev/null 2>&1 || exit 1",
        ]
      interval: 30s
      timeout: 5s
      start_period: 30s
      retries: 3
    labels:
      agentmon.monitor: "true"
      agentmon.role: "voice"
      agentmon.port: "18816"
  # Manual fallback whisper.cpp server: CPU-only, medium model.
  # Kept around for resilience — runs if the NPU/GPU servers are down. Uses no
  # accelerator resources, but is slow (~14 s per short clip).
  # Disabled from the normal `voice` profile now that `whisper-server-npu` is
  # the trial default. Start manually with:
  #   docker compose --profile voice-cpu-backup up -d whisper-server
  whisper-server:
    image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09
    container_name: whisper-server
    restart: unless-stopped
-    profiles: ["voice"]
+    profiles: ["voice-cpu-backup"]
    ports:
      - "18811:8080"
    volumes:
@@ -83,13 +83,13 @@
        <!-- Local services -->
        <g><rect x="965" y="165" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="165" width="210" height="80" rx="9" fill="rgba(6,78,59,.4)" stroke="#34d399" stroke-width="1.6"/><text x="1070" y="195" text-anchor="middle" class="title">LiteLLM</text><text x="1070" y="216" text-anchor="middle" class="tiny">LLM router + DB</text><text x="1070" y="234" text-anchor="middle" class="port">:18804</text></g>
        <g><rect x="965" y="275" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="275" width="210" height="80" rx="9" fill="rgba(8,51,68,.4)" stroke="#22d3ee" stroke-width="1.6"/><text x="1070" y="305" text-anchor="middle" class="title">Search</text><text x="1070" y="326" text-anchor="middle" class="tiny">SearXNG + Brave MCP</text><text x="1070" y="344" text-anchor="middle" class="port">:18803 / :18802</text></g>
-        <g><rect x="965" y="385" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="385" width="210" height="80" rx="9" fill="rgba(8,51,68,.4)" stroke="#22d3ee" stroke-width="1.6"/><text x="1070" y="415" text-anchor="middle" class="title">Voice</text><text x="1070" y="436" text-anchor="middle" class="tiny">Kokoro + Whisper</text><text x="1070" y="454" text-anchor="middle" class="port">:18805 / :18811</text></g>
+        <g><rect x="965" y="385" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="385" width="210" height="80" rx="9" fill="rgba(8,51,68,.4)" stroke="#22d3ee" stroke-width="1.6"/><text x="1070" y="415" text-anchor="middle" class="title">Voice</text><text x="1070" y="436" text-anchor="middle" class="tiny">Kokoro + Whisper</text><text x="1070" y="454" text-anchor="middle" class="port">:18805 / :18816</text></g>
        <g><rect x="965" y="555" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="555" width="210" height="80" rx="9" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.6"/><text x="1070" y="585" text-anchor="middle" class="title">Docker services</text><text x="1070" y="606" text-anchor="middle" class="tiny">agentmon.monitor=true</text><text x="1070" y="624" text-anchor="middle" class="port">swarm/service snapshots</text></g>
        <g><rect x="965" y="665" width="210" height="80" rx="9" fill="#0f172a"/><rect x="965" y="665" width="210" height="80" rx="9" fill="rgba(120,53,15,.3)" stroke="#fbbf24" stroke-width="1.6"/><text x="1070" y="695" text-anchor="middle" class="title">OpenClaw VMs</text><text x="1070" y="716" text-anchor="middle" class="tiny">currently dormant</text><text x="1070" y="734" text-anchor="middle" class="port">openclaw.snapshot</text></g>
        <g><rect x="965" y="775" width="210" height="60" rx="9" fill="#0f172a"/><rect x="965" y="775" width="210" height="60" rx="9" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.6"/><text x="1070" y="802" text-anchor="middle" class="title">Obsidian / RAG</text><text x="1070" y="822" text-anchor="middle" class="port">:27123/:27124 + ChromaDB</text></g>
        <!-- host local ai box -->
-        <g><rect x="280" y="675" width="190" height="100" rx="10" fill="#0f172a"/><rect x="280" y="675" width="190" height="100" rx="10" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.8"/><text x="375" y="706" text-anchor="middle" class="title">host local AI</text><text x="375" y="730" text-anchor="middle" class="tiny">llama.cpp :18806</text><text x="375" y="752" text-anchor="middle" class="tiny">Ollama embed :18807</text></g>
+        <g><rect x="280" y="675" width="210" height="120" rx="10" fill="#0f172a"/><rect x="280" y="675" width="210" height="120" rx="10" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.8"/><text x="385" y="706" text-anchor="middle" class="title">host local AI</text><text x="385" y="730" text-anchor="middle" class="tiny">llama.cpp :18806</text><text x="385" y="752" text-anchor="middle" class="tiny">Ollama fallback :18807</text><text x="385" y="774" text-anchor="middle" class="tiny">OpenVINO NPU embed :18817</text></g>
        <!-- legend -->
        <g transform="translate(40,820)">
@@ -104,7 +104,7 @@
    </div>
    <div class="cards">
      <div class="info"><h3>Monitoring model</h3><ul><li>• n8n direct probes critical ports</li><li>• agentmon aggregates Docker/OpenClaw snapshots</li><li>• n8n polls agentmon for stale/degraded state</li></ul></div>
-      <div class="info"><h3>Operational endpoints</h3><ul><li>• n8n: 127.0.0.1:18808</li><li>• agentmon query/UI: 8081 / 8082</li><li>• local LLM/embed: 18806 / 18807</li></ul></div>
+      <div class="info"><h3>Operational endpoints</h3><ul><li>• n8n: 127.0.0.1:18808</li><li>• agentmon query/UI: 8081 / 8082</li><li>• local LLM/embed: 18806 / 18817</li><li>• Ollama fallback: 18807</li></ul></div>
      <div class="info"><h3>Source paths</h3><ul><li>• Swarm repo: ~/lab/swarm</li><li>• Agentmon repo: ~/lab/agentmon</li><li>• Workflows: swarm-common/n8n-workflows</li></ul></div>
    </div>
    <div class="footer">Generated as repo documentation. Open locally in a browser; no JavaScript, all SVG inline.</div>
@@ -32,9 +32,10 @@ local AI/search/voice services
        +--> SearXNG :18803
        +--> Brave MCP :18802
        +--> llama.cpp :18806
-        +--> Ollama embeddings :18807
+        +--> Ollama embeddings :18807 (legacy/CPU fallback)
        +--> OpenVINO NPU embeddings :18817
        +--> Kokoro TTS :18805
-        +--> Whisper :18811
+        +--> Whisper NPU :18816
 ```
 See also:
@@ -115,15 +116,16 @@ Docker services:
 - `searxng` — `:18803`, local metasearch
 - `brave-search` — `:18802`, Brave Search MCP server
 - `kokoro-tts` — `:18805`, local TTS
- `whisper-server` — `:18811`, local transcription
+- `whisper-server-npu` — `:18816`, OpenVINO NPU local transcription
 - `n8n-agent` — `:18808`, automation
 Host/user services:
 - `llama-server.service` — `:18806`, local llama.cpp OpenAI-compatible LLM
- `ollama.service` — `:18807`, embeddings API
+- `ollama.service` — `:18807`, legacy/CPU embeddings API fallback
 - `openvino-embeddings.service` — `:18817`, OpenVINO NPU embeddings API (`/v1/embeddings`, `/api/embed`, `/api/embeddings`)
 - `docker-health-endpoint.service` — `:18809`, read-only container health for n8n
- `obsidian-reindex-endpoint.service` — `:18810`, Obsidian/RAG reindex trigger
+- `obsidian-reindex-endpoint.service` — `:18810`, Obsidian/RAG reindex trigger; default collection `obsidian_bge_npu` using OpenVINO NPU embeddings
 - `url-content-extractor.service` — `:18812`, YouTube/PDF/web extraction
 - `voice-memo-processor.service` — `:18813`, voice memo processing
 - `rag-embedding-health.service` — `:18814`, RAG/embedding health wrapper
@@ -142,8 +144,9 @@ Local REST API:
 RAG/vector store:
 - ChromaDB path: `~/.hermes/data/rag-search/chroma/`
- Reindex state/progress: `~/.hermes/data/rag-search/obsidian_index_state.json` and `obsidian_reindex_progress.json`
+- Reindex state/progress: active BGE/NPU state in `~/.hermes/data/rag-search/obsidian_bge_npu_index_state.json` and `obsidian_bge_npu_reindex_progress.json`; legacy Ollama state in `obsidian_index_state.json` remains for comparison/fallback.
- Embeddings backend: Ollama on `:18807`, normally `nomic-embed-text`
+- Active RAG query/reindex embedding backend: OpenVINO NPU embeddings service on `:18817`, currently `bge-base-en-v1.5-int8-ov`, collection `obsidian_bge_npu`.
 - Legacy comparison/fallback collection: `obsidian`, built with Ollama on `:18807` using `nomic-embed-text`.
 - Reindex endpoint: `POST :18810/reindex` for incremental updates, `POST :18810/reindex?full=true` for full semantic rebuilds, `GET :18810/semantic-health` to verify vectors plus a search smoke test.
 ## Monitoring model
@@ -24,7 +24,7 @@ CONTAINERS = [
    "litellm-db",
    "n8n-agent",
    "searxng",
-    "whisper-server",
+    "whisper-server-npu",
 ]
@@ -26,14 +26,20 @@ from urllib.parse import parse_qs, urlparse
 PORT = int(os.environ.get("PORT", 18810))
 REINDEX_TIMEOUT = int(os.environ.get("REINDEX_TIMEOUT", "1800"))
 RAG_COLLECTION = os.environ.get("RAG_COLLECTION", "obsidian").strip() or "obsidian"
 RAG_EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "nomic-embed-text").strip() or "nomic-embed-text"
 OLLAMA_BASE_URL = (os.environ.get("OLLAMA_BASE_URL") or "http://127.0.0.1:18807").rstrip("/")
 REINDEX_SCRIPT = str(
    Path.home()
    / ".hermes/skills/note-taking/rag-search/scripts/reindex_obsidian.sh"
 )
-STATE_FILE = (
+STATE_FILE = Path(
-    Path.home() / ".hermes/data/rag-search/obsidian_index_state.json"
+    os.environ.get("RAG_STATE_FILE")
-)
+    or Path.home() / ".hermes/data/rag-search" / (
        "obsidian_index_state.json" if RAG_COLLECTION == "obsidian" else f"{RAG_COLLECTION}_index_state.json"
    )
 ).expanduser()
 SEARCH_SCRIPT = str(Path.home() / ".hermes/skills/note-taking/rag-search/scripts/search.py")
 VENV_PYTHON = str(Path.home() / ".hermes/skills/note-taking/rag-search/venv/bin/python")
@@ -50,11 +56,16 @@ def run_reindex(full: bool = False) -> dict:
        cmd = [REINDEX_SCRIPT]
        if full:
            cmd.append("--full")
        env = os.environ.copy()
        env.setdefault("RAG_COLLECTION", RAG_COLLECTION)
        env.setdefault("RAG_EMBED_MODEL", RAG_EMBED_MODEL)
        env.setdefault("OLLAMA_BASE_URL", OLLAMA_BASE_URL)
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=REINDEX_TIMEOUT,
            env=env,
        )
        if result.returncode != 0:
            return {
@@ -97,12 +108,16 @@ def run_semantic_search(query: str, top_k: int = 5) -> dict:
    if not query:
        return {"ok": False, "error": "query is required", "results": []}
    top_k = max(1, min(int(top_k or 5), 20))
    env = os.environ.copy()
    env.setdefault("RAG_COLLECTION", RAG_COLLECTION)
    env.setdefault("RAG_EMBED_MODEL", RAG_EMBED_MODEL)
    env.setdefault("OLLAMA_BASE_URL", OLLAMA_BASE_URL)
    result = subprocess.run(
        [
            VENV_PYTHON if Path(VENV_PYTHON).exists() else sys.executable,
            SEARCH_SCRIPT,
            "--index",
-            "obsidian",
+            RAG_COLLECTION,
            "--top-k",
            str(top_k),
            "--raw",
@@ -111,6 +126,7 @@ def run_semantic_search(query: str, top_k: int = 5) -> dict:
        capture_output=True,
        text=True,
        timeout=90,
        env=env,
    )
    if result.returncode != 0:
        return {
@@ -125,7 +141,7 @@ def run_semantic_search(query: str, top_k: int = 5) -> dict:
    return {
        "ok": True,
        "query": query,
-        "index": payload.get("index", "obsidian"),
+        "index": payload.get("index", RAG_COLLECTION),
        "top_k": top_k,
        "result_count": len(results),
        "results": results,
@@ -144,7 +160,8 @@ def semantic_health() -> dict:
                "note_count",
                "vector_count",
                "collection",
-                "chroma_path",
+                "embedding_backend",
                "embedding_model",
                "last_full_index",
                "last_incremental_index",
            )
@@ -0,0 +1,236 @@
 #!/usr/bin/env python3
 """OpenVINO GenAI embedding HTTP service for Will's local swarm stack.
 Default port: 18817
 Default model: OpenVINO/bge-base-en-v1.5-int8-ov, cached under ~/.cache/openvino-models/
 Default device: NPU
 Exposes a deliberately small compatibility surface:
  GET  /healthz                        # also answered at / and /readyz
  GET  /api/tags                       # Ollama-ish model listing for health scripts
  GET  /v1/models                      # OpenAI-style model listing
  POST /api/embed                      # Ollama-ish batched embeddings
  POST /api/embeddings                 # Ollama-ish single embedding
  POST /v1/embeddings                  # OpenAI-compatible embeddings response
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 import sys
 import threading
 import time
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from pathlib import Path
 from typing import Any
 import openvino as ov
 import openvino_genai as ovg
 # Model label used when OPENVINO_EMBED_MODEL / --model-name is not supplied.
 DEFAULT_MODEL_NAME = "bge-base-en-v1.5-int8-ov"
 # Model directory used when OPENVINO_EMBED_MODEL_DIR / --model-dir is not supplied.
 DEFAULT_MODEL_DIR = Path.home() / ".cache/openvino-models" / DEFAULT_MODEL_NAME
 # TCP port used when OPENVINO_EMBED_PORT / --port is not supplied.
 DEFAULT_PORT = 18817
 # sysfs counter sampled around each inference and in the health payload
 # (microseconds, going by the file name).
 NPU_BUSY_FILE = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
 def npu_busy_time_us() -> int | None:
    """Return the integer currently stored in ``NPU_BUSY_FILE``, or None.

    Any failure (file missing, unreadable, or not an integer) is swallowed on
    purpose: the counter is telemetry only and must never fail a request.
    """
    try:
        raw = NPU_BUSY_FILE.read_text()
        reading = int(raw.strip())
    except Exception:
        return None
    return reading
 class EmbeddingService:
    """Loads one OpenVINO GenAI ``TextEmbeddingPipeline`` and embeds one text per call.

    Every pipeline call goes through ``self.lock``, so concurrent HTTP handler
    threads are served one at a time.
    """
    def __init__(self, model_dir: Path, model_name: str, device: str, max_length: int) -> None:
        """Check that the model directory and device exist, then build the pipeline.

        Args:
            model_dir: directory the pipeline is loaded from.
            model_name: label reported back to clients; not used to locate the model.
            device: OpenVINO device name, e.g. "NPU".
            max_length: assigned to the pipeline config's ``max_length``; inputs
                are padded to it (``pad_to_max_length`` is enabled).

        Raises:
            FileNotFoundError: ``model_dir`` does not exist.
            RuntimeError: ``device`` is not among OpenVINO's available devices.
        """
        self.model_dir = model_dir
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        self.loaded_at = time.time()
        self.lock = threading.Lock()
        # Unknown until the first successful embed_one() call.
        self.embedding_dim: int | None = None
        if not self.model_dir.exists():
            raise FileNotFoundError(f"model directory not found: {self.model_dir}")
        core = ov.Core()
        self.available_devices = list(core.available_devices)
        if self.device not in self.available_devices:
            raise RuntimeError(f"OpenVINO device {self.device!r} unavailable; available={self.available_devices}")
        # Intel NPU currently needs static shape for this embedding pipeline.
        # batch_size=1 is intentional: multi-input requests are served by looping
        # one text at a time, keeping the model shape acceptable to NPUW.
        cfg = ovg.TextEmbeddingPipeline.Config()
        cfg.max_length = int(max_length)
        cfg.pad_to_max_length = True
        cfg.batch_size = 1
        self.pipeline = ovg.TextEmbeddingPipeline(self.model_dir, self.device, cfg)
    def embed_one(self, text: str, *, purpose: str = "query") -> dict[str, Any]:
        """Embed a single text and return the vector with timing metadata.

        Args:
            text: text to embed; falsy values are treated as the empty string.
            purpose: "query" uses ``embed_query``; "document" uses
                ``embed_documents`` with a one-element list.

        Returns:
            Dict with ``embedding`` (list of floats), ``dim``, ``purpose``,
            ``duration_ms`` and ``npu_busy_delta_us`` (None when the busy
            counter could not be read before or after the call).

        Raises:
            ValueError: the text is empty/whitespace, or ``purpose`` is not
                "query" or "document".
        """
        text = str(text or "")
        if not text.strip():
            raise ValueError("embedding input text is empty")
        if purpose not in {"query", "document"}:
            raise ValueError("embedding purpose must be 'query' or 'document'")
        # The wall-clock timer starts before the lock on purpose: duration_ms is
        # the caller-visible latency, including time queued behind other requests.
        started = time.perf_counter()
        # TextEmbeddingPipeline is a native object; calls are serialized until
        # it is proven safe under concurrent NPU use.
        with self.lock:
            # Sample the busy counter only while holding the lock. Sampling it
            # before acquiring the lock would charge this request for NPU time
            # spent on whichever request held the lock while this one waited.
            before = npu_busy_time_us()
            if purpose == "document":
                # batch_size=1 means embed_documents must receive exactly one doc.
                vec = self.pipeline.embed_documents([text])[0]
            else:
                vec = self.pipeline.embed_query(text)
            after = npu_busy_time_us()
        vector = [float(x) for x in vec]
        self.embedding_dim = len(vector)
        return {
            "embedding": vector,
            "dim": len(vector),
            "purpose": purpose,
            "duration_ms": round((time.perf_counter() - started) * 1000, 3),
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }
    def health(self) -> dict[str, Any]:
        """Return a JSON-serializable status snapshot of the service.

        ``embedding_dim`` stays None until the first successful ``embed_one``
        call; ``npu_busy_time_us`` is None when the counter cannot be read.
        """
        return {
            "status": "ok",
            "service": "openvino-embeddings",
            "model": self.model_name,
            "model_dir": str(self.model_dir),
            "device": self.device,
            "available_devices": self.available_devices,
            "embedding_dim": self.embedding_dim,
            "max_length": self.max_length,
            "uptime_s": round(time.time() - self.loaded_at, 3),
            "npu_busy_time_us": npu_busy_time_us(),
        }
 def normalize_input(value: Any) -> list[str]:
    """Turn a request's ``input`` field into a non-empty list of strings.

    Args:
        value: either a single string or a list of strings.

    Returns:
        ``[value]`` for a string, or a copy of the list.

    Raises:
        ValueError: ``value`` is neither a string nor a non-empty list made up
            solely of strings. Non-string items are rejected rather than passed
            through ``str()``, so ``None`` or token-id integers are never
            embedded as the literal text "None" or "123".
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
        return list(value)
    raise ValueError("input must be a non-empty string or list of strings")
 class Handler(BaseHTTPRequestHandler):
    """HTTP request handler exposing health, model-listing and embedding routes.

    Embedding work is delegated to the ``EmbeddingService`` stored on the
    server object as ``embedding_service``.
    """
    server_version = "OpenVINOEmbeddings/0.1"
    @property
    def svc(self) -> EmbeddingService:
        """The shared embedding service attached to the HTTP server object."""
        return self.server.embedding_service  # type: ignore[attr-defined]
    def do_GET(self) -> None:
        """Serve health (/, /healthz, /readyz) and model listings; 404 otherwise."""
        # Query string and trailing slash are ignored when matching routes.
        path = self.path.split("?", 1)[0].rstrip("/") or "/"
        if path in {"/", "/healthz", "/readyz"}:
            self.write_json(self.svc.health())
        elif path == "/api/tags":
            self.write_json({"models": [{"name": self.svc.model_name, "model": self.svc.model_name}]})
        elif path == "/v1/models":
            self.write_json({"object": "list", "data": [{"id": self.svc.model_name, "object": "model", "owned_by": "local"}]})
        else:
            self.write_json({"error": "not found"}, status=404)
    def _embed_batch(self, payload: dict[str, Any], default_purpose: str) -> tuple[str, list[dict[str, Any]]]:
        """Embed every text in ``payload["input"]``; return (purpose, per-text results)."""
        texts = normalize_input(payload.get("input"))
        purpose = str(payload.get("purpose") or payload.get("task") or default_purpose)
        return purpose, [self.svc.embed_one(text, purpose=purpose) for text in texts]
    def do_POST(self) -> None:
        """Serve the three embedding routes.

        ``/api/embed`` defaults to purpose "document", ``/v1/embeddings`` to
        "query", and ``/api/embeddings`` always embeds as "query". A
        ``ValueError`` (bad JSON, bad input, bad purpose) becomes a 400; any
        other exception becomes a 500 with the exception type in the message.
        """
        path = self.path.split("?", 1)[0].rstrip("/") or "/"
        try:
            # The body is parsed before routing, so malformed JSON yields 400
            # even on an unknown path.
            payload = self.read_json()
            if path == "/api/embed":
                purpose, results = self._embed_batch(payload, "document")
                self.write_json({
                    "model": payload.get("model") or self.svc.model_name,
                    "embeddings": [item["embedding"] for item in results],
                    "embedding_dim": results[0]["dim"] if results else None,
                    "purpose": purpose,
                    # A missing per-item delta (counter unreadable) counts as 0.
                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
                    "durations_ms": [item["duration_ms"] for item in results],
                })
            elif path == "/api/embeddings":
                text = payload.get("prompt") or payload.get("input")
                result = self.svc.embed_one(str(text or ""), purpose="query")
                self.write_json({
                    "model": payload.get("model") or self.svc.model_name,
                    "embedding": result["embedding"],
                    "embedding_dim": result["dim"],
                    "npu_busy_delta_us": result["npu_busy_delta_us"],
                    "duration_ms": result["duration_ms"],
                })
            elif path == "/v1/embeddings":
                purpose, results = self._embed_batch(payload, "query")
                self.write_json({
                    "object": "list",
                    "model": payload.get("model") or self.svc.model_name,
                    "data": [
                        {"object": "embedding", "index": idx, "embedding": item["embedding"]}
                        for idx, item in enumerate(results)
                    ],
                    # Token counts are not tracked; the zeros are placeholders.
                    "usage": {"prompt_tokens": 0, "total_tokens": 0},
                    "embedding_dim": results[0]["dim"] if results else None,
                    "purpose": purpose,
                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
                    "durations_ms": [item["duration_ms"] for item in results],
                })
            else:
                self.write_json({"error": "not found"}, status=404)
        except ValueError as exc:
            self.write_json({"error": str(exc)}, status=400)
        except Exception as exc:
            self.write_json({"error": f"{type(exc).__name__}: {exc}"}, status=500)
    def read_json(self) -> dict[str, Any]:
        """Read and parse the request body as a JSON object.

        A missing or zero ``Content-Length`` is treated as an empty object.

        Raises:
            ValueError: ``Content-Length`` is not an integer or is negative,
                the body is not valid JSON, or the JSON value is not an object.
        """
        length = int(self.headers.get("Content-Length") or 0)
        if length < 0:
            # rfile.read() with a negative size reads until EOF, which would
            # block this handler thread until the client closes the socket.
            raise ValueError("Content-Length must not be negative")
        body = self.rfile.read(length).decode("utf-8", "replace") if length else "{}"
        payload = json.loads(body or "{}")
        if not isinstance(payload, dict):
            raise ValueError("JSON body must be an object")
        return payload
    def write_json(self, payload: dict[str, Any], status: int = 200) -> None:
        """Send ``payload`` as a UTF-8 JSON response with the given status code."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002 - stdlib override name
        """Write the access-log line to stderr, flushed immediately."""
        print(f"{self.address_string()} - {format % args}", file=sys.stderr, flush=True)
 def main() -> int:
    """Parse options, load the embedding pipeline, and serve HTTP until interrupted.

    Every option defaults to its ``OPENVINO_EMBED_*`` environment variable and
    then to a built-in default.

    Returns:
        0 after the server stops on ``KeyboardInterrupt``. Failures while
        loading the model or binding the socket propagate as exceptions.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the built-in default binds every interface and the handler
    # has no authentication; confirm that is intended or prefer 127.0.0.1.
    parser.add_argument("--host", default=os.environ.get("OPENVINO_EMBED_HOST", "0.0.0.0"))
    parser.add_argument("--port", type=int, default=int(os.environ.get("OPENVINO_EMBED_PORT", DEFAULT_PORT)))
    parser.add_argument("--model-dir", default=os.environ.get("OPENVINO_EMBED_MODEL_DIR", str(DEFAULT_MODEL_DIR)))
    parser.add_argument("--model-name", default=os.environ.get("OPENVINO_EMBED_MODEL", DEFAULT_MODEL_NAME))
    parser.add_argument("--device", default=os.environ.get("OPENVINO_EMBED_DEVICE", "NPU"))
    parser.add_argument("--max-length", type=int, default=int(os.environ.get("OPENVINO_EMBED_MAX_LENGTH", "512")))
    args = parser.parse_args()
    # The model is loaded before the socket is bound, so a bad model directory
    # or missing device fails fast without ever accepting a connection.
    service = EmbeddingService(Path(args.model_dir).expanduser(), args.model_name, args.device, args.max_length)
    # The context manager calls server_close() on exit, releasing the listening
    # socket on Ctrl-C and on any error escaping serve_forever().
    with ThreadingHTTPServer((args.host, args.port), Handler) as httpd:
        httpd.embedding_service = service  # type: ignore[attr-defined]
        print(
            f"openvino-embeddings listening on {args.host}:{args.port} "
            f"model={args.model_name} device={args.device}",
            flush=True,
        )
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            pass
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())
@@ -51,7 +51,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
        env = os.environ.copy()
        env.setdefault("HERMES_HOME", "/home/will/.hermes")
-        env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807")
+        env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18817")
        env.setdefault("RAG_EMBED_MODEL", "bge-base-en-v1.5-int8-ov")
        env.setdefault("N8N_URL", "http://127.0.0.1:18808")
        env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810")
@@ -32,7 +32,7 @@ AUDIO_DIR = os.path.join(tempfile.gettempdir(), "voice-memo-audio")
 os.makedirs(AUDIO_DIR, exist_ok=True)
 # Service endpoints (from host perspective)
-WHISPER_URL = os.environ.get("WHISPER_URL", "http://127.0.0.1:18811/v1/audio/transcriptions")
+WHISPER_URL = os.environ.get("WHISPER_URL", "http://127.0.0.1:18816/v1/audio/transcriptions")
 LLM_URL = os.environ.get("LLM_URL", "http://127.0.0.1:18806/v1/chat/completions")
 KOKORO_URL = os.environ.get("KOKORO_URL", "http://127.0.0.1:18805/v1/audio/speech")
@@ -7,7 +7,7 @@ from http.server import HTTPServer, BaseHTTPRequestHandler
 from pathlib import Path
 PORT = int(os.environ.get("VOICE_MEMO_PORT", "18813"))
-WHISPER_URL = os.environ.get("WHISPER_BASE_URL", "http://127.0.0.1:18811")
+WHISPER_URL = os.environ.get("WHISPER_BASE_URL", "http://127.0.0.1:18816")
 LLM_URL = os.environ.get("LLAMA_CPP_BASE_URL", "http://127.0.0.1:18806")
 KOKORO_URL = os.environ.get("KOKORO_BASE_URL", "http://127.0.0.1:18805")
 TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
@@ -0,0 +1,16 @@
 # systemd unit that runs the Obsidian vault reindex HTTP endpoint
 # (scripts/obsidian-reindex-server.py) with the system python3.
 [Unit]
 Description=Obsidian Vault Reindex Endpoint
 # NOTE(review): ordering is only against network.target; nothing here orders
 # this unit after the embeddings service it is pointed at below — confirm
 # that start-up order does not matter.
 After=network.target
 [Service]
 Type=simple
 ExecStart=/usr/bin/python3 /home/will/lab/swarm/scripts/obsidian-reindex-server.py
 # Restart only when the process exits with a failure, after a 5 second pause.
 Restart=on-failure
 RestartSec=5
 Environment=PORT=18810
 Environment=RAG_COLLECTION=obsidian_bge_npu
 Environment=RAG_EMBED_MODEL=bge-base-en-v1.5-int8-ov
 # Despite the OLLAMA_ prefix, port 18817 is the OpenVINO NPU embeddings
 # service, not Ollama.
 Environment=OLLAMA_BASE_URL=http://127.0.0.1:18817
 [Install]
 WantedBy=default.target
@@ -0,0 +1,19 @@
 # systemd unit that runs scripts/openvino-embeddings-server.py using the
 # Python interpreter from the ~/.venvs/npu virtualenv.
 [Unit]
 Description=OpenVINO NPU Embeddings HTTP Service (port 18817)
 After=network.target
 [Service]
 Type=simple
 WorkingDirectory=/home/will/lab/swarm
 ExecStart=/home/will/.venvs/npu/bin/python /home/will/lab/swarm/scripts/openvino-embeddings-server.py
 # Restart only when the process exits with a failure, after a 5 second pause.
 Restart=on-failure
 RestartSec=5
 # The OPENVINO_EMBED_* variables below are read by the server's main() as the
 # defaults for its --port/--host/--device/--model-name/--model-dir/--max-length flags.
 Environment=OPENVINO_EMBED_PORT=18817
 # NOTE(review): 0.0.0.0 listens on every interface, while the clients
 # configured in this change connect via 127.0.0.1 — confirm that wider
 # exposure is intended.
 Environment=OPENVINO_EMBED_HOST=0.0.0.0
 Environment=OPENVINO_EMBED_DEVICE=NPU
 Environment=OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov
 Environment=OPENVINO_EMBED_MODEL_DIR=/home/will/.cache/openvino-models/bge-base-en-v1.5-int8-ov
 Environment=OPENVINO_EMBED_MAX_LENGTH=512
 [Install]
 WantedBy=default.target
@@ -9,6 +9,8 @@ Restart=on-failure
 RestartSec=5
 Environment=PORT=18814
 Environment=RAG_HEALTH_TIMEOUT=180
 Environment=OLLAMA_BASE_URL=http://127.0.0.1:18817
 Environment=RAG_EMBED_MODEL=bge-base-en-v1.5-int8-ov
 [Install]
 WantedBy=default.target
@@ -0,0 +1,31 @@
 # Image for the Python app in server.py, served by uvicorn on port 8080.
 FROM python:3.14-slim
 # ZE_ENABLE_ALT_DRIVERS names libze_intel_npu.so.1, which no step in this file
 # installs; presumably it is supplied at runtime (e.g. mounted from the host) —
 # TODO confirm.
 ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
    ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1
 # System packages; curl is what the HEALTHCHECK below invokes. The apt package
 # lists are removed in the same layer to keep the image small.
 RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
       ffmpeg libze1 ca-certificates curl \
    && rm -rf /var/lib/apt/lists/*
 # Python dependencies are pinned to exact versions; pip itself is upgraded
 # to whatever is current at build time.
 RUN python -m pip install --upgrade pip \
    && python -m pip install \
       fastapi==0.126.0 \
       uvicorn[standard]==0.38.0 \
       python-multipart==0.0.22 \
       openvino==2026.2.0 \
       openvino-genai==2026.2.0.0 \
       soundfile==0.13.1 \
       numpy==2.4.6
 WORKDIR /app
 COPY server.py /app/server.py
 EXPOSE 8080
 # curl -f exits non-zero on an HTTP error status, so the container is marked
 # unhealthy only when /health is unreachable or answers with an error status.
 HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
  CMD curl -fsS http://localhost:8080/health >/dev/null || exit 1
 CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"]
@@ -0,0 +1,147 @@
 import os
 import subprocess
 import tempfile
 import threading
 import time
 from pathlib import Path
 from typing import Optional
 import numpy as np
 import openvino as ov
 import openvino_genai as ov_genai
 import soundfile as sf
 from fastapi import FastAPI, File, Form, UploadFile
 from fastapi.responses import JSONResponse, PlainTextResponse
 # Directory of the Whisper model; override with the WHISPER_MODEL_DIR env var.
 MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
 # Device name passed to the Whisper pipeline; override with WHISPER_DEVICE.
 DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
 # sysfs file that busy_us() reads and parses as an integer.
 BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
 app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")
 # Held by the transcription handler while it builds a pipeline and runs generate().
 _lock = threading.Lock()
 # Lazily-filled caches used by get_pipe() and get_core() respectively.
 _pipe = None
 _core = None
 def busy_us() -> Optional[int]:
    """Return the integer stored in ``BUSY_PATH``, or ``None`` when it cannot be read or parsed."""
    try:
        raw = BUSY_PATH.read_text()
        return int(raw.strip())
    except Exception:
        # Best effort: any read or parse failure is reported as "unknown".
        return None
 def get_core():
    """Return the module-wide ``ov.Core``, constructing it the first time it is requested."""
    global _core
    if _core is not None:
        return _core
    _core = ov.Core()
    return _core
 def get_pipe():
    """Return the cached ``WhisperPipeline`` for ``MODEL_DIR`` on ``DEVICE``, creating it on first call."""
    global _pipe
    if _pipe is not None:
        return _pipe
    _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
    return _pipe
 def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
    """Convert an uploaded audio file into mono 16 kHz float32 samples.

    ffmpeg transcodes ``upload_path`` into a temporary WAV file, which is then
    read back with soundfile. The temporary WAV is removed before returning,
    whether or not decoding succeeded.

    Returns:
        ``(samples, sample_rate)`` as read from the transcoded WAV.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    # Reserve a unique path only; ffmpeg overwrites the empty file (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as scratch:
        wav_path = Path(scratch.name)
    try:
        ffmpeg_cmd = [
            "ffmpeg", "-nostdin", "-hide_banner",
            "-loglevel", "error",
            "-y",
            "-i", str(upload_path),
            "-ac", "1",
            "-ar", "16000",
            "-f", "wav",
            str(wav_path),
        ]
        subprocess.run(ffmpeg_cmd, check=True)
        samples, rate = sf.read(wav_path, dtype="float32")
        # Collapse to one channel if the reader still returned a 2-D array.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        return samples, int(rate)
    finally:
        wav_path.unlink(missing_ok=True)
@app.get("/")
def root():
    """Serve a one-line plain-text banner identifying the server."""
    banner = "OpenVINO NPU Whisper server\n"
    return PlainTextResponse(banner)
@app.get("/health")
def health():
    """Report OpenVINO device availability, model presence, and the NPU busy counter.

    ``ok`` is true when "NPU" is among the devices OpenVINO lists. Any exception
    while gathering the data yields a 500 response carrying ``ok: False`` and
    the error text.

    NOTE(review): ``ok`` looks only for "NPU", not for the configured ``DEVICE``,
    and the response status stays 200 even when ``ok`` is false — confirm that
    status-code-based health checks are meant to pass in that case.
    """
    try:
        core = get_core()
        devices = core.available_devices
        has_npu = "NPU" in devices
        npu_name = core.get_property("NPU", "FULL_DEVICE_NAME") if has_npu else None
        return {
            "ok": has_npu,
            "device": DEVICE,
            "devices": devices,
            "npu": npu_name,
            "model_dir": str(MODEL_DIR),
            "model_exists": MODEL_DIR.exists(),
            "npu_busy_time_us": busy_us(),
        }
    except Exception as exc:
        detail = f"{type(exc).__name__}: {exc}"
        return JSONResponse(status_code=500, content={"ok": False, "error": detail})
@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: Optional[str] = Form(default=None),
    language: Optional[str] = Form(default=None),
    response_format: Optional[str] = Form(default="json"),
):
    """Transcribe an uploaded audio file with the Whisper pipeline.

    Args:
        file: The uploaded audio; its filename suffix is reused for the temp file.
        model: Echoed back in the JSON response; it does not select a model.
        language: Accepted but currently unused by this handler.
        response_format: ``"text"`` returns the bare transcript as plain text;
            any other value returns the JSON payload.

    Returns:
        The transcript, either as plain text or as a JSON object with timing,
        sample-rate, device, model, and NPU busy-time delta fields. A 400 JSON
        error is returned when ffmpeg cannot decode the upload.
    """
    import asyncio  # local import: only this handler needs it

    suffix = Path(file.filename or "audio").suffix or ".audio"
    # Reserve the path first; the bytes are written inside the try block so the
    # file is removed even if reading or writing the upload fails.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        upload_path = Path(tmp.name)

    def _decode_and_transcribe() -> tuple[str, int]:
        """Blocking part of the request: ffmpeg decode followed by inference."""
        audio, sr = load_audio(upload_path)
        # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
        # this stack: reusing one pipeline produced unstable language detection on
        # repeated short clips. Recreate per request for correctness; OpenVINO's
        # compiled-cache path keeps warm init reasonably fast.
        with _lock:
            pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
            result = pipe.generate(audio)
        return str(result).strip(), sr

    try:
        upload_path.write_bytes(await file.read())
        before = busy_us()
        t0 = time.perf_counter()
        try:
            # Run the blocking work in a worker thread so the event loop (and
            # therefore /health) stays responsive during long transcriptions.
            text, sr = await asyncio.to_thread(_decode_and_transcribe)
        except subprocess.CalledProcessError as exc:
            # ffmpeg rejected the upload: report a client error, not a 500.
            return JSONResponse(
                status_code=400,
                content={"error": f"could not decode audio: ffmpeg exited with status {exc.returncode}"},
            )
        elapsed = time.perf_counter() - t0
        after = busy_us()
        if response_format == "text":
            return PlainTextResponse(text)
        return {
            "text": text,
            "duration_seconds": round(elapsed, 4),
            "sample_rate": sr,
            "device": DEVICE,
            "model": model or MODEL_DIR.name,
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }
    finally:
        upload_path.unlink(missing_ok=True)
Author	SHA1	Message	Date
William Valentin	1a674e854e	feat(rag): switch Obsidian endpoint to NPU embeddings	2026-06-03 20:04:43 -07:00
William Valentin	bcc652e5aa	fix(rag): distinguish query and document embeddings	2026-06-03 19:51:55 -07:00
William Valentin	fe4dea0f07	feat(rag): add OpenVINO NPU embeddings service	2026-06-03 18:28:16 -07:00
William Valentin	7745648a13	chore(voice): make NPU Whisper the default	2026-06-03 17:24:49 -07:00