From fe4dea0f0726a8cc3d6af63ecc2d556401d947ba Mon Sep 17 00:00:00 2001 From: William Valentin Date: Wed, 3 Jun 2026 18:28:16 -0700 Subject: [PATCH] feat(rag): add OpenVINO NPU embeddings service --- Makefile | 20 +- docs/swarm-infrastructure.html | 4 +- docs/swarm-infrastructure.md | 9 +- scripts/openvino-embeddings-server.py | 225 ++++++++++++++++++++++ scripts/rag-embedding-health-server.py | 3 +- swarm-common/openvino-embeddings.service | 19 ++ swarm-common/rag-embedding-health.service | 2 + 7 files changed, 268 insertions(+), 14 deletions(-) create mode 100755 scripts/openvino-embeddings-server.py create mode 100644 swarm-common/openvino-embeddings.service diff --git a/Makefile b/Makefile index 6ace92e..d8a123b 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,8 @@ OPENCLAW_PORT ?= 18789 QEMU_URI ?= qemu:///system LLAMA_CPP_URL ?= http://127.0.0.1:18806 OLLAMA_URL ?= http://127.0.0.1:18807 -OLLAMA_EMBED_MODEL ?= nomic-embed-text +OPENVINO_EMBED_URL ?= http://127.0.0.1:18817 +OPENVINO_EMBED_MODEL ?= bge-base-en-v1.5-int8-ov DC := $(COMPOSE) -f $(COMPOSE_FILE) COMMON_DC := $(COMPOSE) -f $(COMMON_COMPOSE_FILE) @@ -28,7 +29,7 @@ REQUIRE_CONFIRM = test "$(CONFIRM)" = "yes" || { echo "This target changes VM/ga REQUIRE_INSTANCE = test -n "$(OPENCLAW_HOST)" -a -n "$(OPENCLAW_DOMAIN)" || { echo "Unknown OpenClaw HOST=$(HOST) in $(OPENCLAW_REGISTRY)"; exit 2; } .DEFAULT_GOAL := help -.PHONY: help config ps status local-ai-health ollama-embed-health up down restart pull build logs shell clean \ +.PHONY: help config ps status local-ai-health openvino-embed-health up down restart pull build logs shell clean \ api-up api-down api-restart api-init api-init-force api-health api-dedup api-logs \ voice-up voice-gpu voice-cpu voice-down voice-build voice-logs \ search-up search-down automation-up automation-down n8n-logs \ @@ -53,7 +54,7 @@ ps: ## Show root Docker Compose service status. status: ps local-ai-health ## Show Docker service status plus host-side local AI endpoints. 
-local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoints. +local-ai-health: ## Check host-side llama.cpp LLM, Ollama fallback, and OpenVINO NPU embeddings endpoints. @printf "\nHost-side local AI endpoints:\n" @printf "llama.cpp (%s): " "$(LLAMA_CPP_URL)"; \ if curl -fsS --max-time 3 "$(LLAMA_CPP_URL)/v1/models" >/tmp/swarm-llama-models.json 2>/dev/null; then \ @@ -62,14 +63,17 @@ local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoint printf "FAILED\n"; \ fi @printf "ollama.service: "; systemctl --user is-active ollama.service 2>/dev/null || true - @printf "Ollama API (%s): " "$(OLLAMA_URL)"; \ + @printf "Ollama fallback API (%s): " "$(OLLAMA_URL)"; \ curl -fsS --max-time 3 "$(OLLAMA_URL)/api/version" 2>/dev/null | jq -r '"OK version=" + .version' || printf "FAILED\n" + @printf "openvino-embeddings.service: "; systemctl --user is-active openvino-embeddings.service 2>/dev/null || true + @printf "OpenVINO NPU embeddings (%s): " "$(OPENVINO_EMBED_URL)"; \ + curl -fsS --max-time 3 "$(OPENVINO_EMBED_URL)/healthz" 2>/dev/null | jq -er '"OK model=" + .model + " device=" + .device' || printf "FAILED\n" -ollama-embed-health: ## Smoke-test Ollama embeddings using OLLAMA_EMBED_MODEL=nomic-embed-text. - @curl -fsS --max-time 20 "$(OLLAMA_URL)/api/embed" \ +openvino-embed-health: ## Smoke-test OpenVINO NPU embeddings using OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov; exits non-zero if the service does not answer. 
+ @curl -fsS --max-time 20 "$(OPENVINO_EMBED_URL)/v1/embeddings" \ -H 'Content-Type: application/json' \ - -d '{"model":"$(OLLAMA_EMBED_MODEL)","input":"socket check"}' \ - | jq -r '"embeddings=" + ((.embeddings // []) | length | tostring) + " dim=" + (((.embeddings // [[]])[0] // []) | length | tostring)' + -d '{"model":"$(OPENVINO_EMBED_MODEL)","input":"socket check"}' \ + | jq -er '"embeddings=" + ((.data // []) | length | tostring) + " dim=" + (((.data // [{embedding: []}])[0].embedding // []) | length | tostring) + " npu_busy_delta_us=" + ((.npu_busy_delta_us // 0) | tostring)' up: ## Start root compose services. Use PROFILE=api,voice,search,automation or SERVICE=name. @if [ -n "$(PROFILE)" ]; then \ diff --git a/docs/swarm-infrastructure.html b/docs/swarm-infrastructure.html index f01e4b4..06de736 100644 --- a/docs/swarm-infrastructure.html +++ b/docs/swarm-infrastructure.html @@ -89,7 +89,7 @@ Obsidian / RAG:27123/:27124 + ChromaDB - host local AIllama.cpp :18806Ollama embed :18807 + host local AIllama.cpp :18806Ollama fallback :18807OpenVINO NPU embed :18817 @@ -104,7 +104,7 @@

Monitoring model

  • • n8n direct probes critical ports
  • • agentmon aggregates Docker/OpenClaw snapshots
  • • n8n polls agentmon for stale/degraded state
-

Operational endpoints

  • • n8n: 127.0.0.1:18808
  • • agentmon query/UI: 8081 / 8082
  • • local LLM/embed: 18806 / 18807
+

Operational endpoints

  • • n8n: 127.0.0.1:18808
  • • agentmon query/UI: 8081 / 8082
  • • local LLM/embed: 18806 / 18817
  • • Ollama fallback: 18807

Source paths

  • • Swarm repo: ~/lab/swarm
  • • Agentmon repo: ~/lab/agentmon
  • • Workflows: swarm-common/n8n-workflows
diff --git a/docs/swarm-infrastructure.md b/docs/swarm-infrastructure.md index 0ca8ab4..a44423e 100644 --- a/docs/swarm-infrastructure.md +++ b/docs/swarm-infrastructure.md @@ -32,7 +32,8 @@ local AI/search/voice services +--> SearXNG :18803 +--> Brave MCP :18802 +--> llama.cpp :18806 - +--> Ollama embeddings :18807 + +--> Ollama embeddings :18807 (legacy/CPU fallback) + +--> OpenVINO NPU embeddings :18817 +--> Kokoro TTS :18805 +--> Whisper NPU :18816 ``` @@ -121,7 +122,8 @@ Docker services: Host/user services: - `llama-server.service` — `:18806`, local llama.cpp OpenAI-compatible LLM -- `ollama.service` — `:18807`, embeddings API +- `ollama.service` — `:18807`, legacy/CPU embeddings API fallback +- `openvino-embeddings.service` — `:18817`, OpenVINO NPU embeddings API (`/v1/embeddings`, `/api/embed`, `/api/embeddings`) - `docker-health-endpoint.service` — `:18809`, read-only container health for n8n - `obsidian-reindex-endpoint.service` — `:18810`, Obsidian/RAG reindex trigger - `url-content-extractor.service` — `:18812`, YouTube/PDF/web extraction @@ -143,7 +145,8 @@ RAG/vector store: - ChromaDB path: `~/.hermes/data/rag-search/chroma/` - Reindex state/progress: `~/.hermes/data/rag-search/obsidian_index_state.json` and `obsidian_reindex_progress.json` -- Embeddings backend: Ollama on `:18807`, normally `nomic-embed-text` +- RAG query/reindex embedding backend: still Ollama on `:18807` with `nomic-embed-text` until a deliberate full Chroma rebuild/migration is run. +- RAG/embedding health probe backend: OpenVINO NPU embeddings service on `:18817`, currently `bge-base-en-v1.5-int8-ov`. - Reindex endpoint: `POST :18810/reindex` for incremental updates, `POST :18810/reindex?full=true` for full semantic rebuilds, `GET :18810/semantic-health` to verify vectors plus a search smoke test. 
## Monitoring model
diff --git a/scripts/openvino-embeddings-server.py b/scripts/openvino-embeddings-server.py
new file mode 100755
index 0000000..a74dfa3
--- /dev/null
+++ b/scripts/openvino-embeddings-server.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""OpenVINO GenAI embedding HTTP service for Will's local swarm stack.
+
+Default port: 18817
+Default model: OpenVINO/bge-base-en-v1.5-int8-ov, cached under ~/.cache/openvino-models/
+Default device: NPU
+
+Exposes a deliberately small compatibility surface:
+  GET  /healthz
+  GET  /api/tags        # Ollama-ish model listing for health scripts
+  POST /api/embed       # Ollama-ish batched embeddings
+  POST /api/embeddings  # Ollama-ish single embedding
+  POST /v1/embeddings   # OpenAI-compatible embeddings response
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import threading
+import time
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+from typing import Any
+
+import openvino as ov
+import openvino_genai as ovg
+
+DEFAULT_MODEL_NAME = "bge-base-en-v1.5-int8-ov"
+DEFAULT_MODEL_DIR = Path.home() / ".cache/openvino-models" / DEFAULT_MODEL_NAME
+DEFAULT_PORT = 18817
+NPU_BUSY_FILE = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
+
+
+def npu_busy_time_us() -> int | None:
+    """Return the NPU busy counter read from NPU_BUSY_FILE, or None if unreadable."""
+    try:
+        return int(NPU_BUSY_FILE.read_text().strip())
+    except (OSError, ValueError):
+        # Missing/unreadable counter file or non-integer content: telemetry is best-effort.
+        return None
+
+
+class EmbeddingService:
+    """Owns one TextEmbeddingPipeline and serializes every inference call to it."""
+
+    def __init__(self, model_dir: Path, model_name: str, device: str, max_length: int) -> None:
+        """Validate the model directory and device, then build the pipeline.
+
+        Raises:
+            FileNotFoundError: if model_dir does not exist.
+            RuntimeError: if device is not among the devices OpenVINO reports.
+        """
+        self.model_dir = model_dir
+        self.model_name = model_name
+        self.device = device
+        self.max_length = max_length
+        self.loaded_at = time.time()
+        self.lock = threading.Lock()
+        # Unknown until the first successful embed_one() call.
+        self.embedding_dim: int | None = None
+
+        if not self.model_dir.exists():
+            raise FileNotFoundError(f"model directory not found: {self.model_dir}")
+
+        core = ov.Core()
+        self.available_devices = list(core.available_devices)
+        if self.device not in self.available_devices:
+            raise RuntimeError(f"OpenVINO device {self.device!r} unavailable; available={self.available_devices}")
+
+        # Intel NPU currently needs static shape for this embedding pipeline.
+        # batch_size=1 is intentional: multi-input requests are served by looping
+        # one text at a time, keeping the model shape acceptable to NPUW.
+        cfg = ovg.TextEmbeddingPipeline.Config()
+        cfg.max_length = int(max_length)
+        cfg.pad_to_max_length = True
+        cfg.batch_size = 1
+        self.pipeline = ovg.TextEmbeddingPipeline(self.model_dir, self.device, cfg)
+
+    def embed_one(self, text: str) -> dict[str, Any]:
+        """Embed a single text and report timing/NPU telemetry for it.
+
+        Returns a dict with "embedding", "dim", "duration_ms" (wall time,
+        including any wait for the lock) and "npu_busy_delta_us" (None when
+        the busy counter cannot be read).
+
+        Raises:
+            ValueError: if text is empty or whitespace-only.
+        """
+        text = str(text or "")
+        if not text.strip():
+            raise ValueError("embedding input text is empty")
+        started = time.perf_counter()
+        # TextEmbeddingPipeline is a native object; serialize calls until proven
+        # safe under concurrent NPU use.
+        with self.lock:
+            # Sample the busy counter only while holding the lock; reading it
+            # before acquiring would bill this request for inference done by
+            # requests queued ahead of it.
+            before = npu_busy_time_us()
+            vec = self.pipeline.embed_query(text)
+            after = npu_busy_time_us()
+            vector = [float(x) for x in vec]
+            self.embedding_dim = len(vector)
+        return {
+            "embedding": vector,
+            "dim": len(vector),
+            "duration_ms": round((time.perf_counter() - started) * 1000, 3),
+            "npu_busy_delta_us": None if before is None or after is None else after - before,
+        }
+
+    def health(self) -> dict[str, Any]:
+        """Return service configuration plus uptime and the current NPU busy counter."""
+        return {
+            "status": "ok",
+            "service": "openvino-embeddings",
+            "model": self.model_name,
+            "model_dir": str(self.model_dir),
+            "device": self.device,
+            "available_devices": self.available_devices,
+            "embedding_dim": self.embedding_dim,
+            "max_length": self.max_length,
+            "uptime_s": round(time.time() - self.loaded_at, 3),
+            "npu_busy_time_us": npu_busy_time_us(),
+        }
+
+
+def normalize_input(value: Any) -> list[str]:
+    """Turn a request "input" field into a non-empty list of strings.
+
+    Raises:
+        ValueError: if value is neither a string nor a non-empty list of strings.
+    """
+    if isinstance(value, str):
+        return [value]
+    # Reject non-string items rather than str()-ing them: None or a nested list
+    # would otherwise be embedded as the literal text "None" / "['a']".
+    if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
+        return list(value)
+    raise ValueError("input must be a non-empty string or list of strings")
+
+
+class Handler(BaseHTTPRequestHandler):
+    """HTTP front end; expects the server object to carry an embedding_service attribute."""
+
+    server_version = "OpenVINOEmbeddings/0.1"
+
+    @property
+    def svc(self) -> EmbeddingService:
+        return self.server.embedding_service  # type: ignore[attr-defined]
+
+    def do_GET(self) -> None:
+        """Serve health and model-listing endpoints; anything else is a 404."""
+        path = self.path.split("?", 1)[0].rstrip("/") or "/"
+        if path in {"/", "/healthz", "/readyz"}:
+            self.write_json(self.svc.health())
+        elif path == "/api/tags":
+            self.write_json({"models": [{"name": self.svc.model_name, "model": self.svc.model_name}]})
+        elif path == "/v1/models":
+            self.write_json({"object": "list", "data": [{"id": self.svc.model_name, "object": "model", "owned_by": "local"}]})
+        else:
+            self.write_json({"error": "not found"}, status=404)
+
+    def do_POST(self) -> None:
+        """Serve the three embedding endpoints.
+
+        ValueError (bad JSON, bad or empty input) maps to 400; any other
+        exception maps to 500 with the exception type and message.
+        """
+        path = self.path.split("?", 1)[0].rstrip("/") or "/"
+        try:
+            payload = self.read_json()
+            if path == "/api/embed":
+                texts = normalize_input(payload.get("input"))
+                results = [self.svc.embed_one(text) for text in texts]
+                self.write_json({
+                    "model": payload.get("model") or self.svc.model_name,
+                    "embeddings": [item["embedding"] for item in results],
+                    "embedding_dim": results[0]["dim"] if results else None,
+                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
+                    "durations_ms": [item["duration_ms"] for item in results],
+                })
+            elif path == "/api/embeddings":
+                text = payload.get("prompt") or payload.get("input")
+                result = self.svc.embed_one(str(text or ""))
+                self.write_json({
+                    "model": payload.get("model") or self.svc.model_name,
+                    "embedding": result["embedding"],
+                    "embedding_dim": result["dim"],
+                    "npu_busy_delta_us": result["npu_busy_delta_us"],
+                    "duration_ms": result["duration_ms"],
+                })
+            elif path == "/v1/embeddings":
+                texts = normalize_input(payload.get("input"))
+                results = [self.svc.embed_one(text) for text in texts]
+                self.write_json({
+                    "object": "list",
+                    "model": payload.get("model") or self.svc.model_name,
+                    "data": [
+                        {"object": "embedding", "index": idx, "embedding": item["embedding"]}
+                        for idx, item in enumerate(results)
+                    ],
+                    "usage": {"prompt_tokens": 0, "total_tokens": 0},
+                    "embedding_dim": results[0]["dim"] if results else None,
+                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
+                    "durations_ms": [item["duration_ms"] for item in results],
+                })
+            else:
+                self.write_json({"error": "not found"}, status=404)
+        except ValueError as exc:
+            self.write_json({"error": str(exc)}, status=400)
+        except Exception as exc:
+            self.write_json({"error": f"{type(exc).__name__}: {exc}"}, status=500)
+
+    def read_json(self) -> dict[str, Any]:
+        """Parse the request body as a JSON object; an empty body yields {}.
+
+        Raises:
+            ValueError: on a bad Content-Length, invalid JSON, or a non-object body.
+        """
+        length = int(self.headers.get("Content-Length") or 0)
+        if length < 0:
+            # A negative size makes rfile.read() read until EOF, stalling the
+            # handler until the client closes the connection.
+            raise ValueError("Content-Length must not be negative")
+        body = self.rfile.read(length).decode("utf-8", "replace") if length else "{}"
+        payload = json.loads(body or "{}")
+        if not isinstance(payload, dict):
+            raise ValueError("JSON body must be an object")
+        return payload
+
+    def write_json(self, payload: dict[str, Any], status: int = 200) -> None:
+        """Send payload as a JSON response with the given HTTP status."""
+        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002 - stdlib override name
+        print(f"{self.address_string()} - {format % args}", file=sys.stderr, flush=True)
+
+
+def main() -> int:
+    """Parse CLI/env options, load the model, and serve until interrupted."""
+    parser = argparse.ArgumentParser()
+    # NOTE(review): the 0.0.0.0 default exposes this unauthenticated API on every
+    # interface; set OPENVINO_EMBED_HOST=127.0.0.1 when only local clients need it.
+    parser.add_argument("--host", default=os.environ.get("OPENVINO_EMBED_HOST", "0.0.0.0"))
+    parser.add_argument("--port", type=int, default=int(os.environ.get("OPENVINO_EMBED_PORT", DEFAULT_PORT)))
+    parser.add_argument("--model-dir", default=os.environ.get("OPENVINO_EMBED_MODEL_DIR", str(DEFAULT_MODEL_DIR)))
+    parser.add_argument("--model-name", default=os.environ.get("OPENVINO_EMBED_MODEL", DEFAULT_MODEL_NAME))
+    parser.add_argument("--device", default=os.environ.get("OPENVINO_EMBED_DEVICE", "NPU"))
+    parser.add_argument("--max-length", type=int, default=int(os.environ.get("OPENVINO_EMBED_MAX_LENGTH", "512")))
+    args = parser.parse_args()
+
+    service = EmbeddingService(Path(args.model_dir).expanduser(), args.model_name, args.device, args.max_length)
+    # The with-block closes the listening socket on every exit path, Ctrl-C included.
+    with ThreadingHTTPServer((args.host, args.port), Handler) as httpd:
+        httpd.embedding_service = service  # type: ignore[attr-defined]
+        print(
+            f"openvino-embeddings listening on {args.host}:{args.port} "
+            f"model={args.model_name} device={args.device}",
+            flush=True,
+        )
+        try:
+            httpd.serve_forever()
+        except KeyboardInterrupt:
+            pass
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/rag-embedding-health-server.py b/scripts/rag-embedding-health-server.py index 601ab73..24d60b4 100644 --- a/scripts/rag-embedding-health-server.py +++ b/scripts/rag-embedding-health-server.py @@ -51,7 +51,8 @@ class Handler(http.server.BaseHTTPRequestHandler): env = os.environ.copy() env.setdefault("HERMES_HOME", "/home/will/.hermes") - env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807") + env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18817") + env.setdefault("RAG_EMBED_MODEL", "bge-base-en-v1.5-int8-ov") env.setdefault("N8N_URL", "http://127.0.0.1:18808") env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810") diff --git a/swarm-common/openvino-embeddings.service b/swarm-common/openvino-embeddings.service new file mode 100644 index 0000000..2b595f1 --- /dev/null +++ b/swarm-common/openvino-embeddings.service @@ -0,0 +1,19 @@ +[Unit] +Description=OpenVINO NPU Embeddings HTTP Service (port 18817) +After=network.target + +[Service] +Type=simple +WorkingDirectory=/home/will/lab/swarm +ExecStart=/home/will/.venvs/npu/bin/python /home/will/lab/swarm/scripts/openvino-embeddings-server.py +Restart=on-failure +RestartSec=5 +Environment=OPENVINO_EMBED_PORT=18817 +Environment=OPENVINO_EMBED_HOST=0.0.0.0 +Environment=OPENVINO_EMBED_DEVICE=NPU +Environment=OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov +Environment=OPENVINO_EMBED_MODEL_DIR=/home/will/.cache/openvino-models/bge-base-en-v1.5-int8-ov 
+Environment=OPENVINO_EMBED_MAX_LENGTH=512 + +[Install] +WantedBy=default.target diff --git a/swarm-common/rag-embedding-health.service b/swarm-common/rag-embedding-health.service index aa72e76..6bf92bc 100644 --- a/swarm-common/rag-embedding-health.service +++ b/swarm-common/rag-embedding-health.service @@ -9,6 +9,8 @@ Restart=on-failure RestartSec=5 Environment=PORT=18814 Environment=RAG_HEALTH_TIMEOUT=180 +Environment=OLLAMA_BASE_URL=http://127.0.0.1:18817 +Environment=RAG_EMBED_MODEL=bge-base-en-v1.5-int8-ov [Install] WantedBy=default.target