feat(rag): add OpenVINO NPU embeddings service

This commit is contained in:
William Valentin
2026-06-03 18:28:16 -07:00
parent 7745648a13
commit fe4dea0f07
7 changed files with 268 additions and 14 deletions
+12 -8
View File
@@ -16,7 +16,8 @@ OPENCLAW_PORT ?= 18789
QEMU_URI ?= qemu:///system
LLAMA_CPP_URL ?= http://127.0.0.1:18806
OLLAMA_URL ?= http://127.0.0.1:18807
OLLAMA_EMBED_MODEL ?= nomic-embed-text
OPENVINO_EMBED_URL ?= http://127.0.0.1:18817
OPENVINO_EMBED_MODEL ?= bge-base-en-v1.5-int8-ov
DC := $(COMPOSE) -f $(COMPOSE_FILE)
COMMON_DC := $(COMPOSE) -f $(COMMON_COMPOSE_FILE)
@@ -28,7 +29,7 @@ REQUIRE_CONFIRM = test "$(CONFIRM)" = "yes" || { echo "This target changes VM/ga
REQUIRE_INSTANCE = test -n "$(OPENCLAW_HOST)" -a -n "$(OPENCLAW_DOMAIN)" || { echo "Unknown OpenClaw HOST=$(HOST) in $(OPENCLAW_REGISTRY)"; exit 2; }
.DEFAULT_GOAL := help
.PHONY: help config ps status local-ai-health ollama-embed-health up down restart pull build logs shell clean \
.PHONY: help config ps status local-ai-health openvino-embed-health up down restart pull build logs shell clean \
api-up api-down api-restart api-init api-init-force api-health api-dedup api-logs \
voice-up voice-gpu voice-cpu voice-down voice-build voice-logs \
search-up search-down automation-up automation-down n8n-logs \
@@ -53,7 +54,7 @@ ps: ## Show root Docker Compose service status.
status: ps local-ai-health ## Show Docker service status plus host-side local AI endpoints.
local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoints.
local-ai-health: ## Check host-side llama.cpp LLM, Ollama fallback, and OpenVINO NPU embeddings endpoints.
@printf "\nHost-side local AI endpoints:\n"
@printf "llama.cpp (%s): " "$(LLAMA_CPP_URL)"; \
if curl -fsS --max-time 3 "$(LLAMA_CPP_URL)/v1/models" >/tmp/swarm-llama-models.json 2>/dev/null; then \
@@ -62,14 +63,17 @@ local-ai-health: ## Check host-side llama.cpp LLM and Ollama embeddings endpoint
printf "FAILED\n"; \
fi
@printf "ollama.service: "; systemctl --user is-active ollama.service 2>/dev/null || true
@printf "Ollama API (%s): " "$(OLLAMA_URL)"; \
@printf "Ollama fallback API (%s): " "$(OLLAMA_URL)"; \
curl -fsS --max-time 3 "$(OLLAMA_URL)/api/version" 2>/dev/null | jq -r '"OK version=" + .version' || printf "FAILED\n"
@printf "openvino-embeddings.service: "; systemctl --user is-active openvino-embeddings.service 2>/dev/null || true
@printf "OpenVINO NPU embeddings (%s): " "$(OPENVINO_EMBED_URL)"; \
curl -fsS --max-time 3 "$(OPENVINO_EMBED_URL)/healthz" 2>/dev/null | jq -r '"OK model=" + .model + " device=" + .device' || printf "FAILED\n"
ollama-embed-health: ## Smoke-test Ollama embeddings using OLLAMA_EMBED_MODEL=nomic-embed-text.
@curl -fsS --max-time 20 "$(OLLAMA_URL)/api/embed" \
openvino-embed-health: ## Smoke-test OpenVINO NPU embeddings using OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov.
@curl -fsS --max-time 20 "$(OPENVINO_EMBED_URL)/v1/embeddings" \
-H 'Content-Type: application/json' \
-d '{"model":"$(OLLAMA_EMBED_MODEL)","input":"socket check"}' \
| jq -r '"embeddings=" + ((.embeddings // []) | length | tostring) + " dim=" + (((.embeddings // [[]])[0] // []) | length | tostring)'
-d '{"model":"$(OPENVINO_EMBED_MODEL)","input":"socket check"}' \
| jq -r '"embeddings=" + ((.data // []) | length | tostring) + " dim=" + (((.data // [{embedding: []}])[0].embedding // []) | length | tostring) + " npu_busy_delta_us=" + ((.npu_busy_delta_us // 0) | tostring)'
up: ## Start root compose services. Use PROFILE=api,voice,search,automation or SERVICE=name.
@if [ -n "$(PROFILE)" ]; then \
+2 -2
View File
@@ -89,7 +89,7 @@
<g><rect x="965" y="775" width="210" height="60" rx="9" fill="#0f172a"/><rect x="965" y="775" width="210" height="60" rx="9" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.6"/><text x="1070" y="802" text-anchor="middle" class="title">Obsidian / RAG</text><text x="1070" y="822" text-anchor="middle" class="port">:27123/:27124 + ChromaDB</text></g>
<!-- host local ai box -->
<g><rect x="280" y="675" width="190" height="100" rx="10" fill="#0f172a"/><rect x="280" y="675" width="190" height="100" rx="10" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.8"/><text x="375" y="706" text-anchor="middle" class="title">host local AI</text><text x="375" y="730" text-anchor="middle" class="tiny">llama.cpp :18806</text><text x="375" y="752" text-anchor="middle" class="tiny">Ollama embed :18807</text></g>
<g><rect x="280" y="675" width="210" height="120" rx="10" fill="#0f172a"/><rect x="280" y="675" width="210" height="120" rx="10" fill="rgba(76,29,149,.4)" stroke="#a78bfa" stroke-width="1.8"/><text x="385" y="706" text-anchor="middle" class="title">host local AI</text><text x="385" y="730" text-anchor="middle" class="tiny">llama.cpp :18806</text><text x="385" y="752" text-anchor="middle" class="tiny">Ollama fallback :18807</text><text x="385" y="774" text-anchor="middle" class="tiny">OpenVINO NPU embed :18817</text></g>
<!-- legend -->
<g transform="translate(40,820)">
@@ -104,7 +104,7 @@
</div>
<div class="cards">
<div class="info"><h3>Monitoring model</h3><ul><li>• n8n direct probes critical ports</li><li>• agentmon aggregates Docker/OpenClaw snapshots</li><li>• n8n polls agentmon for stale/degraded state</li></ul></div>
<div class="info"><h3>Operational endpoints</h3><ul><li>• n8n: 127.0.0.1:18808</li><li>• agentmon query/UI: 8081 / 8082</li><li>• local LLM/embed: 18806 / 18807</li></ul></div>
<div class="info"><h3>Operational endpoints</h3><ul><li>• n8n: 127.0.0.1:18808</li><li>• agentmon query/UI: 8081 / 8082</li><li>• local LLM/embed: 18806 / 18817</li><li>• Ollama fallback: 18807</li></ul></div>
<div class="info"><h3>Source paths</h3><ul><li>• Swarm repo: ~/lab/swarm</li><li>• Agentmon repo: ~/lab/agentmon</li><li>• Workflows: swarm-common/n8n-workflows</li></ul></div>
</div>
<div class="footer">Generated as repo documentation. Open locally in a browser; no JavaScript, all SVG inline.</div>
+6 -3
View File
@@ -32,7 +32,8 @@ local AI/search/voice services
+--> SearXNG :18803
+--> Brave MCP :18802
+--> llama.cpp :18806
+--> Ollama embeddings :18807
+--> Ollama embeddings :18807 (legacy/CPU fallback)
+--> OpenVINO NPU embeddings :18817
+--> Kokoro TTS :18805
+--> Whisper NPU :18816
```
@@ -121,7 +122,8 @@ Docker services:
Host/user services:
- `llama-server.service` → `:18806`, local llama.cpp OpenAI-compatible LLM
- `ollama.service` → `:18807`, embeddings API
- `ollama.service` → `:18807`, legacy/CPU embeddings API fallback
- `openvino-embeddings.service` → `:18817`, OpenVINO NPU embeddings API (`/v1/embeddings`, `/api/embed`, `/api/embeddings`)
- `docker-health-endpoint.service` → `:18809`, read-only container health for n8n
- `obsidian-reindex-endpoint.service` → `:18810`, Obsidian/RAG reindex trigger
- `url-content-extractor.service` → `:18812`, YouTube/PDF/web extraction
@@ -143,7 +145,8 @@ RAG/vector store:
- ChromaDB path: `~/.hermes/data/rag-search/chroma/`
- Reindex state/progress: `~/.hermes/data/rag-search/obsidian_index_state.json` and `obsidian_reindex_progress.json`
- Embeddings backend: Ollama on `:18807`, normally `nomic-embed-text`
- RAG query/reindex embedding backend: still Ollama on `:18807` with `nomic-embed-text` until a deliberate full Chroma rebuild/migration is run.
- RAG/embedding health probe backend: OpenVINO NPU embeddings service on `:18817`, currently `bge-base-en-v1.5-int8-ov`.
- Reindex endpoint: `POST :18810/reindex` for incremental updates, `POST :18810/reindex?full=true` for full semantic rebuilds, `GET :18810/semantic-health` to verify vectors plus a search smoke test.
## Monitoring model
+225
View File
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""OpenVINO GenAI embedding HTTP service for Will's local swarm stack.
Default port: 18817
Default model: OpenVINO/bge-base-en-v1.5-int8-ov, cached under ~/.cache/openvino-models/
Default device: NPU
Exposes a deliberately small compatibility surface:
GET /healthz
GET /api/tags # Ollama-ish model listing for health scripts
POST /api/embed # Ollama-ish batched embeddings
POST /api/embeddings # Ollama-ish single embedding
POST /v1/embeddings # OpenAI-compatible embeddings response
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
from typing import Any
import openvino as ov
import openvino_genai as ovg
DEFAULT_MODEL_NAME = "bge-base-en-v1.5-int8-ov"
DEFAULT_MODEL_DIR = Path.home() / ".cache/openvino-models" / DEFAULT_MODEL_NAME
DEFAULT_PORT = 18817
NPU_BUSY_FILE = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
def npu_busy_time_us() -> int | None:
    """Return the integer stored in ``NPU_BUSY_FILE``, or ``None`` if unavailable.

    This is best-effort telemetry: any failure while reading or parsing the
    file (missing path, permission error, non-numeric content, ...) is
    swallowed and reported as ``None`` so callers never fail because of it.
    The name suggests the value is a busy-time counter in microseconds;
    that unit is presumed from the sysfs file name, not verified here.
    """
    try:
        raw_counter = NPU_BUSY_FILE.read_text()
        busy = int(raw_counter.strip())
    except Exception:
        busy = None
    return busy
class EmbeddingService:
    """Owns one OpenVINO GenAI ``TextEmbeddingPipeline`` and its metadata.

    The HTTP handler calls :meth:`embed_one` once per input text and
    :meth:`health` for the status endpoints. Pipeline calls are serialized
    through ``self.lock``.
    """

    def __init__(self, model_dir: Path, model_name: str, device: str, max_length: int) -> None:
        """Validate the model directory and device, then build the pipeline.

        Args:
            model_dir: Directory containing the exported OpenVINO model.
            model_name: Name reported back to clients in responses.
            device: OpenVINO device string; must appear verbatim in
                ``ov.Core().available_devices``.
            max_length: Value assigned to the pipeline config's ``max_length``
                (inputs are padded to it, see below).

        Raises:
            FileNotFoundError: ``model_dir`` does not exist.
            RuntimeError: ``device`` is not among the available devices.
        """
        self.model_dir = model_dir
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        self.loaded_at = time.time()
        self.lock = threading.Lock()
        # Filled in lazily by the first successful embed_one() call.
        self.embedding_dim: int | None = None

        if not self.model_dir.exists():
            raise FileNotFoundError(f"model directory not found: {self.model_dir}")

        core = ov.Core()
        self.available_devices = list(core.available_devices)
        if self.device not in self.available_devices:
            raise RuntimeError(f"OpenVINO device {self.device!r} unavailable; available={self.available_devices}")

        # Intel NPU currently needs static shape for this embedding pipeline.
        # batch_size=1 is intentional: multi-input requests are served by looping
        # one text at a time, keeping the model shape acceptable to NPUW.
        cfg = ovg.TextEmbeddingPipeline.Config()
        cfg.max_length = int(max_length)
        cfg.pad_to_max_length = True
        cfg.batch_size = 1
        self.pipeline = ovg.TextEmbeddingPipeline(self.model_dir, self.device, cfg)

    def embed_one(self, text: str) -> dict[str, Any]:
        """Embed a single text and return the vector plus timing telemetry.

        Args:
            text: Text to embed; ``None`` and other falsy values are treated
                as the empty string.

        Returns:
            A dict with ``embedding`` (list of floats), ``dim``,
            ``duration_ms`` (wall-clock time including any wait for the
            lock) and ``npu_busy_delta_us`` (``None`` when the busy counter
            could not be read).

        Raises:
            ValueError: The text is empty or whitespace-only.
        """
        text = str(text or "")
        if not text.strip():
            raise ValueError("embedding input text is empty")

        started = time.perf_counter()
        # TextEmbeddingPipeline is a native object; serialize calls until proven
        # safe under concurrent NPU use.
        with self.lock:
            # Sample the busy counter while holding the lock so the delta is
            # attributed to this call only, not to other requests that ran
            # while this one was waiting its turn.
            before = npu_busy_time_us()
            vec = self.pipeline.embed_query(text)
            after = npu_busy_time_us()

        vector = [float(x) for x in vec]
        self.embedding_dim = len(vector)
        return {
            "embedding": vector,
            "dim": len(vector),
            "duration_ms": round((time.perf_counter() - started) * 1000, 3),
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }

    def health(self) -> dict[str, Any]:
        """Return the status document served by the health endpoints."""
        return {
            "status": "ok",
            "service": "openvino-embeddings",
            "model": self.model_name,
            "model_dir": str(self.model_dir),
            "device": self.device,
            "available_devices": self.available_devices,
            "embedding_dim": self.embedding_dim,
            "max_length": self.max_length,
            "uptime_s": round(time.time() - self.loaded_at, 3),
            "npu_busy_time_us": npu_busy_time_us(),
        }
def normalize_input(value: Any) -> list[str]:
    """Coerce a request's ``input`` field into a list of texts.

    A string becomes a one-element list (even when it is empty); a non-empty
    list has each item passed through ``str()``. Anything else, including an
    empty list, is rejected.

    Raises:
        ValueError: ``value`` is neither a string nor a non-empty list.
    """
    if isinstance(value, list) and value:
        return list(map(str, value))
    if not isinstance(value, str):
        raise ValueError("input must be a non-empty string or list of strings")
    return [value]
class Handler(BaseHTTPRequestHandler):
    """HTTP front end for the embedding service.

    GET serves health and model-listing documents; POST serves the
    Ollama-style (``/api/embed``, ``/api/embeddings``) and OpenAI-style
    (``/v1/embeddings``) embedding routes. The shared service object is read
    from ``self.server.embedding_service``.
    """

    server_version = "OpenVINOEmbeddings/0.1"

    @property
    def svc(self) -> EmbeddingService:
        """The embedding service attached to the owning server."""
        return self.server.embedding_service  # type: ignore[attr-defined]

    def do_GET(self) -> None:
        """Serve health and model-listing routes; anything else is a 404."""
        # Ignore the query string and tolerate a trailing slash.
        path = self.path.split("?", 1)[0].rstrip("/") or "/"
        if path in {"/", "/healthz", "/readyz"}:
            self.write_json(self.svc.health())
        elif path == "/api/tags":
            self.write_json({"models": [{"name": self.svc.model_name, "model": self.svc.model_name}]})
        elif path == "/v1/models":
            self.write_json({"object": "list", "data": [{"id": self.svc.model_name, "object": "model", "owned_by": "local"}]})
        else:
            self.write_json({"error": "not found"}, status=404)

    def do_POST(self) -> None:
        """Serve the embedding routes.

        ``ValueError`` (bad JSON, bad input, bad Content-Length) is reported
        as 400; any other exception is reported as 500 with its type name.
        """
        path = self.path.split("?", 1)[0].rstrip("/") or "/"
        try:
            payload = self.read_json()
            if path == "/api/embed":
                texts = normalize_input(payload.get("input"))
                # One pipeline call per text: the service embeds single inputs only.
                results = [self.svc.embed_one(text) for text in texts]
                self.write_json({
                    "model": payload.get("model") or self.svc.model_name,
                    "embeddings": [item["embedding"] for item in results],
                    "embedding_dim": results[0]["dim"] if results else None,
                    # Unreadable counters (None) count as 0 in the total.
                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
                    "durations_ms": [item["duration_ms"] for item in results],
                })
            elif path == "/api/embeddings":
                # Accept either the "prompt" or the "input" key for the text.
                text = payload.get("prompt") or payload.get("input")
                result = self.svc.embed_one(str(text or ""))
                self.write_json({
                    "model": payload.get("model") or self.svc.model_name,
                    "embedding": result["embedding"],
                    "embedding_dim": result["dim"],
                    "npu_busy_delta_us": result["npu_busy_delta_us"],
                    "duration_ms": result["duration_ms"],
                })
            elif path == "/v1/embeddings":
                texts = normalize_input(payload.get("input"))
                results = [self.svc.embed_one(text) for text in texts]
                self.write_json({
                    "object": "list",
                    "model": payload.get("model") or self.svc.model_name,
                    "data": [
                        {"object": "embedding", "index": idx, "embedding": item["embedding"]}
                        for idx, item in enumerate(results)
                    ],
                    # Token counts are not computed; zeros keep the response shape.
                    "usage": {"prompt_tokens": 0, "total_tokens": 0},
                    "embedding_dim": results[0]["dim"] if results else None,
                    "npu_busy_delta_us": sum((item.get("npu_busy_delta_us") or 0) for item in results),
                    "durations_ms": [item["duration_ms"] for item in results],
                })
            else:
                self.write_json({"error": "not found"}, status=404)
        except ValueError as exc:
            self.write_json({"error": str(exc)}, status=400)
        except Exception as exc:
            self.write_json({"error": f"{type(exc).__name__}: {exc}"}, status=500)

    def read_json(self) -> dict[str, Any]:
        """Read and parse the request body as a JSON object.

        A missing or zero ``Content-Length`` is treated as an empty object.

        Raises:
            ValueError: ``Content-Length`` is negative or not an integer, the
                body is not valid JSON, or the JSON value is not an object.
        """
        try:
            length = int(self.headers.get("Content-Length") or 0)
        except (TypeError, ValueError):
            raise ValueError("Content-Length must be a non-negative integer") from None
        if length < 0:
            # rfile.read() with a negative size reads until EOF, which would
            # tie up this handler thread until the client hangs up.
            raise ValueError("Content-Length must be a non-negative integer")
        body = self.rfile.read(length).decode("utf-8", "replace") if length else "{}"
        payload = json.loads(body or "{}")
        if not isinstance(payload, dict):
            raise ValueError("JSON body must be an object")
        return payload

    def write_json(self, payload: dict[str, Any], status: int = 200) -> None:
        """Send ``payload`` as a UTF-8 JSON response with the given status."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002 - stdlib override name
        """Write access-log lines to stderr, flushed immediately."""
        print(f"{self.address_string()} - {format % args}", file=sys.stderr, flush=True)
def main(argv: list[str] | None = None) -> int:
    """Parse options, load the embedding model, and serve HTTP until interrupted.

    Every option defaults to its ``OPENVINO_EMBED_*`` environment variable
    and then to the built-in default.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, matching the original no-argument call.

    Returns:
        ``0`` after the server stops on ``KeyboardInterrupt``. Errors while
        loading the model or binding the port propagate to the caller.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default=os.environ.get("OPENVINO_EMBED_HOST", "0.0.0.0"))
    parser.add_argument("--port", type=int, default=int(os.environ.get("OPENVINO_EMBED_PORT", DEFAULT_PORT)))
    parser.add_argument("--model-dir", default=os.environ.get("OPENVINO_EMBED_MODEL_DIR", str(DEFAULT_MODEL_DIR)))
    parser.add_argument("--model-name", default=os.environ.get("OPENVINO_EMBED_MODEL", DEFAULT_MODEL_NAME))
    parser.add_argument("--device", default=os.environ.get("OPENVINO_EMBED_DEVICE", "NPU"))
    parser.add_argument("--max-length", type=int, default=int(os.environ.get("OPENVINO_EMBED_MAX_LENGTH", "512")))
    args = parser.parse_args(argv)

    # Build the service first so a bad model path or device fails before the
    # port is bound.
    service = EmbeddingService(Path(args.model_dir).expanduser(), args.model_name, args.device, args.max_length)

    # The context manager calls server_close() on exit, so the listening
    # socket is released even when serve_forever() stops or raises.
    with ThreadingHTTPServer((args.host, args.port), Handler) as httpd:
        httpd.embedding_service = service  # type: ignore[attr-defined]
        print(
            f"openvino-embeddings listening on {args.host}:{args.port} "
            f"model={args.model_name} device={args.device}",
            flush=True,
        )
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            pass
    return 0
if __name__ == "__main__":
raise SystemExit(main())
+2 -1
View File
@@ -51,7 +51,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
env = os.environ.copy()
env.setdefault("HERMES_HOME", "/home/will/.hermes")
env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18807")
env.setdefault("OLLAMA_BASE_URL", "http://127.0.0.1:18817")
env.setdefault("RAG_EMBED_MODEL", "bge-base-en-v1.5-int8-ov")
env.setdefault("N8N_URL", "http://127.0.0.1:18808")
env.setdefault("OBSIDIAN_REINDEX_URL", "http://127.0.0.1:18810")
+19
View File
@@ -0,0 +1,19 @@
[Unit]
Description=OpenVINO NPU Embeddings HTTP Service (port 18817)
After=network.target
[Service]
Type=simple
WorkingDirectory=/home/will/lab/swarm
ExecStart=/home/will/.venvs/npu/bin/python /home/will/lab/swarm/scripts/openvino-embeddings-server.py
Restart=on-failure
RestartSec=5
Environment=OPENVINO_EMBED_PORT=18817
Environment=OPENVINO_EMBED_HOST=0.0.0.0
Environment=OPENVINO_EMBED_DEVICE=NPU
Environment=OPENVINO_EMBED_MODEL=bge-base-en-v1.5-int8-ov
Environment=OPENVINO_EMBED_MODEL_DIR=/home/will/.cache/openvino-models/bge-base-en-v1.5-int8-ov
Environment=OPENVINO_EMBED_MAX_LENGTH=512
[Install]
WantedBy=default.target
@@ -9,6 +9,8 @@ Restart=on-failure
RestartSec=5
Environment=PORT=18814
Environment=RAG_HEALTH_TIMEOUT=180
Environment=OLLAMA_BASE_URL=http://127.0.0.1:18817
Environment=RAG_EMBED_MODEL=bge-base-en-v1.5-int8-ov
[Install]
WantedBy=default.target