docs: add OpenVINO NPU services runbook

This commit is contained in:
Atlas Ops
2026-06-04 11:32:31 -07:00
committed by William Valentin
parent 4003198ba9
commit d67c259187
3 changed files with 389 additions and 1 deletions
+110
View File
@@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Read-only health probe for Will's local OpenVINO/NPU services.
# By design nothing here starts, stops, restarts, enables, reindexes, or
# routes anything.
set -o errexit -o nounset -o pipefail

# Settings. Each one may be overridden from the environment; the value after
# ":=" applies only when the variable is unset or empty.
# sysfs file holding the NPU busy-time counter.
: "${BUSY_PATH:=/sys/class/accel/accel0/device/npu_busy_time_us}"
# Passed to curl as --max-time for every HTTP request the script makes.
: "${CURL_TIMEOUT:=8}"
# Model name sent in the embeddings probe request body.
: "${EMBED_MODEL:=bge-base-en-v1.5-int8-ov}"
# Embeddings endpoint used for the NPU busy-time proof.
: "${EMBED_URL:=http://127.0.0.1:18817/v1/embeddings}"
# Exit 0 when "$1" names something the shell can run (as resolved by
# `command -v`), non-zero otherwise. Prints nothing on either stream.
have() {
  local tool=$1
  command -v "$tool" &>/dev/null
}
# Pretty-print the JSON document arriving on stdin to stdout.
# Formatter preference: jq, then Python's json.tool via python3, then via
# python. python3 is tried before python because many hosts ship no
# unversioned "python" binary, which would otherwise make every JSON health
# check look like a failure when jq is not installed.
# Returns: the formatter's exit status, or 127 (with a message on stderr)
#          when no formatter is available.
json_pretty() {
  if have jq; then
    jq .
  elif have python3; then
    python3 -m json.tool
  elif have python; then
    python -m json.tool
  else
    printf 'json_pretty: need jq, python3, or python on PATH\n' >&2
    return 127
  fi
}
# Write a section banner to stdout: an empty line, then "== <title> ==".
# Arguments: $1 - banner title.
section() {
  local title=$1
  printf '\n== %s ==\n' "$title"
}
# GET a JSON endpoint and pretty-print the body under a "[label] url" header.
# Arguments: $1 - human-readable label, $2 - URL to fetch.
# Globals:   CURL_TIMEOUT (read) - passed to curl as --max-time.
# Returns:   0 when the fetch-and-format pipeline succeeds; otherwise prints
#            "status=unavailable_or_non_json" and returns 1.
# Note: a curl failure is only visible through the pipeline status because
# the script runs with pipefail enabled.
http_json() {
  local label=$1 endpoint=$2
  printf '\n[%s] %s\n' "$label" "$endpoint"
  if curl -fsS --max-time "$CURL_TIMEOUT" "$endpoint" | json_pretty; then
    return 0
  fi
  printf 'status=unavailable_or_non_json\n'
  return 1
}
# Print the current contents of the busy counter file with newlines removed,
# or the literal word "missing" when the file is not readable.
# Globals: BUSY_PATH (read).
busy_value() {
  if [[ ! -r "$BUSY_PATH" ]]; then
    printf 'missing'
    return
  fi
  tr -d '\n' < "$BUSY_PATH"
}
# --- NPU busy counter: show the configured path and the value read right now.
section "NPU counter"
printf 'busy_path=%s\n' "$BUSY_PATH"
busy_now=$(busy_value)
printf 'busy_time_us=%s\n' "$busy_now"

# --- Listening TCP sockets on the service ports. grep exits non-zero when
# nothing matches, so the pipeline is allowed to fail.
section "Listeners"
listener_ports='18810|18814|18816|18817|18818|18819|18820|18828|18829'
ss -ltnp | grep -E ":(${listener_ports})\\b" || true

# --- systemd user units: one line per unit with its active and enabled state.
# Either query may fail (its stderr is discarded); an empty answer is shown
# as "unknown".
section "User service states"
user_units=(
  openvino-embeddings.service
  rag-embedding-health.service
  openvino-reranker.service
  openvino-router-classifier.service
  openvino-genai-npu-worker.service
)
for unit in "${user_units[@]}"; do
  active_state=$(systemctl --user is-active "$unit" 2>/dev/null || true)
  enabled_state=$(systemctl --user is-enabled "$unit" 2>/dev/null || true)
  printf '%-38s active=%-10s enabled=%s\n' "$unit" "${active_state:-unknown}" "${enabled_state:-unknown}"
done
# --- Docker Compose state of the whisper-server-npu service.
# The Compose project directory defaults to /home/will/lab/swarm and can be
# overridden with SWARM_DIR, in line with the script's other env-overridable
# settings. Failures of `docker compose ps` are tolerated (stderr discarded),
# since this probe only reports state.
section "Docker service states"
swarm_dir=${SWARM_DIR:-/home/will/lab/swarm}
if [[ -d "$swarm_dir" ]]; then
  (cd "$swarm_dir" && docker compose ps whisper-server-npu 2>/dev/null) || true
else
  # Say why the section is empty instead of printing nothing at all.
  printf 'status=skipped reason=swarm_dir_missing path=%s\n' "$swarm_dir"
fi
# --- HTTP health endpoints, probed in the order listed. Each entry is
# "label|url"; labels never contain "|". A failing probe is reported by
# http_json itself and must not abort the script.
section "HTTP health"
health_targets=(
  "RAG endpoint|http://127.0.0.1:18810/healthz"
  "RAG/embedding health wrapper|http://127.0.0.1:18814/healthz"
  "Whisper NPU|http://127.0.0.1:18816/health"
  "OpenVINO embeddings|http://127.0.0.1:18817/health"
  # Prototypes are expected to be unavailable until explicitly started/approved.
  "NPU reranker prototype|http://127.0.0.1:18818/readyz"
  "NPU router classifier prototype|http://127.0.0.1:18819/healthz"
  "NPU GenAI worker prototype|http://127.0.0.1:18820/healthz"
)
for target in "${health_targets[@]}"; do
  http_json "${target%%|*}" "${target#*|}" || true
done
# --- Embeddings NPU busy-time proof.
# Sends one non-private embeddings request and requires the sysfs busy counter
# to advance across it.
# Exit codes: 2 = busy counter unreadable or not numeric,
#             3 = embeddings request failed / empty response,
#             4 = counter did not advance.
section "Embeddings NPU busy-time proof"
if [[ ! -r "$BUSY_PATH" ]]; then
  printf 'result=failed reason=missing_busy_counter\n'
  exit 2
fi

before=$(busy_value)
# "|| true" stops set -e from aborting on a curl error; the empty response is
# converted into an explicit failure just below.
response=$(curl -fsS --max-time "$CURL_TIMEOUT" \
  "$EMBED_URL" \
  -H 'Content-Type: application/json' \
  -d "{\"input\":\"non-private npu health probe\",\"model\":\"$EMBED_MODEL\"}" || true)
after=$(busy_value)

if [[ -z "$response" ]]; then
  printf 'result=failed reason=embedding_request_failed before_us=%s after_us=%s\n' "$before" "$after"
  exit 3
fi

# busy_value prints "missing" if the counter became unreadable mid-run, and the
# file could in principle be empty. Feeding such a value to $(( )) under
# `set -u` aborts with an unbound-variable error, so fail explicitly instead.
if [[ ! "$before" =~ ^[0-9]+$ || ! "$after" =~ ^[0-9]+$ ]]; then
  printf 'result=failed reason=non_numeric_busy_counter before_us=%s after_us=%s\n' "$before" "$after"
  exit 2
fi
# 10# forces base 10 so a value with leading zeros is not parsed as octal.
delta=$((10#$after - 10#$before))
printf 'sysfs_before_us=%s\nsysfs_after_us=%s\nsysfs_delta_us=%s\n' "$before" "$after" "$delta"

# Best-effort summary of the response body. The program is handed to Python
# with -c so that stdin stays connected to the pipe carrying the response;
# feeding the program through a here-document on stdin would replace the pipe
# and json.load(sys.stdin) would only ever see EOF.
summary_py=$(cat <<'PY'
import json, sys
try:
    data = json.load(sys.stdin)
except Exception as exc:
    print(f'response_parse_error={type(exc).__name__}: {exc}')
    raise SystemExit(0)
if not isinstance(data, dict):
    print(f'response_parse_error=unexpected top-level {type(data).__name__}')
    raise SystemExit(0)
print(f"response_object={data.get('object')}")
print(f"response_model={data.get('model')}")
print(f"response_npu_busy_delta_us={data.get('npu_busy_delta_us')}")
print(f"embedding_count={len(data.get('data') or [])}")
PY
)
# Prefer python3; many hosts have no unversioned "python".
summary_python=
if have python3; then
  summary_python=python3
elif have python; then
  summary_python=python
fi
if [[ -n "$summary_python" ]]; then
  # The summary is informational only; the verdict below depends on delta.
  printf '%s' "$response" | "$summary_python" -c "$summary_py" || true
else
  printf 'response_summary=skipped reason=no_python_interpreter\n'
fi

if (( delta <= 0 )); then
  printf 'result=failed reason=no_positive_sysfs_npu_delta\n'
  exit 4
fi
printf 'result=ok\n'
@@ -1,7 +1,7 @@
---
type: service-catalog
created: 2026-05-14T14:50:46-07:00
updated: 2026-06-03T21:31:01-07:00
updated: 2026-06-04T11:35:00-07:00
tags:
- service-catalog
- swarm
@@ -54,7 +54,12 @@ Canonical index of local services, automation tools, Hermes capabilities, and wh
| URL extractor | 18812 | OK 200 | URL/PDF/YouTube content extractor | `http://127.0.0.1:18812/healthz` |
| Voice memo processor | 18813 | OK 200 | Voice memo processor | `http://127.0.0.1:18813/healthz` |
| RAG/embedding health | 18814 | OK 200 | RAG/OpenVINO/Obsidian health wrapper | `http://127.0.0.1:18814/healthz` |
| Whisper OpenVINO NPU | 18816 | OK 200 / Docker healthy on 2026-06-04 | Intel NPU Whisper transcription service | `http://127.0.0.1:18816/health` |
| OpenVINO embeddings | 18817 | OK 200 | Intel NPU embeddings service for live Obsidian RAG | `http://127.0.0.1:18817/health` |
| OpenVINO NPU reranker prototype | 18818 | approved prototype; not enabled live | Optional second-stage RAG reranker | `http://127.0.0.1:18818/readyz` |
| OpenVINO router/classifier prototype | 18819 | approved prototype; not enabled live | Dry-run Atlas/Hermes message classifier/router | `http://127.0.0.1:18819/healthz` |
| OpenVINO GenAI NPU worker prototype | 18820 | approved prototype; not enabled live | Bounded local background generation worker | `http://127.0.0.1:18820/healthz` |
| OpenVINO document/image triage prototype | 18828/18829 | approved foreground prototype; not enabled live | Local document/image triage with NPU embeddings stage via `:18817` | `http://127.0.0.1:<port>/healthz` |
| Obsidian REST HTTP | 27123 | OK 200 | Obsidian Local REST API HTTP | `http://127.0.0.1:27123/` |
## Docker services
@@ -77,6 +82,7 @@ make status
make local-ai-health
make api-health
make timers
./scripts/npu-service-health.sh
```
## Host-side systemd/user services
@@ -93,6 +99,9 @@ Important known services:
| `voice-memo-processor.service` | Voice memo processing on 18813 |
| `rag-embedding-health.service` | RAG/OpenVINO/Obsidian health check wrapper on 18814 |
| `openvino-embeddings.service` | Intel NPU BGE embedding service on 18817 |
| `openvino-reranker.service` | Optional NPU reranker prototype on 18818; not installed/enabled without approval |
| `openvino-router-classifier.service` | Optional dry-run router/classifier prototype on 18819; not installed/enabled without approval |
| `openvino-genai-npu-worker.service` | Optional bounded GenAI worker prototype on 18820; not installed/enabled without approval |
Useful checks:
@@ -275,6 +284,7 @@ Profile Model Gateway Alias Distribu
| Hermes CLI/toolsets/gateway/profiles | Hermes skill `hermes-agent`; `hermes --help`; `hermes tools list` |
| Obsidian automation workflows | `~/lab/swarm/swarm-common/n8n-workflows/obsidian-*.json` |
| Runbooks | [[Runbooks Home]] |
| OpenVINO NPU service operations | [[OpenVINO NPU Services Runbook]]; `~/lab/swarm/scripts/npu-service-health.sh` |
## Safety notes
@@ -0,0 +1,268 @@
---
type: runbook
system: openvino-npu-services
status: draft
created: 2026-06-04
updated: 2026-06-04
tags:
- runbook
- openvino
- npu
- swarm
- atlas
related:
- [[Service Catalog]]
- [[Swarm Operating Manual]]
- [[Atlas Capability Upgrade Program]]
---
# OpenVINO NPU Services Runbook
This runbook is the integrated operations view for Will's local Intel NPU/OpenVINO services from the `npu-capability-expansion` board.
Safety posture:
- Do not restart the live Atlas/Hermes gateway from this runbook.
- Do not change primary Atlas/Hermes routing without explicit Will approval.
- Do not delete, overwrite, or in-place reindex existing Chroma/vector collections.
- Treat HTTP 200 as necessary but not sufficient for NPU-backed services; verify `/sys/class/accel/accel0/device/npu_busy_time_us` before/after an inference.
- Keep endpoints local-only unless Will explicitly approves broader exposure.
- Keep raw prompts, private documents, OCR text, and secrets out of logs and durable handoffs.
## Current service map
| Capability | Port | Runtime / service | Path | State | Health endpoint | NPU proof |
| --- | ---: | --- | --- | --- | --- | --- |
| Obsidian/RAG endpoint | 18810 | `obsidian-reindex-endpoint.service` / local Python endpoint | `~/lab/swarm/scripts/` | live baseline; uses collection `obsidian_bge_npu` | `http://127.0.0.1:18810/healthz` | indirect via embeddings `:18817`; do not mutate existing collection |
| RAG/embedding health wrapper | 18814 | `rag-embedding-health.service` | `~/lab/swarm/swarm-common/rag-embedding-health.service` | live baseline | `http://127.0.0.1:18814/healthz` | should exercise embeddings path when configured |
| Whisper transcription, OpenVINO NPU | 18816 | Docker Compose service/container `whisper-server-npu` | `~/lab/swarm/whisper-openvino-npu/` | live baseline | `http://127.0.0.1:18816/health` | transcription response includes `npu_busy_delta_us`; sysfs delta must increase |
| OpenVINO embeddings | 18817 | user systemd `openvino-embeddings.service` | `~/lab/swarm/scripts/openvino-embeddings-server.py`; unit in `~/lab/swarm/swarm-common/openvino-embeddings.service` | live baseline, enabled | `http://127.0.0.1:18817/health` | embedding response and sysfs delta must be positive |
| NPU reranker prototype | 18818 | optional user systemd `openvino-reranker.service` | `~/lab/swarm/openvino-reranker-npu/` | approved prototype; not installed/enabled | `http://127.0.0.1:18818/readyz` | `/readyz` reports `device=NPU`; `/v1/rerank` response and sysfs delta must be positive |
| NPU router/classifier prototype | 18819 | optional user systemd `openvino-router-classifier.service` | `~/lab/swarm/openvino-classifier-npu/` | approved prototype; not installed/enabled | `http://127.0.0.1:18819/healthz` | `/v1/classify` response has positive `npu_busy_delta_us` and `sysfs_npu_busy_delta_us` |
| Small OpenVINO GenAI NPU worker | 18820 | optional user systemd `openvino-genai-npu-worker.service` | `~/lab/swarm/openvino-genai-npu-worker/` | approved prototype; not installed/enabled | `http://127.0.0.1:18820/healthz`; `GET /models` | generation response includes positive `npu_busy_delta_us` |
| Document/image triage prototype | 18828 or 18829 for review only | foreground local-only server; no persistent unit yet | `~/lab/swarm/openvino-doc-image-triage-npu/` | approved prototype; not installed/enabled | `http://127.0.0.1:<port>/healthz`; `GET /models` | v1 NPU stage is semantic embedding through `:18817`; image classification/OCR remain CPU/local |
Port notes:
- `18818`, `18819`, and `18820` are reserved prototype ports from the program plan; check listeners before binding.
- `18820` was used by the GenAI worker prototype. The document/image triage prototype README still contains a `18820` example, but review used `18828`/`18829` to avoid collision. Prefer `18828`/`18829` for triage foreground review until Will approves a final persistent port.
- Existing `:18817` is currently bound on `0.0.0.0` by the user service; prototype services should still default to `127.0.0.1`.
## Read-only unified health check
From the swarm repo:
```bash
cd ~/lab/swarm
./scripts/npu-service-health.sh
```
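The script is read-only: it does not start, stop, restart, enable, reindex, or reroute anything. It checks listeners, user service state, Docker Compose state for `whisper-server-npu`, and JSON health endpoints, then performs a non-private embeddings request while measuring `/sys/class/accel/accel0/device/npu_busy_time_us` before and after. A positive sysfs delta is required for the embeddings proof; the script exits non-zero when the busy counter is unavailable (exit 2), the embeddings request fails (exit 3), or the delta is not positive (exit 4).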
Manual minimal checks:
```bash
BUSY=/sys/class/accel/accel0/device/npu_busy_time_us
cat "$BUSY"
ss -ltnp | grep -E ':(18810|18814|18816|18817|18818|18819|18820|18828|18829)\b' || true
systemctl --user is-active openvino-embeddings.service rag-embedding-health.service
cd ~/lab/swarm && docker compose ps whisper-server-npu
curl -fsS http://127.0.0.1:18817/health | jq .
```
Embedding NPU proof:
```bash
BUSY=/sys/class/accel/accel0/device/npu_busy_time_us
before=$(cat "$BUSY")
curl -fsS http://127.0.0.1:18817/v1/embeddings \
-H 'Content-Type: application/json' \
-d '{"input":"non-private npu health probe","model":"bge-base-en-v1.5-int8-ov"}' | jq '{model, object, npu_busy_delta_us, embedding_count:(.data|length)}'
after=$(cat "$BUSY")
echo "sysfs_npu_busy_delta_us=$((after-before))"
```
A healthy NPU path has:
- HTTP success from the endpoint.
- Response-level `npu_busy_delta_us > 0` when the service reports it.
- Sysfs `after - before > 0`.
## Service-specific smoke checks
### Whisper NPU (`:18816`)
```bash
curl -fsS http://127.0.0.1:18816/health | jq .
# For a real transcription smoke, use a small non-private WAV fixture only.
# Verify both response npu_busy_delta_us and sysfs busy-time delta.
```
Operational notes:
- Managed as Docker Compose service/container `whisper-server-npu` in `~/lab/swarm`.
- Consistent with existing swarm service patterns because it is a containerized service with Compose health.
- Do not restart it from this runbook unless Will asked for remediation.
### OpenVINO embeddings (`:18817`)
```bash
systemctl --user status openvino-embeddings.service --no-pager
curl -fsS http://127.0.0.1:18817/health | jq .
```
Operational notes:
- User systemd unit: `openvino-embeddings.service`.
- Model: `bge-base-en-v1.5-int8-ov`.
- Model directory: `/home/will/.cache/openvino-models/bge-base-en-v1.5-int8-ov`.
- Live RAG `:18810` uses Chroma collection `obsidian_bge_npu` through this service. Do not reindex or replace this collection in place.
### Reranker prototype (`:18818`)
Foreground review start only, after confirming the port is free:
```bash
ss -ltnp | grep ':18818\b' || true
cd ~/lab/swarm/openvino-reranker-npu
source /home/will/.venvs/openvino-reranker/bin/activate
OPENVINO_RERANKER_HOST=127.0.0.1 \
OPENVINO_RERANKER_PORT=18818 \
OPENVINO_RERANKER_DEVICE=NPU \
OPENVINO_RERANKER_MODEL_DIR=/home/will/.cache/openvino-models/rerankers/ms-marco-MiniLM-L6-v2-int8-ov \
python server.py
```
From another shell:
```bash
curl -fsS http://127.0.0.1:18818/readyz | jq .
python ~/lab/swarm/openvino-reranker-npu/smoke.py --url http://127.0.0.1:18818
```
Approval gate:
- May be installed as `openvino-reranker.service` only after foreground smoke and Will approval.
- May be integrated into RAG only behind disabled-by-default knobs such as `RAG_RERANK_ENABLED=false`; request-time reranking must not mutate Chroma.
### Router/classifier prototype (`:18819`)
Foreground review start only, after confirming the port is free:
```bash
ss -ltnp | grep ':18819\b' || true
cd ~/lab/swarm/openvino-classifier-npu
/home/will/.venvs/npu/bin/python router_classifier.py --host 127.0.0.1 --port 18819
```
Smoke:
```bash
curl -fsS http://127.0.0.1:18819/healthz | jq .
curl -fsS http://127.0.0.1:18819/v1/classify \
-H 'Content-Type: application/json' \
-d '{"id":"smoke","text":"Urgent: check whether port 18817 is listening and inspect systemd logs.","options":{"include_evidence":true,"dry_run":true}}' | jq .
```
Approval gate:
- May be installed as `openvino-router-classifier.service` only after Will approves live service enablement.
- Must remain dry-run and must not alter Hermes/Atlas routing, memory writes, safety confirmation flow, or outbound messages without a separate explicit approval.
### Small GenAI NPU worker (`:18820`)
Foreground review start only, after confirming the port is free:
```bash
ss -ltnp | grep ':18820\b' || true
cd ~/lab/swarm/openvino-genai-npu-worker
/home/will/.venvs/npu/bin/python worker.py --host 127.0.0.1 --port 18820
```
Smoke:
```bash
curl -fsS http://127.0.0.1:18820/healthz | jq .
curl -fsS http://127.0.0.1:18820/models | jq .
curl -fsS http://127.0.0.1:18820/v1/worker/condense-notification \
-H 'Content-Type: application/json' \
-d '{"input":"Non-private smoke notification for local NPU worker.","max_new_tokens":64}' | jq .
```
Approval gate:
- May be installed as `openvino-genai-npu-worker.service` only after Will approves persistent service enablement.
- Must not become primary Atlas/Hermes model routing. Use only for bounded background jobs such as title, summary, notification condensation, and memory-candidate drafting.
### Document/image triage prototype (`:18828`/`:18829` review ports)
Foreground review start only, after confirming the port is free:
```bash
ss -ltnp | grep -E ':(18828|18829)\b' || true
cd ~/lab/swarm/openvino-doc-image-triage-npu
/home/will/.venvs/npu/bin/python server.py --host 127.0.0.1 --port 18828 --allowed-root "$PWD"
```
Smoke:
```bash
curl -fsS http://127.0.0.1:18828/healthz | jq .
curl -fsS http://127.0.0.1:18828/models | jq .
/home/will/.venvs/npu/bin/python tests/smoke_test.py
```
Approval gate:
- Do not point it at arbitrary directories; allowed roots must be equal to or under configured roots.
- Do not include raw OCR text or full source paths unless Will explicitly asks for a one-off response.
- v1 only uses the NPU through `:18817` embeddings for needs-attention; image category classification and OCR are CPU/local fallbacks.
## Systemd and Compose recommendations
Recommended management split:
- Keep containerized services in Docker Compose when they already have Docker build/runtime shape and Compose health (`whisper-server-npu`).
- Keep host-side OpenVINO Python prototypes as user systemd services when they depend on local venvs, sysfs NPU access, model caches, and localhost-only APIs (`openvino-embeddings`, optional reranker/classifier/GenAI worker).
- Do not add the prototypes to the live gateway or primary routing during installation. Installation and routing are separate approval gates.
User-systemd unit expectations for optional prototypes:
- `WorkingDirectory` points at the service directory under `~/lab/swarm/`.
- `ExecStart` uses the existing venv path documented by the prototype.
- `Environment` pins host to `127.0.0.1`, port, model path, device `NPU`, and any upstream endpoint.
- `Restart=on-failure`, not aggressive restart loops.
- Logs go to user journal; do not log raw request bodies.
- Start manually for smoke; enable on boot only after Will approval.
Compose expectations for existing swarm services:
- Prefer `cd ~/lab/swarm && make ps`, `make status`, and targeted `docker compose ps <service>` for read-only checks.
- Do not run `docker compose up -d`, restart containers, pull images, or prune volumes from this runbook without approval.
## Monitoring and logging notes
Minimum recurring monitoring should include:
- Listener presence for `18816`, `18817`, and any approved optional prototype ports.
- User service state for `openvino-embeddings.service` and any approved optional prototype unit.
- Docker Compose health for `whisper-server-npu`.
- HTTP health endpoint success.
- Positive sysfs NPU busy-time delta on at least one non-private inference probe, preferably embeddings `:18817` because it is already live and central.
- Journal/container logs only at summary level. Avoid raw prompts, raw OCR text, private document names, credentials, and API keys.
Useful log commands:
```bash
journalctl --user -u openvino-embeddings.service -n 100 --no-pager
journalctl --user -u rag-embedding-health.service -n 100 --no-pager
journalctl --user -u openvino-reranker.service -n 100 --no-pager
journalctl --user -u openvino-router-classifier.service -n 100 --no-pager
journalctl --user -u openvino-genai-npu-worker.service -n 100 --no-pager
cd ~/lab/swarm && docker compose logs --tail 100 whisper-server-npu
```
## Approval gates
Requires explicit Will approval before proceeding:
- Installing, enabling, or autostarting `openvino-reranker.service`, `openvino-router-classifier.service`, or `openvino-genai-npu-worker.service`.
- Assigning a final persistent port to document/image triage or enabling it as a persistent service.
- Enabling live RAG reranking or any request path that changes Atlas/RAG answers.
- Changing primary Atlas/Hermes routing or connecting router/classifier outputs to live decisions.
- Connecting the GenAI worker to primary Atlas chat, gateway routing, memory writes, or outbound notifications.
- Restarting the live Atlas/Hermes gateway.
- Deleting, overwriting, or in-place reindexing existing vector collections.
- Broadening bind addresses or exposure beyond local-only defaults.
Approved/parked outcomes:
- Built/approved prototypes: reranker (`:18818`), router/classifier (`:18819`), small GenAI worker (`:18820`), document/image triage (review ports `:18828`/`:18829`).
- Live baseline retained: Whisper NPU (`:18816`), OpenVINO embeddings (`:18817`), RAG endpoint (`:18810`) using `obsidian_bge_npu`.
- Parked: always-on wake-word/audio and conventional vision detection until Will wants a concrete use case.
- Rejected for this NPU program: diffusion/image generation.