From d67c2591878cdee3f24114d035debf3e945ce365 Mon Sep 17 00:00:00 2001
From: Atlas Ops
Date: Thu, 4 Jun 2026 11:32:31 -0700
Subject: [PATCH] docs: add OpenVINO NPU services runbook

---
 scripts/npu-service-health.sh                 | 136 +++++++++
 .../Resources/Service Catalog.md              |  12 +-
 .../Runbooks/OpenVINO NPU Services Runbook.md | 268 ++++++++++++++++++
 3 files changed, 415 insertions(+), 1 deletion(-)
 create mode 100755 scripts/npu-service-health.sh
 create mode 100644 swarm-common/obsidian-vault/will/will-shared-zap/Runbooks/OpenVINO NPU Services Runbook.md

diff --git a/scripts/npu-service-health.sh b/scripts/npu-service-health.sh
new file mode 100755
index 0000000..d574ecb
--- /dev/null
+++ b/scripts/npu-service-health.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Read-only health probe for Will's local OpenVINO/NPU services.
+# This script intentionally does not start, stop, restart, enable, reindex, or route anything.
+#
+# Environment overrides: BUSY_PATH, CURL_TIMEOUT, EMBED_MODEL, EMBED_URL, PYTHON_BIN.
+# Exit codes: 0 ok; 2 busy counter missing or non-numeric; 3 embeddings request failed;
+# 4 no positive sysfs busy-time delta.
+
+BUSY_PATH=${BUSY_PATH:-/sys/class/accel/accel0/device/npu_busy_time_us}
+CURL_TIMEOUT=${CURL_TIMEOUT:-8}
+EMBED_MODEL=${EMBED_MODEL:-bge-base-en-v1.5-int8-ov}
+EMBED_URL=${EMBED_URL:-http://127.0.0.1:18817/v1/embeddings}
+# Prefer python3: the embedded summary program below uses f-strings, and a bare
+# `python` may be absent or Python 2. `python` is only used when python3 is not found.
+PYTHON_BIN=${PYTHON_BIN:-$(command -v python3 || command -v python || printf 'python3')}
+
+# have CMD: succeed when `command -v` can resolve CMD.
+have() { command -v "$1" >/dev/null 2>&1; }
+
+# json_pretty: pretty-print JSON from stdin with jq, else with Python's json.tool.
+json_pretty() {
+  if have jq; then
+    jq .
+  else
+    "$PYTHON_BIN" -m json.tool
+  fi
+}
+
+# section TITLE: print a blank line followed by an "== TITLE ==" banner.
+section() {
+  printf '\n== %s ==\n' "$1"
+}
+
+# http_json NAME URL: GET URL and pretty-print its JSON body under a "[NAME] URL" heading.
+# Returns 1, after printing a status line, when curl or the JSON formatter fails;
+# pipefail (set above) is what lets a curl failure surface through the pipeline.
+http_json() {
+  local name=$1 url=$2
+  printf '\n[%s] %s\n' "$name" "$url"
+  if ! curl -fsS --max-time "$CURL_TIMEOUT" "$url" | json_pretty; then
+    printf 'status=unavailable_or_non_json\n'
+    return 1
+  fi
+}
+
+# busy_value: print the contents of BUSY_PATH without newlines, or "missing" if unreadable.
+busy_value() {
+  if [[ -r "$BUSY_PATH" ]]; then
+    tr -d '\n' < "$BUSY_PATH"
+  else
+    printf 'missing'
+  fi
+}
+
+section "NPU counter"
+printf 'busy_path=%s\n' "$BUSY_PATH"
+printf 'busy_time_us=%s\n' "$(busy_value)"
+
+section "Listeners"
+# `|| true`: no matching listener (grep exit 1) is a valid finding, not a script failure.
+ss -ltnp | grep -E ':(18810|18814|18816|18817|18818|18819|18820|18828|18829)\b' || true
+
+section "User service states"
+for unit in \
+  openvino-embeddings.service \
+  rag-embedding-health.service \
+  openvino-reranker.service \
+  openvino-router-classifier.service \
+  openvino-genai-npu-worker.service; do
+  # Tolerate non-zero exits and keep whatever state text was printed; empty output shows as "unknown".
+  active=$(systemctl --user is-active "$unit" 2>/dev/null || true)
+  enabled=$(systemctl --user is-enabled "$unit" 2>/dev/null || true)
+  printf '%-38s active=%-10s enabled=%s\n' "$unit" "${active:-unknown}" "${enabled:-unknown}"
+done
+
+section "Docker service states"
+if [[ -d /home/will/lab/swarm ]]; then
+  (cd /home/will/lab/swarm && docker compose ps whisper-server-npu 2>/dev/null) || true
+fi
+
+section "HTTP health"
+http_json "RAG endpoint" "http://127.0.0.1:18810/healthz" || true
+http_json "RAG/embedding health wrapper" "http://127.0.0.1:18814/healthz" || true
+http_json "Whisper NPU" "http://127.0.0.1:18816/health" || true
+http_json "OpenVINO embeddings" "http://127.0.0.1:18817/health" || true
+# Prototypes are expected to be unavailable until explicitly started/approved.
+http_json "NPU reranker prototype" "http://127.0.0.1:18818/readyz" || true
+http_json "NPU router classifier prototype" "http://127.0.0.1:18819/healthz" || true
+http_json "NPU GenAI worker prototype" "http://127.0.0.1:18820/healthz" || true
+
+section "Embeddings NPU busy-time proof"
+if [[ ! -r "$BUSY_PATH" ]]; then
+  printf 'result=failed reason=missing_busy_counter\n'
+  exit 2
+fi
+before=$(busy_value)
+# `|| true` stops set -e from aborting here so the failure is reported as a result line below.
+response=$(curl -fsS --max-time "$CURL_TIMEOUT" \
+  "$EMBED_URL" \
+  -H 'Content-Type: application/json' \
+  -d "{\"input\":\"non-private npu health probe\",\"model\":\"$EMBED_MODEL\"}" || true)
+after=$(busy_value)
+if [[ -z "$response" ]]; then
+  printf 'result=failed reason=embedding_request_failed before_us=%s after_us=%s\n' "$before" "$after"
+  exit 3
+fi
+# Guard the arithmetic: a non-numeric read (for example "missing" if the counter
+# vanished mid-run) would otherwise abort with an unbound-variable error under set -u.
+if [[ ! "$before" =~ ^[0-9]+$ || ! "$after" =~ ^[0-9]+$ ]]; then
+  printf 'result=failed reason=non_numeric_busy_counter before_us=%s after_us=%s\n' "$before" "$after"
+  exit 2
+fi
+delta=$((after - before))
+printf 'sysfs_before_us=%s\nsysfs_after_us=%s\nsysfs_delta_us=%s\n' "$before" "$after" "$delta"
+# The program is supplied on fd 3 so that stdin stays connected to the pipe carrying
+# the response; `python - <<'PY'` would replace stdin with the here-doc and the
+# program would always read an empty body. The summary is best-effort, hence `|| true`.
+printf '%s' "$response" | "$PYTHON_BIN" /dev/fd/3 3<<'PY' || true
+import json, sys
+try:
+    data = json.load(sys.stdin)
+except Exception as exc:
+    print(f'response_parse_error={type(exc).__name__}: {exc}')
+    raise SystemExit(0)
+print(f"response_object={data.get('object')}")
+print(f"response_model={data.get('model')}")
+print(f"response_npu_busy_delta_us={data.get('npu_busy_delta_us')}")
+print(f"embedding_count={len(data.get('data', []))}")
+PY
+if (( delta <= 0 )); then
+  printf 'result=failed reason=no_positive_sysfs_npu_delta\n'
+  exit 4
+fi
+printf 'result=ok\n'
diff --git a/swarm-common/obsidian-vault/will/will-shared-zap/Resources/Service Catalog.md b/swarm-common/obsidian-vault/will/will-shared-zap/Resources/Service Catalog.md index adf8e1b..7824688 100644 --- a/swarm-common/obsidian-vault/will/will-shared-zap/Resources/Service Catalog.md +++ b/swarm-common/obsidian-vault/will/will-shared-zap/Resources/Service Catalog.md @@ -1,7 +1,7 @@ --- type: service-catalog created: 2026-05-14T14:50:46-07:00 -updated: 2026-06-03T21:31:01-07:00 +updated: 2026-06-04T11:35:00-07:00 tags: - service-catalog - swarm @@ -54,7 +54,12 @@ Canonical index of local services, automation tools, Hermes capabilities, and wh | URL extractor | 18812 | OK 200 | URL/PDF/YouTube content extractor | `http://127.0.0.1:18812/healthz` | | Voice memo processor | 18813 | OK 200 | Voice memo processor | 
`http://127.0.0.1:18813/healthz` | | RAG/embedding health | 18814 | OK 200 | RAG/OpenVINO/Obsidian health wrapper | `http://127.0.0.1:18814/healthz` | +| Whisper OpenVINO NPU | 18816 | OK 200 / Docker healthy on 2026-06-04 | Intel NPU Whisper transcription service | `http://127.0.0.1:18816/health` | | OpenVINO embeddings | 18817 | OK 200 | Intel NPU embeddings service for live Obsidian RAG | `http://127.0.0.1:18817/health` | +| OpenVINO NPU reranker prototype | 18818 | approved prototype; not enabled live | Optional second-stage RAG reranker | `http://127.0.0.1:18818/readyz` | +| OpenVINO router/classifier prototype | 18819 | approved prototype; not enabled live | Dry-run Atlas/Hermes message classifier/router | `http://127.0.0.1:18819/healthz` | +| OpenVINO GenAI NPU worker prototype | 18820 | approved prototype; not enabled live | Bounded local background generation worker | `http://127.0.0.1:18820/healthz` | +| OpenVINO document/image triage prototype | 18828/18829 | approved foreground prototype; not enabled live | Local document/image triage with NPU embeddings stage via `:18817` | `http://127.0.0.1:18828/healthz` (or `:18829`) | | Obsidian REST HTTP | 27123 | OK 200 | Obsidian Local REST API HTTP | `http://127.0.0.1:27123/` | ## Docker services @@ -77,6 +82,7 @@ make status make local-ai-health make api-health make timers +./scripts/npu-service-health.sh ``` ## Host-side systemd/user services @@ -93,6 +99,9 @@ Important known services: | `voice-memo-processor.service` | Voice memo processing on 18813 | | `rag-embedding-health.service` | RAG/OpenVINO/Obsidian health check wrapper on 18814 | | `openvino-embeddings.service` | Intel NPU BGE embedding service on 18817 | +| `openvino-reranker.service` | Optional NPU reranker prototype on 18818; not installed/enabled without approval | +| `openvino-router-classifier.service` | Optional dry-run router/classifier prototype on 18819; not installed/enabled without approval | +| `openvino-genai-npu-worker.service` | Optional bounded GenAI 
worker prototype on 18820; not installed/enabled without approval | Useful checks: @@ -275,6 +284,7 @@ Profile Model Gateway Alias Distribu | Hermes CLI/toolsets/gateway/profiles | Hermes skill `hermes-agent`; `hermes --help`; `hermes tools list` | | Obsidian automation workflows | `~/lab/swarm/swarm-common/n8n-workflows/obsidian-*.json` | | Runbooks | [[Runbooks Home]] | +| OpenVINO NPU service operations | [[OpenVINO NPU Services Runbook]]; `~/lab/swarm/scripts/npu-service-health.sh` | ## Safety notes diff --git a/swarm-common/obsidian-vault/will/will-shared-zap/Runbooks/OpenVINO NPU Services Runbook.md b/swarm-common/obsidian-vault/will/will-shared-zap/Runbooks/OpenVINO NPU Services Runbook.md new file mode 100644 index 0000000..05d3206 --- /dev/null +++ b/swarm-common/obsidian-vault/will/will-shared-zap/Runbooks/OpenVINO NPU Services Runbook.md @@ -0,0 +1,268 @@ +--- +type: runbook +system: openvino-npu-services +status: draft +created: 2026-06-04 +updated: 2026-06-04 +tags: + - runbook + - openvino + - npu + - swarm + - atlas +related: + - [[Service Catalog]] + - [[Swarm Operating Manual]] + - [[Atlas Capability Upgrade Program]] +--- + +# OpenVINO NPU Services Runbook + +This runbook is the integrated operations view for Will's local Intel NPU/OpenVINO services from the `npu-capability-expansion` board. + +Safety posture: +- Do not restart the live Atlas/Hermes gateway from this runbook. +- Do not change primary Atlas/Hermes routing without explicit Will approval. +- Do not delete, overwrite, or in-place reindex existing Chroma/vector collections. +- Treat HTTP 200 as necessary but not sufficient for NPU-backed services; verify `/sys/class/accel/accel0/device/npu_busy_time_us` before/after an inference. +- Keep endpoints local-only unless Will explicitly approves broader exposure. +- Keep raw prompts, private documents, OCR text, and secrets out of logs and durable handoffs. 
+ +## Current service map + +| Capability | Port | Runtime / service | Path | State | Health endpoint | NPU proof | +| --- | ---: | --- | --- | --- | --- | --- | +| Obsidian/RAG endpoint | 18810 | `obsidian-reindex-endpoint.service` / local Python endpoint | `~/lab/swarm/scripts/` | live baseline; uses collection `obsidian_bge_npu` | `http://127.0.0.1:18810/healthz` | indirect via embeddings `:18817`; do not mutate existing collection | +| RAG/embedding health wrapper | 18814 | `rag-embedding-health.service` | `~/lab/swarm/swarm-common/rag-embedding-health.service` | live baseline | `http://127.0.0.1:18814/healthz` | should exercise embeddings path when configured | +| Whisper transcription, OpenVINO NPU | 18816 | Docker Compose service/container `whisper-server-npu` | `~/lab/swarm/whisper-openvino-npu/` | live baseline | `http://127.0.0.1:18816/health` | transcription response includes `npu_busy_delta_us`; sysfs delta must increase | +| OpenVINO embeddings | 18817 | user systemd `openvino-embeddings.service` | `~/lab/swarm/scripts/openvino-embeddings-server.py`; unit in `~/lab/swarm/swarm-common/openvino-embeddings.service` | live baseline, enabled | `http://127.0.0.1:18817/health` | embedding response and sysfs delta must be positive | +| NPU reranker prototype | 18818 | optional user systemd `openvino-reranker.service` | `~/lab/swarm/openvino-reranker-npu/` | approved prototype; not installed/enabled | `http://127.0.0.1:18818/readyz` | `/readyz` reports `device=NPU`; `/v1/rerank` response and sysfs delta must be positive | +| NPU router/classifier prototype | 18819 | optional user systemd `openvino-router-classifier.service` | `~/lab/swarm/openvino-classifier-npu/` | approved prototype; not installed/enabled | `http://127.0.0.1:18819/healthz` | `/v1/classify` response has positive `npu_busy_delta_us` and `sysfs_npu_busy_delta_us` | +| Small OpenVINO GenAI NPU worker | 18820 | optional user systemd `openvino-genai-npu-worker.service` | 
`~/lab/swarm/openvino-genai-npu-worker/` | approved prototype; not installed/enabled | `http://127.0.0.1:18820/healthz`; `GET /models` | generation response includes positive `npu_busy_delta_us` | +| Document/image triage prototype | 18828 or 18829 for review only | foreground local-only server; no persistent unit yet | `~/lab/swarm/openvino-doc-image-triage-npu/` | approved prototype; not installed/enabled | `http://127.0.0.1:18828/healthz` (or `:18829`); `GET /models` | v1 NPU stage is semantic embedding through `:18817`; image classification/OCR remain CPU/local | + +Port notes: +- `18818`, `18819`, and `18820` are reserved prototype ports from the program plan; check listeners before binding. +- `18820` was used by the GenAI worker prototype. The document/image triage prototype README still contains a `18820` example, but review used `18828`/`18829` to avoid collision. Prefer `18828`/`18829` for triage foreground review until Will approves a final persistent port. +- Existing `:18817` is currently bound on `0.0.0.0` by the user service; prototype services should still default to `127.0.0.1`. + +## Read-only unified health check + +From the swarm repo: + +```bash +cd ~/lab/swarm +./scripts/npu-service-health.sh +``` + +The script is read-only. It checks listeners, user service state, Docker Compose state for `whisper-server-npu`, JSON health endpoints, and performs a non-private embeddings request while measuring `/sys/class/accel/accel0/device/npu_busy_time_us` before and after. A positive sysfs delta is required for the embeddings proof. + +Manual minimal checks: + +```bash +BUSY=/sys/class/accel/accel0/device/npu_busy_time_us +cat "$BUSY" +ss -ltnp | grep -E ':(18810|18814|18816|18817|18818|18819|18820|18828|18829)\b' || true +systemctl --user is-active openvino-embeddings.service rag-embedding-health.service +cd ~/lab/swarm && docker compose ps whisper-server-npu +curl -fsS http://127.0.0.1:18817/health | jq . 
+``` + +Embedding NPU proof: + +```bash +BUSY=/sys/class/accel/accel0/device/npu_busy_time_us +before=$(cat "$BUSY") +curl -fsS http://127.0.0.1:18817/v1/embeddings \ + -H 'Content-Type: application/json' \ + -d '{"input":"non-private npu health probe","model":"bge-base-en-v1.5-int8-ov"}' | jq '{model, object, npu_busy_delta_us, embedding_count:(.data|length)}' +after=$(cat "$BUSY") +echo "sysfs_npu_busy_delta_us=$((after-before))" +``` + +A healthy NPU path has: +- HTTP success from the endpoint. +- Response-level `npu_busy_delta_us > 0` when the service reports it. +- Sysfs `after - before > 0`. + +## Service-specific smoke checks + +### Whisper NPU (`:18816`) + +```bash +curl -fsS http://127.0.0.1:18816/health | jq . +# For a real transcription smoke, use a small non-private WAV fixture only. +# Verify both response npu_busy_delta_us and sysfs busy-time delta. +``` + +Operational notes: +- Managed as Docker Compose service/container `whisper-server-npu` in `~/lab/swarm`. +- Consistent with existing swarm service patterns because it is a containerized service with Compose health. +- Do not restart it from this runbook unless Will asked for remediation. + +### OpenVINO embeddings (`:18817`) + +```bash +systemctl --user status openvino-embeddings.service --no-pager +curl -fsS http://127.0.0.1:18817/health | jq . +``` + +Operational notes: +- User systemd unit: `openvino-embeddings.service`. +- Model: `bge-base-en-v1.5-int8-ov`. +- Model directory: `/home/will/.cache/openvino-models/bge-base-en-v1.5-int8-ov`. +- Live RAG `:18810` uses Chroma collection `obsidian_bge_npu` through this service. Do not reindex or replace this collection in place. 
+ +### Reranker prototype (`:18818`) + +Foreground review start only, after confirming port is free: + +```bash +ss -ltnp | grep ':18818\b' || true +cd ~/lab/swarm/openvino-reranker-npu +source /home/will/.venvs/openvino-reranker/bin/activate +OPENVINO_RERANKER_HOST=127.0.0.1 \ +OPENVINO_RERANKER_PORT=18818 \ +OPENVINO_RERANKER_DEVICE=NPU \ +OPENVINO_RERANKER_MODEL_DIR=/home/will/.cache/openvino-models/rerankers/ms-marco-MiniLM-L6-v2-int8-ov \ +python server.py +``` + +From another shell: + +```bash +curl -fsS http://127.0.0.1:18818/readyz | jq . +python ~/lab/swarm/openvino-reranker-npu/smoke.py --url http://127.0.0.1:18818 +``` + +Approval gate: +- May be installed as `openvino-reranker.service` only after foreground smoke and Will approval. +- May be integrated into RAG only behind disabled-by-default knobs such as `RAG_RERANK_ENABLED=false`; request-time reranking must not mutate Chroma. + +### Router/classifier prototype (`:18819`) + +Foreground review start only, after confirming port is free: + +```bash +ss -ltnp | grep ':18819\b' || true +cd ~/lab/swarm/openvino-classifier-npu +/home/will/.venvs/npu/bin/python router_classifier.py --host 127.0.0.1 --port 18819 +``` + +Smoke: + +```bash +curl -fsS http://127.0.0.1:18819/healthz | jq . +curl -fsS http://127.0.0.1:18819/v1/classify \ + -H 'Content-Type: application/json' \ + -d '{"id":"smoke","text":"Urgent: check whether port 18817 is listening and inspect systemd logs.","options":{"include_evidence":true,"dry_run":true}}' | jq . +``` + +Approval gate: +- May be installed as `openvino-router-classifier.service` only after Will approves live service enablement. +- Must remain dry-run and must not alter Hermes/Atlas routing, memory writes, safety confirmation flow, or outbound messages without a separate explicit approval. 
+ +### Small GenAI NPU worker (`:18820`) + +Foreground review start only, after confirming port is free: + +```bash +ss -ltnp | grep ':18820\b' || true +cd ~/lab/swarm/openvino-genai-npu-worker +/home/will/.venvs/npu/bin/python worker.py --host 127.0.0.1 --port 18820 +``` + +Smoke: + +```bash +curl -fsS http://127.0.0.1:18820/healthz | jq . +curl -fsS http://127.0.0.1:18820/models | jq . +curl -fsS http://127.0.0.1:18820/v1/worker/condense-notification \ + -H 'Content-Type: application/json' \ + -d '{"input":"Non-private smoke notification for local NPU worker.","max_new_tokens":64}' | jq . +``` + +Approval gate: +- May be installed as `openvino-genai-npu-worker.service` only after Will approves persistent service enablement. +- Must not become primary Atlas/Hermes model routing. Use only for bounded background jobs such as title, summary, notification condensation, and memory-candidate drafting. + +### Document/image triage prototype (`:18828`/`:18829` review ports) + +Foreground review start only, after confirming port is free: + +```bash +ss -ltnp | grep -E ':(18828|18829)\b' || true +cd ~/lab/swarm/openvino-doc-image-triage-npu +/home/will/.venvs/npu/bin/python server.py --host 127.0.0.1 --port 18828 --allowed-root "$PWD" +``` + +Smoke: + +```bash +curl -fsS http://127.0.0.1:18828/healthz | jq . +curl -fsS http://127.0.0.1:18828/models | jq . +/home/will/.venvs/npu/bin/python tests/smoke_test.py +``` + +Approval gate: +- Do not point it at arbitrary directories; allowed roots must be equal to or under configured roots. +- Do not include raw OCR text or full source paths unless Will explicitly asks for a one-off response. +- v1 only uses the NPU through `:18817` embeddings for needs-attention; image category classification and OCR are CPU/local fallbacks. 
+ +## Systemd and Compose recommendations + +Recommended management split: +- Keep containerized services in Docker Compose when they already have Docker build/runtime shape and Compose health (`whisper-server-npu`). +- Keep host-side OpenVINO Python prototypes as user systemd services when they depend on local venvs, sysfs NPU access, model caches, and localhost-only APIs (`openvino-embeddings`, optional reranker/classifier/GenAI worker). +- Do not add the prototypes to the live gateway or primary routing during installation. Installation and routing are separate approval gates. + +User-systemd unit expectations for optional prototypes: +- `WorkingDirectory` points at the service directory under `~/lab/swarm/`. +- `ExecStart` uses the existing venv path documented by the prototype. +- `Environment` pins host to `127.0.0.1`, port, model path, device `NPU`, and any upstream endpoint. +- `Restart=on-failure`, not aggressive restart loops. +- Logs go to user journal; do not log raw request bodies. +- Start manually for smoke; enable on boot only after Will approval. + +Compose expectations for existing swarm services: +- Prefer `cd ~/lab/swarm && make ps`, `make status`, and a targeted `docker compose ps SERVICE` (for example `docker compose ps whisper-server-npu`) for read-only checks. +- Do not run `docker compose up -d`, restart containers, pull images, or prune volumes from this runbook without approval. + +## Monitoring and logging notes + +Minimum recurring monitoring should include: +- Listener presence for `18816`, `18817`, and any approved optional prototype ports. +- User service state for `openvino-embeddings.service` and any approved optional prototype unit. +- Docker Compose health for `whisper-server-npu`. +- HTTP health endpoint success. +- Positive sysfs NPU busy-time delta on at least one non-private inference probe, preferably embeddings `:18817` because it is already live and central. +- Journal/container logs only at summary level. 
Avoid raw prompts, raw OCR text, private document names, credentials, and API keys. + +Useful log commands: + +```bash +journalctl --user -u openvino-embeddings.service -n 100 --no-pager +journalctl --user -u rag-embedding-health.service -n 100 --no-pager +journalctl --user -u openvino-reranker.service -n 100 --no-pager +journalctl --user -u openvino-router-classifier.service -n 100 --no-pager +journalctl --user -u openvino-genai-npu-worker.service -n 100 --no-pager +cd ~/lab/swarm && docker compose logs --tail 100 whisper-server-npu +``` + +## Approval gates + +Requires explicit Will approval before proceeding: +- Installing, enabling, or autostarting `openvino-reranker.service`, `openvino-router-classifier.service`, or `openvino-genai-npu-worker.service`. +- Assigning a final persistent port to document/image triage or enabling it as a persistent service. +- Enabling live RAG reranking or any request path that changes Atlas/RAG answers. +- Changing primary Atlas/Hermes routing or connecting router/classifier outputs to live decisions. +- Connecting the GenAI worker to primary Atlas chat, gateway routing, memory writes, or outbound notifications. +- Restarting the live Atlas/Hermes gateway. +- Deleting, overwriting, or in-place reindexing existing vector collections. +- Broadening bind addresses or exposure beyond local-only defaults. + +Approved/parked outcomes: +- Built/approved prototypes: reranker (`:18818`), router/classifier (`:18819`), small GenAI worker (`:18820`), document/image triage (review ports `:18828`/`:18829`). +- Live baseline retained: Whisper NPU (`:18816`), OpenVINO embeddings (`:18817`), RAG endpoint (`:18810`) using `obsidian_bge_npu`. +- Parked: always-on wake-word/audio and conventional vision detection until Will wants a concrete use case. +- Rejected for this NPU program: diffusion/image generation.