From 7b80c1e7a425d9d8ebae0ee95b01ce47b50919a2 Mon Sep 17 00:00:00 2001
From: William Valentin <william.valentin.info@gmail.com>
Date: Mon, 23 Feb 2026 22:38:18 -0800
Subject: [PATCH] docs(eval): record guard-coverage preprobe window and updated
 gates

---
 README.md                                     |   3 +
 .../pi_embedded_eval_window_a_2026-02-24.json |   2 +-
 .../pi_embedded_eval_window_a_2026-02-24.md   |   6 +
 ...al_window_b_2026-02-24_post_fallbacks.json |   2 +-
 ...eval_window_b_2026-02-24_post_fallbacks.md |   6 +
 ...al_window_c_2026-02-24_guard_preprobe.json | 162 ++++++++++++++++++
 ...eval_window_c_2026-02-24_guard_preprobe.md |  62 +++++++
 docs/plans/pi_embedded_evaluation.md          |  45 +++++
 docs/plans/state.json                         |   8 +-
 9 files changed, 291 insertions(+), 5 deletions(-)
 create mode 100644 docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json
 create mode 100644 docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md

diff --git a/README.md b/README.md
index ff38d14..ccaf849 100644
--- a/README.md
+++ b/README.md
@@ -377,6 +377,9 @@ pnpm audit:backend-canary \
   --format markdown
 ```
 
+For controlled guardrail probe windows, also pass the guard-coverage gate flags:
+`--gate-min-guard-pi-no-tools-count 1 --gate-min-guard-capability-query-count 1 --gate-min-guard-attachments-present-count 1`
+
 Phase-2 evaluation checklist and decision template: `docs/plans/pi_embedded_evaluation.md`.
 
 When `args` is non-empty:
diff --git a/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json
index 4f394bf..d1e6ac5 100644
--- a/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json
+++ b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json
@@ -1,5 +1,5 @@
 {
-  "generated_at": "2026-02-24T06:33:31.111Z",
+  "generated_at": "2026-02-24T06:37:06.279Z",
   "event_count": 45,
   "filters": {},
   "options": {
diff --git a/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md
index 8cae403..1481259 100644
--- a/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md
+++ b/docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.md
@@ -11,6 +11,12 @@
 | pi_embedded | 8 |
 | native | 2 |
 
+### Forced Native Guards
+
+| Guard reason | Count |
+| --- | ---: |
+| _none_ | 0 |
+
 ## Reliability
 
 | Metric | Target | Baseline | Delta |
diff --git a/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json b/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json
index ca4205d..d4e9cfb 100644
--- a/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json
+++ b/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json
@@ -1,5 +1,5 @@
 {
-  "generated_at": "2026-02-24T06:33:37.931Z",
+  "generated_at": "2026-02-24T06:37:06.252Z",
   "event_count": 24,
   "filters": {
     "since_ms": 1771913640000
diff --git a/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md b/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md
index 0377817..d1bdb77 100644
--- a/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md
+++ b/docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md
@@ -10,6 +10,12 @@
 | --- | ---: |
 | pi_embedded | 6 |
 
+### Forced Native Guards
+
+| Guard reason | Count |
+| --- | ---: |
+| _none_ | 0 |
+
 ## Reliability
 
 | Metric | Target | Baseline | Delta |
diff --git a/docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json b/docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json
new file mode 100644
index 0000000..385b2d9
--- /dev/null
+++ b/docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json
@@ -0,0 +1,162 @@
+{
+  "generated_at": "2026-02-24T06:37:21.443Z",
+  "event_count": 45,
+  "filters": {},
+  "options": {
+    "targetBackend": "pi_embedded",
+    "baselineBackend": "native",
+    "sessionIds": [
+      "telegram:8367012007"
+    ]
+  },
+  "summary": {
+    "route_stats": {
+      "total": 10,
+      "by_backend": {
+        "native": 2,
+        "pi_embedded": 8
+      },
+      "by_source": {
+        "agent_override": 10
+      },
+      "forced_native_guards": {}
+    },
+    "target": {
+      "backend": "pi_embedded",
+      "routes": 8,
+      "completed_turns": 8,
+      "incomplete_turns": 0,
+      "completion_rate_pct": 100,
+      "e2e_latency_ms": {
+        "count": 8,
+        "avg_ms": 4615,
+        "p50_ms": 3240,
+        "p95_ms": 8776,
+        "min_ms": 1859,
+        "max_ms": 9381
+      }
+    },
+    "baseline": {
+      "backend": "native",
+      "routes": 2,
+      "completed_turns": 2,
+      "incomplete_turns": 0,
+      "completion_rate_pct": 100,
+      "e2e_latency_ms": {
+        "count": 2,
+        "avg_ms": 2981,
+        "p50_ms": 2981,
+        "p95_ms": 3081,
+        "min_ms": 2870,
+        "max_ms": 3092
+      }
+    },
+    "target_external_attempts": {
+      "attempts": 8,
+      "successes": 6,
+      "fallbacks": 2,
+      "unresolved_attempts": 0,
+      "success_rate_pct": 75,
+      "attempt_latency_ms": {
+        "count": 8,
+        "avg_ms": 3961,
+        "p50_ms": 2636,
+        "p95_ms": 8766,
+        "min_ms": 135,
+        "max_ms": 9371
+      }
+    },
+    "comparison": {
+      "completion_rate_delta_pp": 0,
+      "p50_latency_delta_ms": 259,
+      "p95_latency_delta_ms": 5695
+    },
+    "fallback_categories": [
+      {
+        "category": "empty_assistant_text",
+        "count": 1,
+        "pct": 50
+      },
+      {
+        "category": "pi_module_interface",
+        "count": 1,
+        "pct": 50
+      }
+    ],
+    "fallback_top_reasons": [
+      {
+        "reason": "Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge",
+        "count": 1
+      },
+      {
+        "reason": "Pi Agent runtime produced no assistant text",
+        "count": 1
+      }
+    ]
+  },
+  "gate": {
+    "pass": false,
+    "criteria": [
+      {
+        "criterion": "Minimum target routes",
+        "pass": true,
+        "actual": "8",
+        "threshold": ">= 8"
+      },
+      {
+        "criterion": "Minimum baseline routes",
+        "pass": true,
+        "actual": "2",
+        "threshold": ">= 2"
+      },
+      {
+        "criterion": "Minimum target external attempts",
+        "pass": true,
+        "actual": "8",
+        "threshold": ">= 8"
+      },
+      {
+        "criterion": "Minimum pi_no_tools_mode guard hits",
+        "pass": false,
+        "actual": "0",
+        "threshold": ">= 1"
+      },
+      {
+        "criterion": "Minimum capability_query guard hits",
+        "pass": false,
+        "actual": "0",
+        "threshold": ">= 1"
+      },
+      {
+        "criterion": "Minimum attachments_present guard hits",
+        "pass": false,
+        "actual": "0",
+        "threshold": ">= 1"
+      },
+      {
+        "criterion": "Completion rate delta (target - baseline)",
+        "pass": true,
+        "actual": "0.00pp",
+        "threshold": ">= -2.00pp"
+      },
+      {
+        "criterion": "P50 latency delta (target - baseline)",
+        "pass": false,
+        "actual": "259ms",
+        "threshold": "<= 250ms"
+      },
+      {
+        "criterion": "P95 latency delta (target - baseline)",
+        "pass": false,
+        "actual": "5695ms",
+        "threshold": "<= 700ms"
+      },
+      {
+        "criterion": "Fallback rate (target external attempts)",
+        "pass": false,
+        "actual": "25.00%",
+        "threshold": "<= 5.00%"
+      }
+    ]
+  }
+}
diff --git a/docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md b/docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md
new file mode 100644
index 0000000..90ff953
--- /dev/null
+++ b/docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md
@@ -0,0 +1,62 @@
+# Pi Embedded Canary Summary
+
+- Target backend: `pi_embedded`
+- Baseline backend: `native`
+- Routes analyzed: 10
+
+## Route Distribution
+
+| Backend | Routes |
+| --- | ---: |
+| pi_embedded | 8 |
+| native | 2 |
+
+### Forced Native Guards
+
+| Guard reason | Count |
+| --- | ---: |
+| _none_ | 0 |
+
+## Reliability
+
+| Metric | Target | Baseline | Delta |
+| --- | ---: | ---: | ---: |
+| Turn completion rate | 100.00% | 100.00% | 0.00pp |
+| External success rate | 75.00% | n/a | n/a |
+| External attempts | 8 | n/a | n/a |
+| External fallbacks | 2 | n/a | n/a |
+
+## Latency
+
+- Target end-to-end: count=8, avg=4615ms, p50=3240ms, p95=8776ms, min=1859ms, max=9381ms
+- Baseline end-to-end: count=2, avg=2981ms, p50=2981ms, p95=3081ms, min=2870ms, max=3092ms
+- P50 delta (target - baseline): 259ms
+- P95 delta (target - baseline): 5695ms
+- Target external attempt: count=8, avg=3961ms, p50=2636ms, p95=8766ms, min=135ms, max=9371ms
+
+## Fallback Taxonomy
+
+| Category | Count | Percent |
+| --- | ---: | ---: |
+| empty_assistant_text | 1 | 50.00% |
+| pi_module_interface | 1 | 50.00% |
+
+## Top Fallback Reasons
+
+- Loaded Pi module does not expose a supported session factory (expected one of: createAgentSession, createSession, createPiSession, createAge (1)
+- Pi Agent runtime produced no assistant text (1)
+
+## Gate Evaluation
+
+- Gate result: HOLD
+- [x] Minimum target routes: actual=8, threshold=>= 8
+- [x] Minimum baseline routes: actual=2, threshold=>= 2
+- [x] Minimum target external attempts: actual=8, threshold=>= 8
+- [ ] Minimum pi_no_tools_mode guard hits: actual=0, threshold=>= 1
+- [ ] Minimum capability_query guard hits: actual=0, threshold=>= 1
+- [ ] Minimum attachments_present guard hits: actual=0, threshold=>= 1
+- [x] Completion rate delta (target - baseline): actual=0.00pp, threshold=>= -2.00pp
+- [ ] P50 latency delta (target - baseline): actual=259ms, threshold=<= 250ms
+- [ ] P95 latency delta (target - baseline): actual=5695ms, threshold=<= 700ms
+- [ ] Fallback rate (target external attempts): actual=25.00%, threshold=<= 5.00%
+
diff --git a/docs/plans/pi_embedded_evaluation.md b/docs/plans/pi_embedded_evaluation.md
index 42d9539..b978056 100644
--- a/docs/plans/pi_embedded_evaluation.md
+++ b/docs/plans/pi_embedded_evaluation.md
@@ -24,6 +24,7 @@ Use the same thresholds for every evaluation window.
 | Minimum target routes | >= 8 |
 | Minimum baseline routes | >= 2 |
 | Minimum target external attempts | >= 8 |
+| Minimum guard coverage (probe window) | `pi_no_tools_mode >= 1`, `capability_query >= 1`, `attachments_present >= 1` |
 | Completion rate delta (target - baseline) | >= -2.00pp |
 | P50 latency delta (target - baseline) | <= +250ms |
 | P95 latency delta (target - baseline) | <= +700ms |
@@ -34,6 +35,7 @@ Notes:
 - Completion rate and latency are computed from route-to-assistant turn timings.
 - Fallback rate is computed from `backend.success` + `backend.fallback` attempt outcomes.
 - Guardrail escapes are reviewed from `backend.route.source == forced_native_guard` + operator incident review.
+- Guard-coverage minimums are enforced for controlled probe windows, not passive traffic slices.
 
 ## How To Run
 
@@ -70,6 +72,27 @@ pnpm audit:backend-canary \
   --gate-max-fallback-rate-pct 5
 ```
 
+Run controlled probe-window evaluation (guard coverage required):
+
+```bash
+pnpm audit:backend-canary \
+  --audit ~/.local/share/flynn/audit.log \
+  --backend pi_embedded \
+  --baseline native \
+  --session telegram:8367012007 \
+  --gate-min-target-routes 8 \
+  --gate-min-baseline-routes 2 \
+  --gate-min-target-attempts 8 \
+  --gate-min-guard-pi-no-tools-count 1 \
+  --gate-min-guard-capability-query-count 1 \
+  --gate-min-guard-attachments-present-count 1 \
+  --gate-max-completion-drop-pp 2 \
+  --gate-max-p50-latency-increase-ms 250 \
+  --gate-max-p95-latency-increase-ms 700 \
+  --gate-max-fallback-rate-pct 5 \
+  --format markdown
+```
+
 ## Evaluation Log
 
 ### Window A
@@ -110,6 +133,27 @@ pnpm audit:backend-canary \
 | Fallback rate | 0.00% (pass) | 0 fallbacks / 6 attempts |
 | Guardrail escapes | none observed (provisional pass) | no `forced_native_guard` events in this window |
 
+### Window C (Guard Coverage Pre-Probe Baseline)
+
+- Dates: February 24, 2026 (same full Window A slice; guard-coverage gates enabled)
+- Route volume: 10 total routes (`pi_embedded`: 8, `native`: 2)
+- Summary artifacts:
+  - `docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md`
+  - `docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json`
+
+| Check | Result | Notes |
+| --- | --- | --- |
+| Minimum target routes | 8 (pass) | gate >= 8 |
+| Minimum baseline routes | 2 (pass) | gate >= 2 |
+| Minimum target external attempts | 8 (pass) | gate >= 8 |
+| Minimum `pi_no_tools_mode` guard hits | 0 (fail) | gate >= 1 |
+| Minimum `capability_query` guard hits | 0 (fail) | gate >= 1 |
+| Minimum `attachments_present` guard hits | 0 (fail) | gate >= 1 |
+| Completion rate delta | 0.00pp (pass) | target 100.00% vs baseline 100.00% |
+| P50 latency delta | +259ms (fail) | gate <= +250ms |
+| P95 latency delta | +5695ms (fail) | gate <= +700ms |
+| Fallback rate | 25.00% (fail) | 2 fallbacks / 8 attempts; gate <= 5.00% |
+
 ## Tool Compatibility Findings
 
 Track all tool-adjacent/risky prompts that were force-routed to native (`no_tools_mode`) and any misses.
@@ -125,6 +169,7 @@ Track all tool-adjacent/risky prompts that were force-routed to native (`no_tool
 - Decision date: February 24, 2026
 - Decision: `hold` (no cohort expansion yet)
 - Rationale: Window A fails 3/4 numeric gates (p50 delta, p95 delta, fallback rate) with only 10 total routed turns, including two concrete fallback failure modes:
+  see the list below. The Window C pre-probe baseline additionally confirms that guard-coverage evidence is missing (`pi_no_tools_mode`, `capability_query`, `attachments_present` all at 0).
   - `pi_module_interface`
   - `empty_assistant_text`
   Window B shows fallback recovery (0%) in a post-fallback slice but fails minimum sample thresholds and has no native baseline routes for delta-gate evaluation.
diff --git a/docs/plans/state.json b/docs/plans/state.json
index d0ef946..758f005 100644
--- a/docs/plans/state.json
+++ b/docs/plans/state.json
@@ -7,7 +7,7 @@
       "status": "in_progress",
       "date": "2026-02-24",
       "updated": "2026-02-24",
-      "summary": "Formal Pi embedded canary evaluation is active with audit-log summarization, minimum-sample gate thresholds, and normalized Pi-specific fallback categories. Window A (2026-02-24) remains `hold` due to latency/fallback failures; Window B shows fallback recovery (0%) but fails minimum-sample/baseline requirements, so expansion remains blocked.",
+      "summary": "Formal Pi embedded canary evaluation is active with audit-log summarization, minimum-sample thresholds, and guard-coverage thresholds for controlled probe windows. Window A remains `hold` due to latency/fallback failures; Window B shows fallback recovery (0%) but fails sample/baseline minimums; Window C pre-probe baseline confirms guard-coverage evidence is still missing, so expansion remains blocked.",
       "files_modified": [
         "src/audit/backendCanarySummary.ts",
         "src/audit/backendCanarySummary.test.ts",
@@ -20,12 +20,14 @@
         "docs/plans/artifacts/pi_embedded_eval_window_a_2026-02-24.json",
         "docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.md",
         "docs/plans/artifacts/pi_embedded_eval_window_b_2026-02-24_post_fallbacks.json",
+        "docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.md",
+        "docs/plans/artifacts/pi_embedded_eval_window_c_2026-02-24_guard_preprobe.json",
         "docs/architecture/AGENT_DIAGRAM.md",
         "docs/architecture/GATEWAY_SESSIONS_AND_QUEUE.md",
         "docs/api/PROTOCOL.md",
         "docs/plans/state.json"
       ],
-      "test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck + pnpm audit:backend-canary against ~/.local/share/flynn/audit.log with minimum-sample gates (Window A + Window B artifacts regenerated) passing; pnpm lint unchanged warnings-only baseline"
+      "test_status": "pnpm test:run src/audit/backendCanarySummary.test.ts + pnpm typecheck passing; pnpm audit:backend-canary run against ~/.local/share/flynn/audit.log with minimum-sample + guard-coverage gates (Window A/B regenerated, Window C pre-probe generated; gate verdicts remain HOLD); pnpm lint unchanged warnings-only baseline"
     },
     "pi-embedded-backend-canary-spike": {
       "status": "completed",
@@ -6486,7 +6488,7 @@
     "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
     "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas",
     "pi_embedded_canary_spike": "completed — added optional pi_embedded backend adapter, canary-safe no-tools routing guard, backend success/fallback latency audit events, and docs/diagram updates while native remains default",
-    "pi_embedded_evaluation_phase": "in progress — minimum-sample gates enforced; Window A HOLD (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B has 0% fallback but fails sample/baseline gates; no cohort expansion"
+    "pi_embedded_evaluation_phase": "in progress — minimum-sample and guard-coverage gates enforced; Window A HOLD (p50 +259ms, p95 +5695ms, fallback 25%, categories: pi_module_interface/empty_assistant_text); Window B has 0% fallback but fails sample/baseline gates; Window C pre-probe shows 0 guard hits across pi_no_tools_mode/capability_query/attachments_present; no cohort expansion"
   },
   "soul_md_and_cron_create": {
     "date": "2026-02-11",