feat(policy): enforce truthfulness and autonomy guardrails

Add runtime truthfulness modes and autonomy-level tool gating with audit metadata for overrides/denials. Wire policy through prompt assembly, tool execution context, and daemon/gateway agent paths; update tests and planning state for Phase 3 PR #2 completion.
2026-02-12 16:06:45 -08:00
parent 125af4e832
commit 90ce622080
18 changed files with 1172 additions and 104 deletions
@@ -0,0 +1,47 @@
+import { describe, it, expect } from 'vitest';
+import { getTruthfulnessGuidance, type TruthfulnessMode } from './guardrails.js';
+
+describe('guardrails', () => {
+  describe('getTruthfulnessGuidance', () => {
+    // Length of the guidance text for a mode; shared by the relative-size checks below.
+    const guidanceLength = (mode: TruthfulnessMode): number =>
+      getTruthfulnessGuidance(mode).length;
+
+    it('returns strict guidance for strict mode', () => {
+      const text = getTruthfulnessGuidance('strict');
+      // Phrases the strict policy text must carry, checked in this order.
+      const requiredPhrases = [
+        'STRICT MODE',
+        'Always tell the truth',
+        'No lies. No invention. No fabrication.',
+        'Tool output and user data',
+      ];
+      requiredPhrases.forEach((phrase) => {
+        expect(text).toContain(phrase);
+      });
+    });
+
+    it('returns standard guidance for standard mode', () => {
+      const text = getTruthfulnessGuidance('standard');
+      expect(text).toContain('Truthfulness Policy');
+      // The standard text must not be labelled as strict.
+      expect(text).not.toContain('STRICT MODE');
+      expect(text).toContain('actual tool output');
+      expect(text).toContain('report the failure accurately');
+    });
+
+    it('returns relaxed guidance for relaxed mode', () => {
+      const text = getTruthfulnessGuidance('relaxed');
+      expect(text).toContain('Truthfulness Policy');
+      expect(text).toContain('Be accurate');
+      expect(text.length).toBeLessThan(guidanceLength('standard'));
+    });
+
+    it('all modes return non-empty strings', () => {
+      const allModes: TruthfulnessMode[] = ['strict', 'standard', 'relaxed'];
+      allModes.forEach((mode) => {
+        const text = getTruthfulnessGuidance(mode);
+        expect(text).toBeTruthy();
+        expect(text.trim().length).toBeGreaterThan(0);
+      });
+    });
+
+    it('strict mode has the longest guidance', () => {
+      const strictLength = guidanceLength('strict');
+      const standardLength = guidanceLength('standard');
+      const relaxedLength = guidanceLength('relaxed');
+
+      // Expected ordering by length: strict > standard > relaxed.
+      expect(strictLength).toBeGreaterThan(standardLength);
+      expect(standardLength).toBeGreaterThan(relaxedLength);
+    });
+  });
+});
@@ -0,0 +1,61 @@
+/**
+ * Guardrails for enforcing truthfulness policies in agent behavior.
+ *
+ * Provides textual guidance that can be injected into system prompts
+ * to enforce different levels of truthfulness constraints.
+ */
+
+import type { TruthfulnessMode } from '../../config/schema.js';
+
+export type { TruthfulnessMode } from '../../config/schema.js';
+
+/**
+ * Markdown policy text returned by `getTruthfulnessGuidance` for `'strict'` mode.
+ *
+ * Contains a headline rule, a "General truthfulness" list, a
+ * "Tool output and user data" list, and a closing instruction to prefer
+ * "I don't know" / "the tool failed" over fabricated content.
+ *
+ * The accompanying tests assert on specific phrases in this text and expect it
+ * to be longer than the standard-mode text, so wording changes may require
+ * test updates.
+ */
+const STRICT_GUIDANCE = `## Truthfulness Policy (STRICT MODE)
+
+**Always tell the truth. No lies. No invention. No fabrication. No guessing presented as fact.**
+
+This is the single most important rule. It applies to everything — not just tool output, but all communication.
+
+**General truthfulness:**
+- Never state something as fact unless you know it to be true.
+- If you don't know something, say "I don't know."
+- If you're uncertain, say so explicitly — never present a guess as a fact.
+- Never invent information to appear helpful. Being honest about limitations IS being helpful.
+- Do not embellish, exaggerate, or speculate without clearly labeling it as speculation.
+
+**Tool output and user data:**
+- Only present information that was actually returned by a tool, script, or API call.
+- If a tool fails or returns an error, **report the failure honestly** — do not fill in plausible content.
+- If a tool returns no results, say so — do not invent results that "might" exist.
+- When summarizing tool output, every claim must trace back to actual output. No embellishment.
+
+**Prefer "I don't know" or "the tool failed" over any fabricated content.** Always. This applies to all data: emails, calendar events, files, Kubernetes state, metrics, logs, and any other information accessed via tools or from memory.`;
+
+/**
+ * Markdown policy text returned by `getTruthfulnessGuidance` for `'standard'` mode.
+ *
+ * A shorter policy than the strict text: base responses on actual tool output
+ * or verified information, report tool failures accurately, and admit missing
+ * information. It does not include the "STRICT MODE" label.
+ *
+ * The accompanying tests assert on specific phrases in this text and expect its
+ * length to fall between the strict and relaxed texts.
+ */
+const STANDARD_GUIDANCE = `## Truthfulness Policy
+
+Always base your responses on actual tool output or verified information. Do not fabricate data or present guesses as facts.
+
+- If a tool fails, report the failure accurately.
+- If you don't have information, say so clearly.
+- When summarizing tool output, stay faithful to what was actually returned.
+
+Being honest about limitations is more valuable than inventing plausible-sounding content.`;
+
+/**
+ * Markdown policy text returned by `getTruthfulnessGuidance` for `'relaxed'` mode.
+ *
+ * The shortest of the three texts: a heading plus a single sentence. The
+ * accompanying tests expect it to contain "Be accurate" and to be shorter than
+ * the standard-mode text.
+ */
+const RELAXED_GUIDANCE = `## Truthfulness Policy
+
+Be accurate with tool output and avoid fabricating data when precision matters.`;
+
+/**
+ * Get the truthfulness guidance text for a given mode.
+ *
+ * Returns a markdown section (beginning with a `## Truthfulness Policy`
+ * heading) suitable for injection into a system prompt.
+ *
+ * @param mode - The truthfulness mode whose policy text should be returned.
+ * @returns The policy text for `mode`.
+ * @throws {Error} If `mode` is not a handled mode at runtime, e.g. a value
+ *   that bypassed type checking. Failing loudly avoids returning `undefined`
+ *   and silently omitting the policy from the prompt.
+ */
+export function getTruthfulnessGuidance(mode: TruthfulnessMode): string {
+  switch (mode) {
+    case 'strict':
+      return STRICT_GUIDANCE;
+    case 'standard':
+      return STANDARD_GUIDANCE;
+    case 'relaxed':
+      return RELAXED_GUIDANCE;
+    default: {
+      // Compile-time exhaustiveness check: if a mode is added to
+      // TruthfulnessMode without a matching case above, this assignment
+      // becomes a type error.
+      const unhandled: never = mode;
+      throw new Error(`Unknown truthfulness mode: ${String(unhandled)}`);
+    }
+  }
+}