feat(policy): enforce truthfulness and autonomy guardrails
Add runtime truthfulness modes and autonomy-level tool gating with audit metadata for overrides/denials. Wire policy through prompt assembly, tool execution context, and daemon/gateway agent paths; update tests and planning state for Phase 3 PR #2 completion.
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { getTruthfulnessGuidance, type TruthfulnessMode } from './guardrails.js';
|
||||
|
||||
// Test suite for the truthfulness guidance lookup: checks the wording each
// mode must contain and the relative length ordering of the three texts.
describe('guardrails', () => {
  describe('getTruthfulnessGuidance', () => {
    // Strict text must carry the mode banner, the headline rule, and the
    // tool-output section heading.
    it('returns strict guidance for strict mode', () => {
      const strictText = getTruthfulnessGuidance('strict');
      const requiredFragments = [
        'STRICT MODE',
        'Always tell the truth',
        'No lies. No invention. No fabrication.',
        'Tool output and user data',
      ];

      requiredFragments.forEach((fragment) => {
        expect(strictText).toContain(fragment);
      });
    });

    // Standard text shares the policy heading but must not be labelled strict.
    it('returns standard guidance for standard mode', () => {
      const standardText = getTruthfulnessGuidance('standard');

      expect(standardText).toContain('Truthfulness Policy');
      expect(standardText).not.toContain('STRICT MODE');
      expect(standardText).toContain('actual tool output');
      expect(standardText).toContain('report the failure accurately');
    });

    // Relaxed text keeps the heading and is shorter than the standard text.
    it('returns relaxed guidance for relaxed mode', () => {
      const relaxedText = getTruthfulnessGuidance('relaxed');

      expect(relaxedText).toContain('Truthfulness Policy');
      expect(relaxedText).toContain('Be accurate');

      const standardLength = getTruthfulnessGuidance('standard').length;
      expect(relaxedText.length).toBeLessThan(standardLength);
    });

    // Every mode must yield text that is not empty or whitespace-only.
    it('all modes return non-empty strings', () => {
      const everyMode: TruthfulnessMode[] = ['strict', 'standard', 'relaxed'];

      everyMode.forEach((currentMode) => {
        const text = getTruthfulnessGuidance(currentMode);
        expect(text).toBeTruthy();
        expect(text.trim().length).toBeGreaterThan(0);
      });
    });

    // Length ordering: strict > standard > relaxed.
    it('strict mode has the longest guidance', () => {
      const strictLength = getTruthfulnessGuidance('strict').length;
      const standardLength = getTruthfulnessGuidance('standard').length;
      const relaxedLength = getTruthfulnessGuidance('relaxed').length;

      expect(strictLength).toBeGreaterThan(standardLength);
      expect(standardLength).toBeGreaterThan(relaxedLength);
    });
  });
});
|
||||
@@ -0,0 +1,61 @@
|
||||
/**
|
||||
* Guardrails for enforcing truthfulness policies in agent behavior.
|
||||
*
|
||||
* Provides textual guidance that can be injected into system prompts
|
||||
* to enforce different levels of truthfulness constraints.
|
||||
*/
|
||||
|
||||
import type { TruthfulnessMode } from '../../config/schema.js';
|
||||
|
||||
export type { TruthfulnessMode } from '../../config/schema.js';
|
||||
|
||||
/**
 * Markdown prompt section returned by `getTruthfulnessGuidance('strict')`.
 *
 * Contains a headline rule, a "General truthfulness" list, and a
 * "Tool output and user data" list. The accompanying tests assert on several
 * phrases of this text (for example 'STRICT MODE' and 'Always tell the truth')
 * and expect it to be longer than the standard guidance, so wording changes
 * may require test updates.
 */
const STRICT_GUIDANCE = `## Truthfulness Policy (STRICT MODE)
|
||||
|
||||
**Always tell the truth. No lies. No invention. No fabrication. No guessing presented as fact.**
|
||||
|
||||
This is the single most important rule. It applies to everything — not just tool output, but all communication.
|
||||
|
||||
**General truthfulness:**
|
||||
- Never state something as fact unless you know it to be true.
|
||||
- If you don't know something, say "I don't know."
|
||||
- If you're uncertain, say so explicitly — never present a guess as a fact.
|
||||
- Never invent information to appear helpful. Being honest about limitations IS being helpful.
|
||||
- Do not embellish, exaggerate, or speculate without clearly labeling it as speculation.
|
||||
|
||||
**Tool output and user data:**
|
||||
- Only present information that was actually returned by a tool, script, or API call.
|
||||
- If a tool fails or returns an error, **report the failure honestly** — do not fill in plausible content.
|
||||
- If a tool returns no results, say so — do not invent results that "might" exist.
|
||||
- When summarizing tool output, every claim must trace back to actual output. No embellishment.
|
||||
|
||||
**Prefer "I don't know" or "the tool failed" over any fabricated content.** Always. This applies to all data: emails, calendar events, files, Kubernetes state, metrics, logs, and any other information accessed via tools or from memory.`;
|
||||
|
||||
/**
 * Markdown prompt section returned by `getTruthfulnessGuidance('standard')`.
 *
 * Shares the "Truthfulness Policy" heading with the other modes but omits the
 * strict-mode label. The accompanying tests assert on the phrases
 * 'actual tool output' and 'report the failure accurately', and expect this
 * text to be shorter than the strict guidance and longer than the relaxed one.
 */
const STANDARD_GUIDANCE = `## Truthfulness Policy
|
||||
|
||||
Always base your responses on actual tool output or verified information. Do not fabricate data or present guesses as facts.
|
||||
|
||||
- If a tool fails, report the failure accurately.
|
||||
- If you don't have information, say so clearly.
|
||||
|
||||
- When summarizing tool output, stay faithful to what was actually returned.
|
||||
|
||||
Being honest about limitations is more valuable than inventing plausible-sounding content.`;
|
||||
|
||||
/**
 * Markdown prompt section returned by `getTruthfulnessGuidance('relaxed')`.
 *
 * A single-sentence policy under the shared "Truthfulness Policy" heading.
 * The accompanying tests assert on the phrase 'Be accurate' and expect this
 * text to be shorter than the standard guidance.
 */
const RELAXED_GUIDANCE = `## Truthfulness Policy
|
||||
|
||||
Be accurate with tool output and avoid fabricating data when precision matters.`;
|
||||
|
||||
/**
 * Get the truthfulness guidance text for a given mode.
 *
 * Returns a markdown section suitable for injection into a system prompt.
 *
 * @param mode - Which guidance text to return: `'strict'`, `'standard'`, or
 *   `'relaxed'`.
 * @returns The guidance constant matching `mode`.
 * @throws Error if `mode` is not one of the handled values at runtime (for
 *   example, a value that bypassed type checking). Without this guard the
 *   switch would fall through and the function would return `undefined`
 *   despite its `string` return type.
 */
export function getTruthfulnessGuidance(mode: TruthfulnessMode): string {
  switch (mode) {
    case 'strict':
      return STRICT_GUIDANCE;
    case 'standard':
      return STANDARD_GUIDANCE;
    case 'relaxed':
      return RELAXED_GUIDANCE;
    default: {
      // Exhaustiveness guard: if a new mode is added to the union without a
      // case above, this assignment stops compiling. At runtime it rejects
      // values outside the union instead of silently yielding `undefined`.
      const unhandledMode: never = mode;
      throw new Error(`Unknown truthfulness mode: ${String(unhandledMode)}`);
    }
  }
}
|
||||
Reference in New Issue
Block a user