feat(security): harden tool provenance and skill isolation

This commit is contained in:
William Valentin
2026-02-15 10:16:55 -08:00
parent 3451df41b9
commit 67058c8719
6 changed files with 102 additions and 17 deletions
+19 -2
View File
@@ -143,6 +143,10 @@ export class NativeAgent {
private async toolLoop(): Promise<string> {
const tools = this.toolRegistry!.filteredToAnthropicFormat(this._toolPolicyContext);
// Track whether untrusted fetched content (e.g. results from web.fetch, web.search,
// or browser.content) has been introduced during this run. Used to harden against prompt injection.
let untrustedContentSeen = false;
// Detect tool inventory changes to combat conversational inertia in long sessions.
// When tools change (e.g. new tools added between restarts), the model's prior messages
// saying "I can't do that" can override tool definitions. Injecting a system note fixes this.
@@ -262,11 +266,24 @@ export class NativeAgent {
const internalName = this.toolRegistry!.getByApiName(tc.name)?.name ?? tc.name;
this.onToolUse?.({ type: 'start', tool: internalName, args: tc.args });
const result = await this.toolExecutor!.execute(internalName, tc.args, this._toolPolicyContext);
const perCallContext: ToolPolicyContext | undefined = this._toolPolicyContext
? { ...this._toolPolicyContext, untrustedContent: untrustedContentSeen }
: undefined;
const result = await this.toolExecutor!.execute(internalName, tc.args, perCallContext);
this.onToolUse?.({ type: 'end', tool: internalName, result });
const resultContent = result.success ? result.output : (result.error ?? 'Unknown error');
const provenance = (internalName === 'web.fetch' || internalName === 'web.search' || internalName === 'browser.content')
? 'fetched_content'
: 'tool_output';
if (provenance === 'fetched_content') {
untrustedContentSeen = true;
}
const rawContent = result.success ? result.output : (result.error ?? 'Unknown error');
const resultContent = `[provenance=${provenance} tool=${internalName} untrusted=${provenance === 'fetched_content' ? 'true' : 'false'}]\n${rawContent}\n[/provenance]`;
toolResultBlocks.push({
type: 'tool_result',
tool_use_id: tc.id,