feat(security): harden tool provenance and skill isolation

2026-02-15 10:16:55 -08:00
parent 3451df41b9
commit 67058c8719
6 changed files with 102 additions and 17 deletions
@@ -143,6 +143,10 @@ export class NativeAgent {
  private async toolLoop(): Promise<string> {
    const tools = this.toolRegistry!.filteredToAnthropicFormat(this._toolPolicyContext);

+    // Track whether untrusted fetched content (results of web.fetch, web.search, or
+    // browser.content) has been introduced during this run. Used to harden against prompt injection.
+    let untrustedContentSeen = false;
+
    // Detect tool inventory changes to combat conversational inertia in long sessions.
    // When tools change (e.g. new tools added between restarts), the model's prior messages
    // saying "I can't do that" can override tool definitions. Injecting a system note fixes this.
@@ -262,11 +266,24 @@ export class NativeAgent {
          const internalName = this.toolRegistry!.getByApiName(tc.name)?.name ?? tc.name;
          this.onToolUse?.({ type: 'start', tool: internalName, args: tc.args });

-          const result = await this.toolExecutor!.execute(internalName, tc.args, this._toolPolicyContext);
+          const perCallContext: ToolPolicyContext | undefined = this._toolPolicyContext
+            ? { ...this._toolPolicyContext, untrustedContent: untrustedContentSeen }
+            : undefined;
+
+          const result = await this.toolExecutor!.execute(internalName, tc.args, perCallContext);

          this.onToolUse?.({ type: 'end', tool: internalName, result });

-          const resultContent = result.success ? result.output : (result.error ?? 'Unknown error');
+          const provenance = (internalName === 'web.fetch' || internalName === 'web.search' || internalName === 'browser.content')
+            ? 'fetched_content'
+            : 'tool_output';
+
+          if (provenance === 'fetched_content') {
+            untrustedContentSeen = true;
+          }
+
+          const rawContent = result.success ? result.output : (result.error ?? 'Unknown error');
+          const resultContent = `[provenance=${provenance} tool=${internalName} untrusted=${provenance === 'fetched_content' ? 'true' : 'false'}]\n${rawContent}\n[/provenance]`;
          toolResultBlocks.push({
            type: 'tool_result',
            tool_use_id: tc.id,