Harden audio transcription arg hydration and add rewrite audit event

2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
@@ -199,6 +199,458 @@ describe('NativeAgent tool loop', () => {
    expect(mockClient.chat).toHaveBeenCalledTimes(2);
  });

+  // Scenario: the model requests the tool with empty args while the current
+  // user turn carries an audio attachment. The tool must receive that
+  // attachment's base64 data and MIME type.
+  it('hydrates missing audio.transcribe args from latest user audio attachment', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with no arguments; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this', [{
+      mimeType: 'audio/ogg',
+      data: 'QUJDRA==',
+      filename: 'voice.ogg',
+    }]);
+
+    expect(response).toBe('done');
+    // The attachment's bytes and MIME type were injected into the empty args.
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'QUJDRA==',
+      mime_type: 'audio/ogg',
+    }));
+  });
+
+  // Scenario: the model requests the tool with empty args, the turn has no
+  // attachment and the history is empty, but the session config holds a
+  // `lastAudioAttachment` entry. The tool must receive the persisted audio.
+  it('hydrates missing audio.transcribe args from persisted session audio attachment', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with no arguments; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Session stub: empty history, and a JSON-encoded audio attachment under `lastAudioAttachment`.
+    const mockSession = {
+      id: 'telegram:user-audio',
+      getHistory: vi.fn().mockReturnValue([]),
+      addMessage: vi.fn(),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
+        ? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' })
+        : undefined)),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      session: mockSession,
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this');
+
+    expect(response).toBe('done');
+    // Persisted `mimeType` is surfaced to the tool as `mime_type`.
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'U0VTU0lPTg==',
+      mime_type: 'audio/ogg',
+    }));
+  });
+
+  // Scenario: the model supplies a `file://` URL, which is not an http(s) URL.
+  // The URL must be dropped and the persisted session audio used instead.
+  it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with a file:// URL; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { url: 'file://voice_message', mime_type: 'audio/ogg' } }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Session stub: empty history, and a JSON-encoded audio attachment under `lastAudioAttachment`.
+    const mockSession = {
+      id: 'telegram:user-audio',
+      getHistory: vi.fn().mockReturnValue([]),
+      addMessage: vi.fn(),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
+        ? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' })
+        : undefined)),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      session: mockSession,
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this');
+
+    expect(response).toBe('done');
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'U0VTU0lPTg==',
+      mime_type: 'audio/ogg',
+    }));
+    // The rejected URL must not be passed through next to the replacement data.
+    expect(seenArgs).not.toHaveProperty('url');
+  });
+
+  // Scenario: the model supplies syntactically valid base64 declared as
+  // `audio/wav`, but the payload is not the session's audio (it lacks the
+  // RIFF/WAVE signature). The persisted session audio must be used instead.
+  it('replaces text-like base64 audio.transcribe data with persisted session audio data', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with bogus base64; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: 'VGhpcyBvbmUgdHdvIHRocmVl', mime_type: 'audio/wav' } }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Session stub: empty history, and a JSON-encoded audio attachment under `lastAudioAttachment`.
+    const mockSession = {
+      id: 'telegram:user-audio',
+      getHistory: vi.fn().mockReturnValue([]),
+      addMessage: vi.fn(),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
+        ? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' })
+        : undefined)),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      session: mockSession,
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this');
+
+    expect(response).toBe('done');
+    // Both the data and the MIME type come from the persisted attachment, not from the model.
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'U0VTU0lPTg==',
+      mime_type: 'audio/ogg',
+    }));
+  });
+
+  // Scenario: the session history holds a user message with a
+  // `[Voice message]:` transcript marker and the model supplies base64 that
+  // starts with a RIFF/WAVE header. The persisted session audio must still
+  // replace the model-supplied payload.
+  it('forces persisted audio on voice-transcript fallback turns', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with WAV-looking data; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{
+              id: 'call_1',
+              name: 'audio_transcribe',
+              args: {
+                data: 'UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAAAA',
+                mime_type: 'audio/wav',
+              },
+            }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Session stub: history contains a voice-transcript user message, and a
+    // JSON-encoded audio attachment is stored under `lastAudioAttachment`.
+    const mockSession = {
+      id: 'telegram:user-audio',
+      getHistory: vi.fn().mockReturnValue([
+        { role: 'user', content: '[Voice message]: hello world\n\ncaption' },
+      ]),
+      addMessage: vi.fn(),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn((key: string) => {
+        if (key === 'lastAudioAttachment') {
+          return JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' });
+        }
+        return undefined;
+      }),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      session: mockSession,
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this');
+
+    expect(response).toBe('done');
+    // Persisted audio wins over the model's WAV-looking payload.
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'U0VTU0lPTg==',
+      mime_type: 'audio/ogg',
+    }));
+  });
+
+  // Scenario: the model fills `data` with a human-readable placeholder instead
+  // of base64 while the current user turn carries an audio attachment. The
+  // attachment's bytes must replace the placeholder.
+  it('replaces placeholder audio.transcribe data with latest attachment bytes', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with placeholder data; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: '[voice message data not provided]', mime_type: 'audio/ogg' } }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this', [{
+      mimeType: 'audio/ogg',
+      data: 'QUJDRA==',
+      filename: 'voice.ogg',
+    }]);
+
+    expect(response).toBe('done');
+    // The tool sees the attachment's data, not the placeholder text.
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'QUJDRA==',
+      mime_type: 'audio/ogg',
+    }));
+  });
+
+  // Scenario: the model supplies well-formed base64 of its own while the
+  // current user turn carries an audio attachment. The attachment must take
+  // precedence over the model-supplied data.
+  it('overrides model-provided base64 with latest turn audio attachment bytes', async () => {
+    let callCount = 0;
+    let seenArgs: Record<string, unknown> | undefined;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation(() => {
+        callCount++;
+        // First model reply asks for the tool with its own base64; every later reply ends the turn.
+        if (callCount === 1) {
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: 'd3Jvbmc=', mime_type: 'audio/ogg' } }],
+          };
+        }
+        return {
+          content: 'done',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 15, outputTokens: 10 },
+        };
+      }),
+    };
+
+    // Stub tool that records the arguments it is executed with.
+    const audioTool: Tool = {
+      name: 'audio.transcribe',
+      description: 'Transcribe audio',
+      inputSchema: { type: 'object', properties: {} },
+      execute: async (args) => {
+        seenArgs = args as Record<string, unknown>;
+        return { success: true, output: 'transcript' };
+      },
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(audioTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Please transcribe this', [{
+      mimeType: 'audio/ogg',
+      data: 'QUJDRA==',
+      filename: 'voice.ogg',
+    }]);
+
+    expect(response).toBe('done');
+    // The attachment's data replaces the model's `d3Jvbmc=` payload.
+    expect(seenArgs).toEqual(expect.objectContaining({
+      data: 'QUJDRA==',
+      mime_type: 'audio/ogg',
+    }));
+  });
+
  it('respects max iterations when tool calls vary', async () => {
    // Model always returns tool_use but with different args each time (no loop detection)
    let callCount = 0;
@@ -9,6 +9,7 @@ import type { Attachment } from '../../channels/types.js';
 import type { OutboundAttachmentCollector } from './attachments.js';
 import { buildUserMessage } from '../../models/media.js';
 import { getElevationWindow } from '../../security/elevation.js';
+import { auditLogger } from '../../audit/index.js';

 export interface ToolUseEvent {
  type: 'start' | 'end';
@@ -62,6 +63,20 @@ interface ExtractedTextToolCall {
  end: number;
 }

+/**
+ * Session config key read by `getPersistedAudioInput`. The stored value is
+ * parsed as JSON and its `data`, `url` and `mimeType` fields are used.
+ */
+const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';
+
+/**
+ * Audio payload that can be written into `audio.transcribe` tool arguments.
+ * When both `data` and `url` are set, `applyAudioToolInput` uses `data`.
+ */
+interface AudioToolInput {
+  /** Inline audio payload (base64 text in the fixtures covering this code). */
+  data?: string;
+  /** Audio URL, used only when `data` is absent. */
+  url?: string;
+  /** MIME type of the audio, e.g. `audio/ogg`. */
+  mime_type?: string;
+}
+
+/**
+ * Payload-free description of tool arguments: presence flags and the MIME
+ * type only. Used to report what the model originally sent in the audit event.
+ */
+interface AudioToolArgSummary {
+  /** True when `data` was a non-empty string. */
+  hasData: boolean;
+  /** True when `url` was a non-empty string. */
+  hasUrl: boolean;
+  /** `mime_type` when it was a non-empty string, otherwise undefined. */
+  mimeType?: string;
+}
+
 export class NativeAgent {
  private static readonly EMPTY_RESPONSE_FALLBACK =
    'I could not generate a response for that. Please try again.';
@@ -363,7 +378,8 @@ export class NativeAgent {
            }
            : undefined;

-          const result = await toolExecutor.execute(internalName, tc.args, perCallContext, {
+          const toolArgs = this.normalizeToolArgsForExecution(internalName, tc.args);
+          const result = await toolExecutor.execute(internalName, toolArgs, perCallContext, {
            signal: this._runAbortController?.signal,
          });

@@ -620,6 +636,284 @@ export class NativeAgent {
    return error instanceof Error && error.name === 'AbortError';
  }

+  /**
+   * Gives tool-specific argument repair a chance to run right before execution.
+   * Only `audio.transcribe` is rewritten; every other tool gets `rawArgs` back
+   * untouched.
+   *
+   * @param toolName Internal (registry) name of the tool about to run.
+   * @param rawArgs Arguments as produced by the model.
+   * @returns The arguments to execute the tool with.
+   */
+  private normalizeToolArgsForExecution(toolName: string, rawArgs: unknown): unknown {
+    return toolName === 'audio.transcribe'
+      ? this.hydrateAudioTranscribeArgs(rawArgs)
+      : rawArgs;
+  }
+
+  /**
+   * Builds the arguments actually passed to `audio.transcribe`, replacing
+   * missing or unusable model-supplied audio with audio known to the agent.
+   *
+   * Sources are tried in this order:
+   * 1. Audio attached to the newest user message. It always wins, even over
+   *    well-formed model-supplied data.
+   * 2. Persisted session audio, when the newest user message contains a
+   *    `[Voice message]:` marker.
+   * 3. The model's own `data` (base64 that passes the signature check) or
+   *    `url` (http/https only); `data` wins when both are usable.
+   * 4. The newest audio part anywhere in history, else the persisted session
+   *    audio.
+   *
+   * Rewrites from sources 1, 2 and 4 emit an audit event; keeping the model's
+   * own input (source 3) does not.
+   *
+   * @param rawArgs Tool-call arguments as produced by the model; anything that
+   *   is not an object is treated as `{}`.
+   * @returns A new object with `data`/`url`/`mime_type` adjusted. If no usable
+   *   audio is found anywhere, it is returned with both `data` and `url` removed.
+   */
+  private hydrateAudioTranscribeArgs(rawArgs: unknown): unknown {
+    // Work on a shallow copy so the object received from the model is not modified.
+    const args = (rawArgs && typeof rawArgs === 'object')
+      ? { ...(rawArgs as Record<string, unknown>) }
+      : {};
+    // Captured before any mutation so the audit event can describe what the model sent.
+    const original = this.summarizeAudioToolArgs(args);
+
+    // 1. Audio attached to the newest user message overrides whatever the model sent.
+    const latestTurnAudio = this.getLatestTurnUserAudioInput();
+    if (latestTurnAudio) {
+      this.applyAudioToolInput(args, latestTurnAudio);
+      this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
+      return args;
+    }
+
+    // 2. Voice-transcript turn: use the persisted audio if there is one.
+    if (this.isCurrentTurnVoiceTranscriptFallback()) {
+      const persistedAudio = this.getPersistedAudioInput();
+      if (persistedAudio) {
+        this.applyAudioToolInput(args, persistedAudio);
+        this.logAudioArgsRewrite('voice_turn_fallback', 'persisted', original, args);
+        return args;
+      }
+    }
+
+    // 3. Validate the model's own input; keep at most one of data/url, preferring data.
+    const normalizedData = this.normalizeAudioTranscribeDataArg(args.data, args.mime_type);
+    const normalizedUrl = this.normalizeAudioTranscribeUrlArg(args.url);
+    if (normalizedData) {
+      args.data = normalizedData;
+      delete args.url;
+    } else if (normalizedUrl) {
+      args.url = normalizedUrl;
+      delete args.data;
+    } else {
+      delete args.data;
+      delete args.url;
+    }
+
+    const hasData = typeof args.data === 'string' && args.data.length > 0;
+    const hasUrl = typeof args.url === 'string' && args.url.length > 0;
+    if (hasData || hasUrl) {
+      // Model data without a MIME type borrows the MIME type of the newest known audio, if any.
+      if (hasData && (typeof args.mime_type !== 'string' || args.mime_type.length === 0)) {
+        const latestAudioForMime = this.getLatestUserAudioInput();
+        if (latestAudioForMime?.mime_type) {
+          args.mime_type = latestAudioForMime.mime_type;
+        }
+      }
+      return args;
+    }
+
+    // 4. Nothing usable from the model: fall back to history, then persisted audio.
+    const latestAudio = this.getLatestUserAudioInput();
+    if (!latestAudio) {
+      return args;
+    }
+    // Label the source as 'persisted' when the chosen audio has the same data
+    // and MIME type as the persisted entry; otherwise it came from history.
+    const persistedAudio = this.getPersistedAudioInput();
+    const source: 'history' | 'persisted' = persistedAudio?.data === latestAudio.data
+      && persistedAudio?.mime_type === latestAudio.mime_type
+      ? 'persisted'
+      : 'history';
+    this.applyAudioToolInput(args, latestAudio);
+    // 'invalid_model_args' when the model sent data or a url that was rejected above;
+    // 'missing_model_args' when it sent neither.
+    this.logAudioArgsRewrite(original.hasData || original.hasUrl ? 'invalid_model_args' : 'missing_model_args', source, original, args);
+    return args;
+  }
+
+  /**
+   * Reduces tool arguments to presence flags plus the declared MIME type, so
+   * they can be reported without carrying the audio payload itself.
+   *
+   * @param args Tool arguments to describe.
+   * @returns Flags for non-empty string `data`/`url`, and `mime_type` when it is a non-empty string.
+   */
+  private summarizeAudioToolArgs(args: Record<string, unknown>): AudioToolArgSummary {
+    const isNonEmptyString = (value: unknown): value is string =>
+      typeof value === 'string' && value.length > 0;
+
+    const { data, url, mime_type: declaredMime } = args;
+    return {
+      hasData: isNonEmptyString(data),
+      hasUrl: isNonEmptyString(url),
+      mimeType: isNonEmptyString(declaredMime) ? declaredMime : undefined,
+    };
+  }
+
+  /**
+   * Writes `audio` into `args` in place so that at most one of `data` / `url`
+   * remains: `data` when the input has it, otherwise `url`, otherwise neither.
+   * `mime_type` is overwritten only when the input provides one; an existing
+   * value is otherwise left as it was.
+   *
+   * @param args Tool arguments to mutate.
+   * @param audio Replacement audio payload.
+   */
+  private applyAudioToolInput(args: Record<string, unknown>, audio: AudioToolInput): void {
+    const { data, url, mime_type: mimeType } = audio;
+
+    if (data) {
+      args.data = data;
+    } else {
+      delete args.data;
+    }
+
+    // A url is only kept when there is no inline data to prefer.
+    if (!data && url) {
+      args.url = url;
+    } else {
+      delete args.url;
+    }
+
+    if (mimeType) {
+      args.mime_type = mimeType;
+    }
+  }
+
+  /**
+   * Emits a `toolArgsRewritten` audit event describing an `audio.transcribe`
+   * argument rewrite. Only presence flags and MIME types are reported, never
+   * the audio payload. Does nothing when `auditLogger` is not set.
+   *
+   * @param reason Why the arguments were rewritten.
+   * @param source Where the replacement audio came from.
+   * @param original Summary of the arguments as the model sent them.
+   * @param normalizedArgs Arguments after the rewrite.
+   */
+  private logAudioArgsRewrite(
+    reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args',
+    source: 'latest_turn' | 'history' | 'persisted',
+    original: AudioToolArgSummary,
+    normalizedArgs: Record<string, unknown>,
+  ): void {
+    let finalMimeType: string | undefined;
+    const rewrittenMime = normalizedArgs.mime_type;
+    if (typeof rewrittenMime === 'string' && rewrittenMime.length > 0) {
+      finalMimeType = rewrittenMime;
+    }
+
+    auditLogger?.toolArgsRewritten({
+      tool_name: 'audio.transcribe',
+      session_id: this.session?.id,
+      source,
+      reason,
+      original_has_data: original.hasData,
+      original_has_url: original.hasUrl,
+      original_mime_type: original.mimeType,
+      final_mime_type: finalMimeType,
+    });
+  }
+
+  /**
+   * Reports whether the newest user message in history carries the
+   * `[Voice message]:` marker, either as plain string content or inside any
+   * text part of structured content. Older user messages are never inspected.
+   *
+   * @returns `true` when the marker is present; `false` when it is absent,
+   *   the content is neither a string nor an array, or there is no user message.
+   */
+  private isCurrentTurnVoiceTranscriptFallback(): boolean {
+    const marker = '[Voice message]:';
+    let cursor = this.history.length;
+    while (cursor > 0) {
+      cursor -= 1;
+      const entry = this.history[cursor];
+      if (entry.role !== 'user') {
+        continue;
+      }
+
+      // First user message found walking backwards: decide here and stop.
+      const body = entry.content;
+      if (typeof body === 'string') {
+        return body.includes(marker);
+      }
+      if (!Array.isArray(body)) {
+        return false;
+      }
+      for (const part of body) {
+        if (part.type === 'text' && typeof part.text === 'string' && part.text.includes(marker)) {
+          return true;
+        }
+      }
+      return false;
+    }
+    return false;
+  }
+
+  /**
+   * Looks for audio in the newest user message only.
+   *
+   * @returns The first audio part of that message whose `data` and
+   *   `media_type` are both non-empty strings; `null` when the message has
+   *   non-array content, has no such part, or there is no user message.
+   */
+  private getLatestTurnUserAudioInput(): AudioToolInput | null {
+    let cursor = this.history.length;
+    while (cursor > 0) {
+      cursor -= 1;
+      const entry = this.history[cursor];
+      if (entry.role !== 'user') {
+        continue;
+      }
+
+      // Only this message counts as the latest turn; never look further back.
+      if (!Array.isArray(entry.content)) {
+        return null;
+      }
+      for (const part of entry.content) {
+        if (part.type === 'audio') {
+          const { data, media_type: mediaType } = part.source;
+          if (typeof data === 'string' && data.length > 0 && typeof mediaType === 'string' && mediaType.length > 0) {
+            return { data, mime_type: mediaType };
+          }
+        }
+      }
+      return null;
+    }
+    return null;
+  }
+
+  /**
+   * Validates a model-supplied `data` argument as base64-encoded audio.
+   *
+   * The value is accepted only when it is a string that, with whitespace
+   * removed, is non-empty, uses only the standard base64 alphabet, decodes to
+   * at least one byte, and passes `matchesAudioSignature` for the declared
+   * MIME type.
+   *
+   * @param rawData Candidate `data` argument.
+   * @param rawMimeType Candidate `mime_type` argument; ignored unless it is a string.
+   * @returns The whitespace-free base64 string, or `undefined` when rejected.
+   */
+  private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
+    if (typeof rawData !== 'string') {
+      return undefined;
+    }
+
+    const compact = rawData.replace(/\s+/g, '');
+    // The `+` quantifier also rejects the empty string, so no separate length
+    // check is needed. Placeholder text containing other characters fails here.
+    if (!/^[A-Za-z0-9+/=]+$/.test(compact)) {
+      return undefined;
+    }
+
+    // Decoding base64 with Buffer.from does not throw on malformed input, so
+    // there is nothing to catch; unusable input shows up as an empty buffer or
+    // a failed signature check instead.
+    const decoded = Buffer.from(compact, 'base64');
+    if (decoded.length === 0) {
+      return undefined;
+    }
+
+    const mimeType = typeof rawMimeType === 'string' ? rawMimeType : undefined;
+    return this.matchesAudioSignature(decoded, mimeType) ? compact : undefined;
+  }
+
+  /**
+   * Checks whether decoded bytes start with the container signature implied
+   * by the declared MIME type.
+   *
+   * The MIME type is compared case-insensitively and with any parameters
+   * removed, so `audio/ogg; codecs=opus` and `Audio/OGG` are both checked as
+   * Ogg. A missing or blank MIME type, or one with no known signature, is
+   * accepted without inspecting the bytes.
+   *
+   * @param buffer Decoded audio bytes.
+   * @param mimeType MIME type declared alongside the data, if any.
+   * @returns `false` only when the type is recognised and the leading bytes do not match it.
+   */
+  private matchesAudioSignature(buffer: Buffer, mimeType?: string): boolean {
+    // True when the ASCII text `value` is present in `buffer` at `offset`.
+    const ascii = (offset: number, value: string): boolean => {
+      if (buffer.length < offset + value.length) {
+        return false;
+      }
+      return buffer.subarray(offset, offset + value.length).toString('ascii') === value;
+    };
+
+    // Media types are case-insensitive and may carry parameters; compare only
+    // the lower-cased `type/subtype` part.
+    const essence = mimeType?.split(';', 1)[0]?.trim().toLowerCase();
+    if (!essence) {
+      return true;
+    }
+
+    switch (essence) {
+      case 'audio/ogg':
+        return ascii(0, 'OggS');
+      case 'audio/wav':
+      case 'audio/wave':
+      case 'audio/x-wav':
+        // RIFF container whose form type at byte 8 is WAVE.
+        return ascii(0, 'RIFF') && ascii(8, 'WAVE');
+      case 'audio/webm':
+        // EBML header magic bytes.
+        return buffer.length >= 4
+          && buffer[0] === 0x1A
+          && buffer[1] === 0x45
+          && buffer[2] === 0xDF
+          && buffer[3] === 0xA3;
+      case 'audio/mpeg':
+      case 'audio/mp3':
+        // Either an ID3 tag or an MPEG frame sync (first 11 bits set).
+        return ascii(0, 'ID3')
+          || (buffer.length >= 2 && buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0);
+      case 'audio/mp4':
+      case 'audio/x-m4a':
+        // The `ftyp` box type follows the 4-byte box size.
+        return ascii(4, 'ftyp');
+      default:
+        return true;
+    }
+  }
+
+  /**
+   * Validates a model-supplied `url` argument.
+   *
+   * @param rawUrl Candidate `url` argument.
+   * @returns The trimmed URL when it is a string starting with `http://` or
+   *   `https://` (case-insensitive); otherwise `undefined`.
+   */
+  private normalizeAudioTranscribeUrlArg(rawUrl: unknown): string | undefined {
+    const candidate = typeof rawUrl === 'string' ? rawUrl.trim() : '';
+    // The pattern cannot match an empty string, so blank and non-string input are rejected here too.
+    return /^https?:\/\//i.test(candidate) ? candidate : undefined;
+  }
+
+  /**
+   * Finds the most recent usable audio across the whole history, newest
+   * message first, and falls back to the persisted session audio.
+   *
+   * Unlike `getLatestTurnUserAudioInput`, user messages without audio are
+   * skipped rather than ending the search.
+   *
+   * @returns The first audio part whose `data` and `media_type` are both
+   *   non-empty strings, else whatever `getPersistedAudioInput` returns.
+   */
+  private getLatestUserAudioInput(): AudioToolInput | null {
+    let cursor = this.history.length;
+    while (cursor > 0) {
+      cursor -= 1;
+      const entry = this.history[cursor];
+      if (entry.role === 'user' && Array.isArray(entry.content)) {
+        for (const part of entry.content) {
+          if (part.type === 'audio') {
+            const { data, media_type: mediaType } = part.source;
+            if (typeof data === 'string' && data.length > 0 && typeof mediaType === 'string' && mediaType.length > 0) {
+              return { data, mime_type: mediaType };
+            }
+          }
+        }
+      }
+    }
+
+    return this.getPersistedAudioInput();
+  }
+
+  /**
+   * Reads the audio attachment stored in the session config under
+   * `LAST_AUDIO_ATTACHMENT_CONFIG_KEY`.
+   *
+   * @returns The stored audio with its `mimeType` exposed as `mime_type`, or
+   *   `null` when there is no session, no stored value, the value is not
+   *   valid JSON, or it has neither a non-empty `data` nor a non-empty `url`.
+   */
+  private getPersistedAudioInput(): AudioToolInput | null {
+    const serialized = this.session?.getConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY);
+    if (!serialized) {
+      return null;
+    }
+
+    const nonEmptyString = (value: unknown): string | undefined =>
+      (typeof value === 'string' && value.length > 0 ? value : undefined);
+
+    try {
+      // Property reads stay inside the try so a stored non-object such as `null` also yields null.
+      const parsed = JSON.parse(serialized) as { data?: unknown; url?: unknown; mimeType?: unknown };
+      const data = nonEmptyString(parsed.data);
+      const url = nonEmptyString(parsed.url);
+      const mimeType = nonEmptyString(parsed.mimeType);
+      if (!data && !url) {
+        return null;
+      }
+
+      const restored: AudioToolInput = {};
+      if (data) {
+        restored.data = data;
+      }
+      if (url) {
+        restored.url = url;
+      }
+      if (mimeType) {
+        restored.mime_type = mimeType;
+      }
+      return restored;
+    } catch {
+      return null;
+    }
+  }
+
  private extractPseudoToolUse(content: string): PseudoToolUse | null {
    if (!content) {
      return null;