Harden audio transcription arg hydration and add rewrite audit event

2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
@@ -1351,7 +1351,7 @@ describe('daemon audio routing integration', () => {
    expect(String(msg.text)).toContain('audio transcription is not configured');
  });

-  it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => {
+  it('transcribes voice attachments when transcription is configured and preserves audio for anthropic tool fallback', async () => {
    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');

    // Mock transcription endpoint call.
@@ -1422,6 +1422,90 @@ describe('daemon audio routing integration', () => {
      timestamp: Date.now(),
    } as MessageRouterInput, reply);

+    expect(fetchSpy).toHaveBeenCalled();
+    expect(processSpy).toHaveBeenCalledTimes(1);
+    const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
+    expect(String(calledText)).toContain('[Voice message]: hello world');
+    expect(String(calledText)).toContain('caption');
+    const atts = calledAttachments as Array<{ mimeType: string }> | undefined;
+    expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(true);
+    expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
+    expect(session.setConfig).toHaveBeenCalledWith(
+      'lastAudioAttachment',
+      expect.stringContaining('"mimeType":"audio/ogg"'),
+    );
+  });
+
+  it('transcribes voice attachments when transcription is configured and strips audio for openai-compatible providers', async () => {
+    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');
+
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      json: async () => ({ text: 'hello world' }),
+    } as Response);
+
+    const session = {
+      id: 'telegram:user-voice-3',
+      addMessage: vi.fn(),
+      getHistory: vi.fn(() => []),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn(() => undefined),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    const commandRegistry = new CommandRegistry();
+    registerBuiltinCommands(commandRegistry);
+
+    const router = createMessageRouter({
+      sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
+      modelRouter: {
+        getAvailableTiers: () => ['default'],
+        getAllLabels: () => ({ default: 'default' }),
+        getLabel: (tier: string) => tier,
+      } as unknown as MessageRouterDeps['modelRouter'],
+      systemPrompt: 'test prompt',
+      toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
+      toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
+      config: {
+        agents: {
+          primary_tier: 'default',
+          delegation: {
+            compaction: 'default',
+            memory_extraction: 'default',
+            classification: 'default',
+            tool_summarisation: 'default',
+            complex_reasoning: 'default',
+          },
+          max_delegation_depth: 1,
+          max_iterations: 3,
+        },
+        compaction: { enabled: false },
+        models: { default: { provider: 'openai', model: 'gpt-4.1', supports_audio: false } },
+        audio: {
+          enabled: true,
+          provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' },
+        },
+      } as unknown as MessageRouterDeps['config'],
+      commandRegistry,
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'v3',
+      channel: 'telegram',
+      senderId: 'user-voice-3',
+      text: 'caption',
+      attachments: [
+        { mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' },
+        { mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' },
+      ],
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
    expect(fetchSpy).toHaveBeenCalled();
    expect(processSpy).toHaveBeenCalledTimes(1);
    const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
@@ -1430,6 +1514,10 @@ describe('daemon audio routing integration', () => {
    const atts = calledAttachments as Array<{ mimeType: string }> | undefined;
    expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(false);
    expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
+    expect(session.setConfig).toHaveBeenCalledWith(
+      'lastAudioAttachment',
+      expect.stringContaining('"mimeType":"audio/ogg"'),
+    );
  });
 });

@@ -164,6 +164,57 @@ function shouldForceNativeForCapabilityQuery(text: string): boolean {
  );
 }

+function providerAcceptsNativeAudioContentParts(provider: string): boolean { // true only when `provider` exactly equals one of the ids compared below
+  return (
+    provider === 'openai'
+    || provider === 'github'
+    || provider === 'gemini'
+    || provider === 'openrouter'
+    || provider === 'zhipuai'
+    || provider === 'xai'
+    || provider === 'minimax'
+    || provider === 'moonshot'
+    || provider === 'vercel'
+  ); // strict, case-sensitive equality: any other string (for example 'anthropic') yields false
+}
+
+const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment'; // session config key that persistLatestAudioAttachment writes to
+
+function persistLatestAudioAttachment( // stores the last attachment that has non-empty `data` or `url` as a JSON string in session config
+  session: { setConfig(key: string, value: string): void },
+  audioAttachments: Attachment[],
+): void {
+  const latest = [...audioAttachments].reverse().find((att) => ( // reverse a copy so the input array is untouched; first hit is the LAST usable attachment
+    (typeof att.data === 'string' && att.data.length > 0)
+      || (typeof att.url === 'string' && att.url.length > 0)
+  ));
+  if (!latest) {
+    return; // no attachment carries a non-empty `data` or `url`: nothing is written
+  }
+
+  const payload: { data?: string; url?: string; mimeType?: string } = {
+    mimeType: latest.mimeType,
+  };
+  if (typeof latest.data === 'string' && latest.data.length > 0) {
+    payload.data = latest.data; // `data` takes precedence; `url` is stored only when `data` is missing or empty
+  } else if (typeof latest.url === 'string' && latest.url.length > 0) {
+    payload.url = latest.url;
+  }
+
+  if (!payload.data && !payload.url) { // defensive re-check: `latest` already passed the same data/url test in find() above
+    return;
+  }
+
+  try {
+    session.setConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY, JSON.stringify(payload));
+  } catch (error) { // an exception from stringify/setConfig is logged as a warning and not rethrown
+    console.warn(
+      'Failed to persist latest audio attachment for tool hydration:',
+      error instanceof Error ? error.message : String(error),
+    );
+  }
+}
+
 function isTtsEnabledForChannel(config: Config, channel: string): boolean {
  if (!config.tts?.enabled) {
    return false;
@@ -1266,6 +1317,9 @@ export function createMessageRouter(deps: {
      let messageText = incomingText;
      let attachments = msg.attachments;
      const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
+      if (audioAttachments.length > 0) {
+        persistLatestAudioAttachment(session, audioAttachments);
+      }

      if (audioAttachments.length > 0 && !nativeAudioSupported) {
        // Model doesn't support native audio — transcribe via Whisper and strip audio attachments
@@ -1300,9 +1354,15 @@ export function createMessageRouter(deps: {
          const transcript = await transcribeAudio(att, audioConfig);
          messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
        }
-        // Remove audio attachments so buildUserMessage doesn't create audio content parts
-        attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
-        if (attachments.length === 0) { attachments = undefined; }
+        // For providers that cannot ingest native audio content parts (e.g. Anthropic),
+        // keep the original audio attachment available in the tool loop so
+        // audio.transcribe can still be hydrated from bytes if the model requests it.
+        // For providers that do accept native audio parts (OpenAI-compatible/Gemini),
+        // strip audio to avoid sending raw audio to a model tier that was marked as non-audio.
+        if (providerAcceptsNativeAudioContentParts(modelProvider)) {
+          attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
+          if (attachments.length === 0) { attachments = undefined; }
+        }
      }
      // If native audio IS supported, we pass attachments through unchanged —
      // buildUserMessage() in the agent will create native audio content parts