feat(audio): add tests, token estimation, and config override for native audio

- Add capabilities.test.ts (18 tests) for supportsAudioInput()
- Add 15 audio tests to media.test.ts (hasAudio, stripAudioParts, attachmentToAudioSource)
- Add estimateAudioTokens() to tokens.ts (base64→bytes→duration→tokens)
- Update estimateMessageTokens() to include audio content parts
- Add 5 audio token tests to tokens.test.ts
- Add supports_audio config override to model schema
- Wire supports_audio from tier config through routing to capability check

Total tests: 1369 (was 1331, +38 audio-related)
2026-02-11 18:27:19 -08:00
parent 32ac4df20a
commit 148219153e
7 changed files with 357 additions and 8 deletions
@@ -6,11 +6,14 @@ import {
  isSupportedImage,
  isSupportedAudio,
  attachmentToImageSource,
+  attachmentToAudioSource,
  buildUserMessage,
  getMessageText,
  getMessageTextWithTools,
  normalizeMessagesForLocal,
  hasImages,
+  hasAudio,
+  stripAudioParts,
  transcribeAudio,
  buildUserMessageWithAudio,
  type AudioTranscriptionConfig,
@@ -820,3 +823,212 @@ describe('normalizeMessagesForLocal', () => {
    ]);
  });
 });
+
+// ---------------------------------------------------------------------------
+// 12. attachmentToAudioSource
+// ---------------------------------------------------------------------------
+
+describe('attachmentToAudioSource', () => {
+  // Positive: the shared oggAudioAttachment fixture is expected to convert to { media_type: 'audio/ogg', data }.
+  it('returns AudioSource for supported audio type with data', () => {
+    const result = attachmentToAudioSource(oggAudioAttachment);
+
+    expect(result).toEqual({
+      media_type: 'audio/ogg',
+      data: 'AAAAAAAAAAAAAAAAAAAA',
+    });
+  });
+
+  // Negative: the non-audio pdfAttachment fixture is expected to yield null.
+  it('returns null for unsupported mime type', () => {
+    const result = attachmentToAudioSource(pdfAttachment);
+
+    expect(result).toBeNull();
+  });
+
+  // Negative: an 'audio/ogg' attachment built here without passing any data is expected to yield null.
+  it('returns null when no data present', () => {
+    const noDataAudio = makeAttachment({
+      mimeType: 'audio/ogg',
+      filename: 'voice.ogg',
+    });
+
+    const result = attachmentToAudioSource(noDataAudio);
+
+    expect(result).toBeNull();
+  });
+
+  // Negative: the jpegBase64Attachment image fixture is expected to yield null (images are not audio sources).
+  it('returns null for image attachment', () => {
+    const result = attachmentToAudioSource(jpegBase64Attachment);
+
+    expect(result).toBeNull();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 13. hasAudio
+// ---------------------------------------------------------------------------
+
+describe('hasAudio', () => {
+  // Negative: a message whose content is a plain string is reported as having no audio.
+  it('returns false for string content messages', () => {
+    const msg: Message = { role: 'user', content: 'no audio here' };
+
+    expect(hasAudio(msg)).toBe(false);
+  });
+
+  // Negative: array content holding a single text part is reported as having no audio.
+  it('returns false for multimodal messages with only text parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [{ type: 'text', text: 'just text' }],
+    };
+
+    expect(hasAudio(msg)).toBe(false);
+  });
+
+  // Negative: array content holding a single image part is reported as having no audio.
+  it('returns false for multimodal messages with only image parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    expect(hasAudio(msg)).toBe(false);
+  });
+
+  // Positive: array content holding a single audio part is detected.
+  it('returns true for multimodal messages with audio parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
+      ],
+    };
+
+    expect(hasAudio(msg)).toBe(true);
+  });
+
+  // Positive: an audio part is still detected when it follows an image part in the same message.
+  it('returns true for multimodal messages with mixed image+audio parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'img' } },
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
+      ],
+    };
+
+    expect(hasAudio(msg)).toBe(true);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 14. stripAudioParts
+// ---------------------------------------------------------------------------
+
+describe('stripAudioParts', () => {
+  // String content: the result is expected to deep-equal the input (same role, same string).
+  it('returns unchanged message for string content', () => {
+    const msg: Message = { role: 'user', content: 'plain text' };
+
+    const result = stripAudioParts(msg);
+
+    expect(result).toEqual({ role: 'user', content: 'plain text' });
+  });
+
+  // Audio part with a transcript: expected to become a '[Voice message]: <transcript>' text part, leading text part kept.
+  it('replaces audio part with transcript text when transcript is present', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Check this out' },
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hello world' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(result.role).toBe('user');
+    expect(Array.isArray(result.content)).toBe(true);
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(2);
+    expect(parts[0]).toEqual({ type: 'text', text: 'Check this out' });
+    expect(parts[1]).toEqual({ type: 'text', text: '[Voice message]: Hello world' });
+  });
+
+  // Audio part without a transcript: expected to become the fixed placeholder text part, leading text part kept.
+  it('replaces audio part with placeholder when no transcript', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Listen' },
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(Array.isArray(result.content)).toBe(true);
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(2);
+    expect(parts[0]).toEqual({ type: 'text', text: 'Listen' });
+    expect(parts[1]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' });
+  });
+
+  // No audio present: the text and image parts are expected back unchanged and in the same order.
+  it('keeps non-audio parts unchanged', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'caption' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(result.content).toEqual([
+      { type: 'text', text: 'caption' },
+      { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+    ]);
+  });
+
+  // Lone audio part with a transcript: content is expected to collapse to a plain string, not a one-element array.
+  it('simplifies to string content when only one text part remains after stripping', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hi there' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(result).toEqual({ role: 'user', content: '[Voice message]: Hi there' });
+  });
+
+  // Multiple audio parts: each is replaced on its own (transcript text vs. placeholder) and part order is preserved.
+  it('handles message with multiple audio parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'First message' } },
+        { type: 'text', text: 'in between' },
+        { type: 'audio', source: { media_type: 'audio/mpeg', data: 'BBBB' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(Array.isArray(result.content)).toBe(true);
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(3);
+    expect(parts[0]).toEqual({ type: 'text', text: '[Voice message]: First message' });
+    expect(parts[1]).toEqual({ type: 'text', text: 'in between' });
+    expect(parts[2]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' });
+  });
+});