feat(audio): add tests, token estimation, and config override for native audio

- Add capabilities.test.ts (18 tests) for supportsAudioInput()
- Add 15 audio tests to media.test.ts (hasAudio, stripAudioParts, attachmentToAudioSource)
- Add estimateAudioTokens() to tokens.ts (base64→bytes→duration→tokens)
- Update estimateMessageTokens() to include audio content parts
- Add 5 audio token tests to tokens.test.ts
- Add supports_audio config override to model schema
- Wire supports_audio from tier config through routing to capability check

Total tests: 1369 (was 1331, +38 audio-related)
2026-02-11 18:27:19 -08:00
parent 32ac4df20a
commit 148219153e
7 changed files with 357 additions and 8 deletions
@@ -52,6 +52,7 @@ const modelConfigBaseSchema = z.object({
  for: z.array(z.string()).optional(),
  num_gpu: z.number().optional(),
  context_window: z.number().optional(),
+  supports_audio: z.boolean().optional(),
 });

 const modelConfigSchema = modelConfigBaseSchema.extend({
@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest';
-import { estimateTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js';
+import { estimateTokens, estimateAudioTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js';

 describe('estimateTokens', () => {
  it('returns 0 for empty string', () => {
@@ -20,6 +20,33 @@ describe('estimateTokens', () => {
  });
 });

+describe('estimateAudioTokens', () => {
+  it('returns positive number for valid audio data', () => {
+    // 10000 base64 chars → ~7500 bytes → ~3.75s → ceil(3.75 * 32) = 120
+    const source = { media_type: 'audio/ogg', data: 'A'.repeat(10000) };
+    const tokens = estimateAudioTokens(source);
+    expect(tokens).toBeGreaterThan(0);
+    expect(tokens).toBe(120);
+  });
+
+  it('returns at least 1 for very short audio', () => {
+    // 1 base64 char → 0.75 bytes → 0.000375s → ceil(0.000375 * 32) = ceil(0.012) = 1
+    const source = { media_type: 'audio/ogg', data: 'A' };
+    expect(estimateAudioTokens(source)).toBe(1);
+  });
+
+  it('returns 0 for empty audio data', () => {
+    const source = { media_type: 'audio/ogg', data: '' }; // empty payload returns 0 before the heuristic runs
+    expect(estimateAudioTokens(source)).toBe(0);
+  });
+
+  it('longer audio data produces more tokens', () => {
+    const short = { media_type: 'audio/ogg', data: 'A'.repeat(1000) }; // 750 bytes → 0.375s → 12 tokens
+    const long = { media_type: 'audio/ogg', data: 'A'.repeat(100000) }; // 75000 bytes → 37.5s → 1200 tokens
+    expect(estimateAudioTokens(long)).toBeGreaterThan(estimateAudioTokens(short));
+  });
+});
+
 describe('estimateMessageTokens', () => {
  it('returns 0 for empty array', () => {
    expect(estimateMessageTokens([])).toBe(0);
@@ -38,6 +65,23 @@ describe('estimateMessageTokens', () => {
    ];
    expect(estimateMessageTokens(messages)).toBe(10);
  });
+
+  it('includes audio token estimate for multimodal messages', () => {
+    // Text part: 'hello' = 5 chars → ceil(5/4) = 2 text tokens
+    // Audio part: 10000 base64 chars → 120 audio tokens (see estimateAudioTokens test)
+    // Overhead: 4
+    // Total: 2 + 120 + 4 = 126
+    const messages = [
+      {
+        role: 'user' as const,
+        content: [
+          { type: 'text' as const, text: 'hello' },
+          { type: 'audio' as const, source: { media_type: 'audio/ogg', data: 'A'.repeat(10000) } },
+        ],
+      },
+    ];
+    expect(estimateMessageTokens(messages)).toBe(126);
+  });
 });

 describe('getContextWindow', () => {
@@ -1,4 +1,4 @@
-import type { Message } from '../models/types.js';
+import type { Message, AudioSource } from '../models/types.js';
 import { getMessageText } from '../models/media.js';

 /**
@@ -36,6 +36,25 @@ export function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
 }

+/**
+ * Estimate the token count of an audio part from the length of its base64 payload.
+ *
+ * Heuristic (the payload is never decoded; `=` padding is not subtracted):
+ *  1. Approximate the decoded size: `base64Length * 0.75` bytes
+ *  2. Assume ~16 kbps bitrate (typical voice OGG/Opus): `bytes / 2000` → seconds
+ *  3. Estimate ~32 tokens per second of audio (Gemini-style rate), rounded up
+ *
+ * @returns 0 for empty data; otherwise at least 1 token.
+ */
+export function estimateAudioTokens(audioSource: AudioSource): number {
+  const base64Length = audioSource.data.length; // characters of base64 text, not bytes
+  if (base64Length === 0) {
+    return 0;
+  }
+  const durationSeconds = (base64Length * 0.75) / 2000; // 2000 bytes/s ≈ 16 kbps
+  return Math.max(1, Math.ceil(durationSeconds * 32)); // round up so short clips still count
+}
+
 /**
 * Estimate the total token count for an array of messages.
 *
@@ -43,10 +62,20 @@ export function estimateTokens(text: string): number {
 * overhead of ~4 tokens to account for the role marker and separators.
 */
 export function estimateMessageTokens(messages: Message[]): number {
-  return messages.reduce(
-    (sum, msg) => sum + estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS,
-    0,
-  );
+  return messages.reduce((sum, msg) => {
+    let tokens = estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS;
+
+    // Add audio token estimates for multimodal messages
+    if (Array.isArray(msg.content)) {
+      for (const part of msg.content) {
+        if (part.type === 'audio') {
+          tokens += estimateAudioTokens(part.source);
+        }
+      }
+    }
+
+    return sum + tokens;
+  }, 0);
 }

 /**
@@ -232,7 +232,8 @@ export function createMessageRouter(deps: {
      const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default;
      const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider;
      const modelName = tierConfig?.model ?? deps.config.models.default.model;
-      const nativeAudioSupported = supportsAudioInput(modelProvider, modelName);
+      const supportsAudioOverride = (tierConfig as Record<string, unknown> | undefined)?.supports_audio as boolean | undefined;
+      const nativeAudioSupported = supportsAudioInput(modelProvider, modelName, supportsAudioOverride);

      let messageText = msg.text;
      let attachments = msg.attachments;
@@ -0,0 +1,60 @@
+import { describe, it, expect } from 'vitest';
+import { supportsAudioInput } from './capabilities.js';
+
+describe('supportsAudioInput', () => { // NOTE(review): the optional third `override` argument is never passed in this suite
+  describe('audio-capable providers with modern models', () => {
+    it('returns true for gemini with a modern model', () => {
+      expect(supportsAudioInput('gemini', 'gemini-1.5-pro')).toBe(true);
+    });
+
+    it('returns true for openai with a modern model', () => {
+      expect(supportsAudioInput('openai', 'gpt-4o')).toBe(true);
+    });
+
+    it('returns true for github with a modern model', () => {
+      expect(supportsAudioInput('github', 'gpt-4o')).toBe(true);
+    });
+  });
+
+  describe('non-audio providers return false', () => {
+    const nonAudioProviders = [ // providers expected to be rejected whatever the model name
+      'anthropic',
+      'bedrock',
+      'ollama',
+      'llamacpp',
+      'openrouter',
+      'zhipuai',
+      'xai',
+    ] as const;
+
+    for (const provider of nonAudioProviders) { // registers one test per provider
+      it(`returns false for ${provider}`, () => {
+        expect(supportsAudioInput(provider, 'some-model')).toBe(false);
+      });
+    }
+  });
+
+  describe('model-specific exclusions', () => {
+    const excludedModels = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo']; // expected to be rejected even on a capable provider
+
+    for (const model of excludedModels) { // registers two tests per model: openai and github
+      it(`returns false for openai/${model} despite provider being capable`, () => {
+        expect(supportsAudioInput('openai', model)).toBe(false);
+      });
+
+      it(`returns false for github/${model} despite provider being capable`, () => {
+        expect(supportsAudioInput('github', model)).toBe(false);
+      });
+    }
+  });
+
+  describe('unknown provider', () => {
+    it('returns false for a completely unknown provider', () => {
+      expect(supportsAudioInput('unknown-provider', 'some-model')).toBe(false);
+    });
+
+    it('returns false for an empty string provider', () => {
+      expect(supportsAudioInput('', 'some-model')).toBe(false);
+    });
+  });
+});
@@ -31,7 +31,9 @@ const AUDIO_INCAPABLE_MODELS = new Set<string>([
 * Returns true if the model can receive raw audio data directly via its API,
 * false if audio must be transcribed to text before sending.
 */
-export function supportsAudioInput(provider: string, model: string): boolean {
+export function supportsAudioInput(provider: string, model: string, override?: boolean): boolean {
+  if (override !== undefined) return override;
+
  // Provider must be in the capable set
  if (!AUDIO_CAPABLE_PROVIDERS.has(provider)) {
    return false;
@@ -6,11 +6,14 @@ import {
  isSupportedImage,
  isSupportedAudio,
  attachmentToImageSource,
+  attachmentToAudioSource,
  buildUserMessage,
  getMessageText,
  getMessageTextWithTools,
  normalizeMessagesForLocal,
  hasImages,
+  hasAudio,
+  stripAudioParts,
  transcribeAudio,
  buildUserMessageWithAudio,
  type AudioTranscriptionConfig,
@@ -820,3 +823,212 @@ describe('normalizeMessagesForLocal', () => {
    ]);
  });
 });
+
+// ---------------------------------------------------------------------------
+// 12. attachmentToAudioSource
+// ---------------------------------------------------------------------------
+
+describe('attachmentToAudioSource', () => {
+  // Positive: supported audio type with data returns AudioSource.
+  it('returns AudioSource for supported audio type with data', () => {
+    const result = attachmentToAudioSource(oggAudioAttachment);
+
+    expect(result).toEqual({
+      media_type: 'audio/ogg',
+      data: 'AAAAAAAAAAAAAAAAAAAA',
+    });
+  });
+
+  // Negative: unsupported MIME type returns null.
+  it('returns null for unsupported mime type', () => {
+    const result = attachmentToAudioSource(pdfAttachment);
+
+    expect(result).toBeNull();
+  });
+
+  // Negative: supported audio type but no data returns null.
+  it('returns null when no data present', () => {
+    const noDataAudio = makeAttachment({
+      mimeType: 'audio/ogg',
+      filename: 'voice.ogg',
+    });
+
+    const result = attachmentToAudioSource(noDataAudio);
+
+    expect(result).toBeNull();
+  });
+
+  // Negative: image attachment returns null.
+  it('returns null for image attachment', () => {
+    const result = attachmentToAudioSource(jpegBase64Attachment);
+
+    expect(result).toBeNull();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 13. hasAudio
+// ---------------------------------------------------------------------------
+
+describe('hasAudio', () => {
+  // Negative: string content never has audio.
+  it('returns false for string content messages', () => {
+    const msg: Message = { role: 'user', content: 'no audio here' };
+
+    expect(hasAudio(msg)).toBe(false);
+  });
+
+  // Negative: multimodal messages with only text parts have no audio.
+  it('returns false for multimodal messages with only text parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [{ type: 'text', text: 'just text' }],
+    };
+
+    expect(hasAudio(msg)).toBe(false);
+  });
+
+  // Negative: multimodal messages with only image parts have no audio.
+  it('returns false for multimodal messages with only image parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    expect(hasAudio(msg)).toBe(false);
+  });
+
+  // Positive: multimodal messages with audio parts are detected.
+  it('returns true for multimodal messages with audio parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
+      ],
+    };
+
+    expect(hasAudio(msg)).toBe(true);
+  });
+
+  // Positive: multimodal messages with mixed image + audio parts are detected.
+  it('returns true for multimodal messages with mixed image+audio parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'img' } },
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
+      ],
+    };
+
+    expect(hasAudio(msg)).toBe(true);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 14. stripAudioParts
+// ---------------------------------------------------------------------------
+
+describe('stripAudioParts', () => {
+  // String content passes through unchanged.
+  it('returns unchanged message for string content', () => {
+    const msg: Message = { role: 'user', content: 'plain text' };
+
+    const result = stripAudioParts(msg);
+
+    expect(result).toEqual({ role: 'user', content: 'plain text' });
+  });
+
+  // Audio part with transcript is replaced with transcript text.
+  it('replaces audio part with transcript text when transcript is present', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Check this out' },
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hello world' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(result.role).toBe('user');
+    expect(Array.isArray(result.content)).toBe(true);
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(2);
+    expect(parts[0]).toEqual({ type: 'text', text: 'Check this out' });
+    expect(parts[1]).toEqual({ type: 'text', text: '[Voice message]: Hello world' });
+  });
+
+  // Audio part without transcript is replaced with placeholder.
+  it('replaces audio part with placeholder when no transcript', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Listen' },
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(Array.isArray(result.content)).toBe(true);
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(2);
+    expect(parts[0]).toEqual({ type: 'text', text: 'Listen' });
+    expect(parts[1]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' });
+  });
+
+  // Non-audio parts (text + image) are kept unchanged.
+  it('keeps non-audio parts unchanged', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'caption' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(result.content).toEqual([
+      { type: 'text', text: 'caption' },
+      { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+    ]);
+  });
+
+  // Simplifies to string content when only one text part remains after stripping.
+  it('simplifies to string content when only one text part remains after stripping', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hi there' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(result).toEqual({ role: 'user', content: '[Voice message]: Hi there' });
+  });
+
+  // Handles message with multiple audio parts.
+  it('handles message with multiple audio parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'First message' } },
+        { type: 'text', text: 'in between' },
+        { type: 'audio', source: { media_type: 'audio/mpeg', data: 'BBBB' } },
+      ],
+    };
+
+    const result = stripAudioParts(msg);
+
+    expect(Array.isArray(result.content)).toBe(true);
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(3);
+    expect(parts[0]).toEqual({ type: 'text', text: '[Voice message]: First message' });
+    expect(parts[1]).toEqual({ type: 'text', text: 'in between' });
+    expect(parts[2]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' });
+  });
+});