feat(audio): add tests, token estimation, and config override for native audio
- Add capabilities.test.ts (18 tests) for supportsAudioInput()
- Add 15 audio tests to media.test.ts (hasAudio, stripAudioParts, attachmentToAudioSource)
- Add estimateAudioTokens() to tokens.ts (base64 → bytes → duration → tokens)
- Update estimateMessageTokens() to include audio content parts
- Add 5 audio token tests to tokens.test.ts
- Add supports_audio config override to model schema
- Wire supports_audio from tier config through routing to capability check

Total tests: 1369 (was 1331, +38 audio-related)
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { supportsAudioInput } from './capabilities.js';
|
||||
|
||||
/**
 * Tests for supportsAudioInput(provider, model, override?).
 *
 * Covers four areas:
 *  - providers expected to accept native audio with a modern model,
 *  - providers expected to reject native audio,
 *  - specific models that are excluded even on an otherwise capable provider,
 *  - the explicit `override` argument, which is returned as-is whenever it is
 *    not `undefined` (the implementation checks it before any provider lookup).
 */
describe('supportsAudioInput', () => {
  describe('audio-capable providers with modern models', () => {
    it('returns true for gemini with a modern model', () => {
      expect(supportsAudioInput('gemini', 'gemini-1.5-pro')).toBe(true);
    });

    it('returns true for openai with a modern model', () => {
      expect(supportsAudioInput('openai', 'gpt-4o')).toBe(true);
    });

    it('returns true for github with a modern model', () => {
      expect(supportsAudioInput('github', 'gpt-4o')).toBe(true);
    });
  });

  describe('non-audio providers return false', () => {
    const nonAudioProviders = [
      'anthropic',
      'bedrock',
      'ollama',
      'llamacpp',
      'openrouter',
      'zhipuai',
      'xai',
    ] as const;

    // One generated test per provider so a failure names the offending provider.
    for (const provider of nonAudioProviders) {
      it(`returns false for ${provider}`, () => {
        expect(supportsAudioInput(provider, 'some-model')).toBe(false);
      });
    }
  });

  describe('model-specific exclusions', () => {
    const excludedModels = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo'];

    // Each excluded model is checked against both providers that are otherwise capable.
    for (const model of excludedModels) {
      it(`returns false for openai/${model} despite provider being capable`, () => {
        expect(supportsAudioInput('openai', model)).toBe(false);
      });

      it(`returns false for github/${model} despite provider being capable`, () => {
        expect(supportsAudioInput('github', model)).toBe(false);
      });
    }
  });

  describe('unknown provider', () => {
    it('returns false for a completely unknown provider', () => {
      expect(supportsAudioInput('unknown-provider', 'some-model')).toBe(false);
    });

    it('returns false for an empty string provider', () => {
      expect(supportsAudioInput('', 'some-model')).toBe(false);
    });
  });

  describe('explicit override', () => {
    // override === true wins over a provider that would otherwise be rejected.
    it('returns true when override is true for a non-audio provider', () => {
      expect(supportsAudioInput('anthropic', 'some-model', true)).toBe(true);
    });

    // override === true also wins over a model-level exclusion.
    it('returns true when override is true for an excluded model', () => {
      expect(supportsAudioInput('openai', 'gpt-4', true)).toBe(true);
    });

    // override === false wins over a provider/model that would otherwise be accepted.
    it('returns false when override is false for a capable provider and model', () => {
      expect(supportsAudioInput('openai', 'gpt-4o', false)).toBe(false);
    });

    // An undefined override must not short-circuit: normal detection applies.
    it('falls back to provider/model detection when override is undefined', () => {
      expect(supportsAudioInput('openai', 'gpt-4o', undefined)).toBe(true);
      expect(supportsAudioInput('anthropic', 'some-model', undefined)).toBe(false);
    });
  });
});
|
||||
@@ -31,7 +31,9 @@ const AUDIO_INCAPABLE_MODELS = new Set<string>([
|
||||
* Returns true if the model can receive raw audio data directly via its API,
|
||||
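* false if audio must be transcribed to text before sending.
* If `override` is provided (not undefined), it is returned directly and the
* provider/model checks are skipped.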
|
||||
*/
|
||||
export function supportsAudioInput(provider: string, model: string): boolean {
|
||||
export function supportsAudioInput(provider: string, model: string, override?: boolean): boolean {
|
||||
if (override !== undefined) return override;
|
||||
|
||||
// Provider must be in the capable set
|
||||
if (!AUDIO_CAPABLE_PROVIDERS.has(provider)) {
|
||||
return false;
|
||||
|
||||
@@ -6,11 +6,14 @@ import {
|
||||
isSupportedImage,
|
||||
isSupportedAudio,
|
||||
attachmentToImageSource,
|
||||
attachmentToAudioSource,
|
||||
buildUserMessage,
|
||||
getMessageText,
|
||||
getMessageTextWithTools,
|
||||
normalizeMessagesForLocal,
|
||||
hasImages,
|
||||
hasAudio,
|
||||
stripAudioParts,
|
||||
transcribeAudio,
|
||||
buildUserMessageWithAudio,
|
||||
type AudioTranscriptionConfig,
|
||||
@@ -820,3 +823,212 @@ describe('normalizeMessagesForLocal', () => {
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 12. attachmentToAudioSource
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Tests for attachmentToAudioSource(): a supported audio attachment that
 * carries data is expected to become an audio source object; every other
 * input exercised here is expected to yield null.
 * The fixtures (oggAudioAttachment, pdfAttachment, jpegBase64Attachment,
 * makeAttachment) are defined elsewhere in this test file.
 */
describe('attachmentToAudioSource', () => {
  // Happy path: an OGG attachment with a payload maps to media_type + data.
  it('returns AudioSource for supported audio type with data', () => {
    const expectedSource = {
      media_type: 'audio/ogg',
      data: 'AAAAAAAAAAAAAAAAAAAA',
    };

    expect(attachmentToAudioSource(oggAudioAttachment)).toEqual(expectedSource);
  });

  // A non-audio MIME type (PDF fixture) is rejected.
  it('returns null for unsupported mime type', () => {
    expect(attachmentToAudioSource(pdfAttachment)).toBeNull();
  });

  // A supported audio MIME type without any data is rejected.
  it('returns null when no data present', () => {
    const emptyOgg = makeAttachment({
      mimeType: 'audio/ogg',
      filename: 'voice.ogg',
    });

    expect(attachmentToAudioSource(emptyOgg)).toBeNull();
  });

  // An image attachment, even with data, is not an audio source.
  it('returns null for image attachment', () => {
    expect(attachmentToAudioSource(jpegBase64Attachment)).toBeNull();
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 13. hasAudio
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Tests for hasAudio(): it is expected to report true only when a message's
 * content is a multimodal array containing at least one part of type 'audio'.
 * Written as a table so each scenario is one row: title, input, expected result.
 */
describe('hasAudio', () => {
  const scenarios: Array<{ title: string; message: Message; expected: boolean }> = [
    // Plain string content never carries audio.
    {
      title: 'returns false for string content messages',
      message: { role: 'user', content: 'no audio here' },
      expected: false,
    },
    // Multimodal content made only of text parts.
    {
      title: 'returns false for multimodal messages with only text parts',
      message: {
        role: 'user',
        content: [{ type: 'text', text: 'just text' }],
      },
      expected: false,
    },
    // Multimodal content made only of image parts.
    {
      title: 'returns false for multimodal messages with only image parts',
      message: {
        role: 'user',
        content: [
          { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
        ],
      },
      expected: false,
    },
    // A single audio part is enough to be detected.
    {
      title: 'returns true for multimodal messages with audio parts',
      message: {
        role: 'user',
        content: [
          { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
        ],
      },
      expected: true,
    },
    // Audio is still detected when it follows a non-audio part.
    {
      title: 'returns true for multimodal messages with mixed image+audio parts',
      message: {
        role: 'user',
        content: [
          { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'img' } },
          { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
        ],
      },
      expected: true,
    },
  ];

  for (const { title, message, expected } of scenarios) {
    it(title, () => {
      expect(hasAudio(message)).toBe(expected);
    });
  }
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 14. stripAudioParts
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Tests for stripAudioParts(): audio parts are expected to be replaced by text
 * parts (the transcript when present, a fixed placeholder otherwise), non-audio
 * parts are expected to be left untouched, and a result consisting of a single
 * text part is expected to collapse to plain string content.
 */
describe('stripAudioParts', () => {
  // Minimal shape used to inspect the parts of a stripped message.
  type StrippedPart = { type: string; text?: string };

  // Text expected in place of an audio part that has no transcript.
  const NO_TRANSCRIPT_TEXT = '[Audio message received but no transcript available]';

  // String content has nothing to strip and comes back as-is.
  it('returns unchanged message for string content', () => {
    const input: Message = { role: 'user', content: 'plain text' };

    expect(stripAudioParts(input)).toEqual({ role: 'user', content: 'plain text' });
  });

  // An audio part carrying a transcript becomes a "[Voice message]: ..." text part.
  it('replaces audio part with transcript text when transcript is present', () => {
    const input: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'Check this out' },
        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hello world' } },
      ],
    };

    const { role, content } = stripAudioParts(input);

    expect(role).toBe('user');
    expect(Array.isArray(content)).toBe(true);
    const stripped = content as Array<StrippedPart>;
    expect(stripped).toHaveLength(2);
    const [leading, voice] = stripped;
    expect(leading).toEqual({ type: 'text', text: 'Check this out' });
    expect(voice).toEqual({ type: 'text', text: '[Voice message]: Hello world' });
  });

  // An audio part without a transcript becomes the fixed placeholder text part.
  it('replaces audio part with placeholder when no transcript', () => {
    const input: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'Listen' },
        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } },
      ],
    };

    const { content } = stripAudioParts(input);

    expect(Array.isArray(content)).toBe(true);
    const stripped = content as Array<StrippedPart>;
    expect(stripped).toHaveLength(2);
    const [leading, placeholder] = stripped;
    expect(leading).toEqual({ type: 'text', text: 'Listen' });
    expect(placeholder).toEqual({ type: 'text', text: NO_TRANSCRIPT_TEXT });
  });

  // Text and image parts must survive stripping unmodified.
  it('keeps non-audio parts unchanged', () => {
    const input: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'caption' },
        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
      ],
    };

    const { content } = stripAudioParts(input);

    expect(content).toEqual([
      { type: 'text', text: 'caption' },
      { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
    ]);
  });

  // A lone audio part turns into one text part, which collapses to a string.
  it('simplifies to string content when only one text part remains after stripping', () => {
    const input: Message = {
      role: 'user',
      content: [
        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hi there' } },
      ],
    };

    expect(stripAudioParts(input)).toEqual({ role: 'user', content: '[Voice message]: Hi there' });
  });

  // Several audio parts are each replaced independently, preserving order.
  it('handles message with multiple audio parts', () => {
    const input: Message = {
      role: 'user',
      content: [
        { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'First message' } },
        { type: 'text', text: 'in between' },
        { type: 'audio', source: { media_type: 'audio/mpeg', data: 'BBBB' } },
      ],
    };

    const { content } = stripAudioParts(input);

    expect(Array.isArray(content)).toBe(true);
    const stripped = content as Array<StrippedPart>;
    expect(stripped).toHaveLength(3);
    const [transcribed, middle, untranscribed] = stripped;
    expect(transcribed).toEqual({ type: 'text', text: '[Voice message]: First message' });
    expect(middle).toEqual({ type: 'text', text: 'in between' });
    expect(untranscribed).toEqual({ type: 'text', text: NO_TRANSCRIPT_TEXT });
  });
});
|
||||
|
||||
Reference in New Issue
Block a user