feat(audio): add tests, token estimation, and config override for native audio
- Add capabilities.test.ts (18 tests) for supportsAudioInput()
- Add 15 audio tests to media.test.ts (hasAudio, stripAudioParts, attachmentToAudioSource)
- Add estimateAudioTokens() to tokens.ts (base64 → bytes → duration → tokens)
- Update estimateMessageTokens() to include audio content parts
- Add 5 audio token tests to tokens.test.ts
- Add supports_audio config override to model schema
- Wire supports_audio from tier config through routing to capability check

Total tests: 1369 (was 1331, +38 audio-related)
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { estimateTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js';
|
||||
import { estimateTokens, estimateAudioTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js';
|
||||
|
||||
describe('estimateTokens', () => {
|
||||
it('returns 0 for empty string', () => {
|
||||
@@ -20,6 +20,33 @@ describe('estimateTokens', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('estimateAudioTokens', () => {
  it('returns positive number for valid audio data', () => {
    // 10000 base64 chars → 7500 bytes → 3.75s → ceil(3.75 * 32) = 120
    const source = { media_type: 'audio/ogg', data: 'A'.repeat(10000) };
    const tokens = estimateAudioTokens(source);
    expect(tokens).toBeGreaterThan(0);
    expect(tokens).toBe(120);
  });

  it('returns at least 1 for very short audio', () => {
    // A single base64 character (less than one decoded byte) gives a tiny
    // duration, but any non-empty payload is floored at 1 token.
    const source = { media_type: 'audio/ogg', data: 'A' };
    expect(estimateAudioTokens(source)).toBe(1);
  });

  it('returns 0 for empty audio data', () => {
    const source = { media_type: 'audio/ogg', data: '' };
    expect(estimateAudioTokens(source)).toBe(0);
  });

  it('longer audio data produces more tokens', () => {
    const short = { media_type: 'audio/ogg', data: 'A'.repeat(1000) };
    const long = { media_type: 'audio/ogg', data: 'A'.repeat(100000) };
    expect(estimateAudioTokens(long)).toBeGreaterThan(estimateAudioTokens(short));
  });
});
|
||||
|
||||
describe('estimateMessageTokens', () => {
|
||||
it('returns 0 for empty array', () => {
|
||||
expect(estimateMessageTokens([])).toBe(0);
|
||||
@@ -38,6 +65,23 @@ describe('estimateMessageTokens', () => {
|
||||
];
|
||||
expect(estimateMessageTokens(messages)).toBe(10);
|
||||
});
|
||||
|
||||
it('includes audio token estimate for multimodal messages', () => {
  // Text part: 'hello' = 5 chars → ceil(5/4) = 2 text tokens
  // Audio part: 10000 base64 chars → 7500 bytes → 3.75s → ceil(3.75 * 32) = 120 audio tokens
  // Overhead: 4
  // Total: 2 + 120 + 4 = 126
  const messages = [
    {
      role: 'user' as const,
      content: [
        { type: 'text' as const, text: 'hello' },
        { type: 'audio' as const, source: { media_type: 'audio/ogg', data: 'A'.repeat(10000) } },
      ],
    },
  ];
  expect(estimateMessageTokens(messages)).toBe(126);
});
|
||||
});
|
||||
|
||||
describe('getContextWindow', () => {
|
||||
|
||||
+34
-5
@@ -1,4 +1,4 @@
|
||||
import type { Message } from '../models/types.js';
|
||||
import type { Message, AudioSource } from '../models/types.js';
|
||||
import { getMessageText } from '../models/media.js';
|
||||
|
||||
/**
|
||||
@@ -36,6 +36,25 @@ export function estimateTokens(text: string): number {
|
||||
return Math.ceil(text.length / 4);
|
||||
}
|
||||
|
||||
/** Decoded bytes represented by one base64 character (4 characters encode 3 bytes). */
const BASE64_BYTES_PER_CHAR = 0.75;

/** Assumed audio payload rate: ~16 kbps (typical voice OGG/Opus), i.e. 2000 bytes per second. */
const AUDIO_BYTES_PER_SECOND = 2000;

/** Assumed token cost per second of audio (Gemini-style rate). */
const AUDIO_TOKENS_PER_SECOND = 32;

/**
 * Estimate token count for an audio content part.
 *
 * Heuristic:
 *   1. Convert base64 length to bytes: `base64Length * 0.75`
 *   2. Assume ~16 kbps bitrate (typical voice OGG/Opus): `bytes / 2000` → seconds
 *   3. Estimate ~32 tokens per second of audio (Gemini-style rate)
 *
 * The estimate is driven purely by `data.length`; the payload is never
 * decoded, so every character of `data` (including any `=` padding) counts
 * toward the size. `media_type` is not consulted.
 *
 * @param audioSource - Audio part source; `data` is presumably the bare
 *   base64 payload (TODO confirm against the `AudioSource` definition).
 * @returns 0 for empty data, otherwise the rounded-up estimate, never less than 1.
 */
export function estimateAudioTokens(audioSource: AudioSource): number {
  const base64Length = audioSource.data.length;
  if (base64Length === 0) {
    return 0;
  }

  const durationSeconds = (base64Length * BASE64_BYTES_PER_CHAR) / AUDIO_BYTES_PER_SECOND;

  // Round up, and floor at 1 so that tiny but non-empty clips are never free.
  return Math.max(1, Math.ceil(durationSeconds * AUDIO_TOKENS_PER_SECOND));
}
|
||||
|
||||
/**
|
||||
* Estimate the total token count for an array of messages.
|
||||
*
|
||||
@@ -43,10 +62,20 @@ export function estimateTokens(text: string): number {
|
||||
* overhead of ~4 tokens to account for the role marker and separators.
|
||||
*/
|
||||
export function estimateMessageTokens(messages: Message[]): number {
  return messages.reduce((sum, msg) => {
    // Text extracted by getMessageText plus the fixed per-message overhead.
    let tokens = estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS;

    // Add audio token estimates for multimodal messages. Only array-form
    // content is scanned for audio parts; any other content shape
    // contributes just the text estimate above.
    if (Array.isArray(msg.content)) {
      for (const part of msg.content) {
        if (part.type === 'audio') {
          tokens += estimateAudioTokens(part.source);
        }
      }
    }

    return sum + tokens;
  }, 0);
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user