diff --git a/src/config/schema.ts b/src/config/schema.ts index cded816..0b57612 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -52,6 +52,7 @@ const modelConfigBaseSchema = z.object({ for: z.array(z.string()).optional(), num_gpu: z.number().optional(), context_window: z.number().optional(), + supports_audio: z.boolean().optional(), }); const modelConfigSchema = modelConfigBaseSchema.extend({ diff --git a/src/context/tokens.test.ts b/src/context/tokens.test.ts index 0f1ae6e..d68a110 100644 --- a/src/context/tokens.test.ts +++ b/src/context/tokens.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { estimateTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js'; +import { estimateTokens, estimateAudioTokens, estimateMessageTokens, getContextWindow, shouldCompact, CONTEXT_WINDOWS } from './tokens.js'; describe('estimateTokens', () => { it('returns 0 for empty string', () => { @@ -20,6 +20,33 @@ describe('estimateTokens', () => { }); }); +describe('estimateAudioTokens', () => { + it('returns positive number for valid audio data', () => { + // 10000 base64 chars → ~7500 bytes → ~3.75s → ceil(3.75 * 32) = 120 + const source = { media_type: 'audio/ogg', data: 'A'.repeat(10000) }; + const tokens = estimateAudioTokens(source); + expect(tokens).toBeGreaterThan(0); + expect(tokens).toBe(120); + }); + + it('returns at least 1 for very short audio', () => { + // 1 byte of base64 data → very tiny duration, but minimum is 1 + const source = { media_type: 'audio/ogg', data: 'A' }; + expect(estimateAudioTokens(source)).toBe(1); + }); + + it('returns 0 for empty audio data', () => { + const source = { media_type: 'audio/ogg', data: '' }; + expect(estimateAudioTokens(source)).toBe(0); + }); + + it('longer audio data produces more tokens', () => { + const short = { media_type: 'audio/ogg', data: 'A'.repeat(1000) }; + const long = { media_type: 'audio/ogg', data: 'A'.repeat(100000) }; + expect(estimateAudioTokens(long)).toBeGreaterThan(estimateAudioTokens(short)); + }); +}); + describe('estimateMessageTokens', () => { it('returns 0 for empty array', () => { expect(estimateMessageTokens([])).toBe(0); @@ -38,6 +65,23 @@ describe('estimateMessageTokens', () => { ]; expect(estimateMessageTokens(messages)).toBe(10); }); + + it('includes audio token estimate for multimodal messages', () => { + // Text part: 'hello' = 5 chars → ceil(5/4) = 2 text tokens + // Audio part: 10000 base64 chars → 120 audio tokens (see estimateAudioTokens test) + // Overhead: 4 + // Total: 2 + 120 + 4 = 126 + const messages = [ + { + role: 'user' as const, + content: [ + { type: 'text' as const, text: 'hello' }, + { type: 'audio' as const, source: { media_type: 'audio/ogg', data: 'A'.repeat(10000) } }, + ], + }, + ]; + expect(estimateMessageTokens(messages)).toBe(126); + }); }); describe('getContextWindow', () => { diff --git a/src/context/tokens.ts b/src/context/tokens.ts index b3358a6..06e6a55 100644 --- a/src/context/tokens.ts +++ b/src/context/tokens.ts @@ -1,4 +1,4 @@ -import type { Message } from '../models/types.js'; +import type { Message, AudioSource } from '../models/types.js'; import { getMessageText } from '../models/media.js'; /** @@ -36,6 +36,25 @@ export function estimateTokens(text: string): number { return Math.ceil(text.length / 4); } +/** + * Estimate token count for an audio content part. + * + * Heuristic: + * 1. Decode base64 length to bytes: `base64Length * 0.75` + * 2. Assume ~16 kbps bitrate (typical voice OGG/Opus): `bytes / 2000` → seconds + * 3. Estimate ~32 tokens per second of audio (Gemini-style rate) + * + * Returns at least 1 token for any non-empty audio data. + */ +export function estimateAudioTokens(audioSource: AudioSource): number { + const base64Length = audioSource.data.length; + if (base64Length === 0) { + return 0; + } + const durationSeconds = (base64Length * 0.75) / 2000; + return Math.max(1, Math.ceil(durationSeconds * 32)); +} + /** * Estimate the total token count for an array of messages. * @@ -43,10 +62,20 @@ export function estimateTokens(text: string): number { * overhead of ~4 tokens to account for the role marker and separators. */ export function estimateMessageTokens(messages: Message[]): number { - return messages.reduce( - (sum, msg) => sum + estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS, - 0, - ); + return messages.reduce((sum, msg) => { + let tokens = estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS; + + // Add audio token estimates for multimodal messages + if (Array.isArray(msg.content)) { + for (const part of msg.content) { + if (part.type === 'audio') { + tokens += estimateAudioTokens(part.source); + } + } + } + + return sum + tokens; + }, 0); } /** diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index af3833c..fafa98e 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -232,7 +232,8 @@ export function createMessageRouter(deps: { const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default; const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider; const modelName = tierConfig?.model ?? deps.config.models.default.model; - const nativeAudioSupported = supportsAudioInput(modelProvider, modelName); + const supportsAudioOverride = (tierConfig as Record | undefined)?.supports_audio as boolean | undefined; + const nativeAudioSupported = supportsAudioInput(modelProvider, modelName, supportsAudioOverride); let messageText = msg.text; let attachments = msg.attachments; diff --git a/src/models/capabilities.test.ts b/src/models/capabilities.test.ts new file mode 100644 index 0000000..824ba43 --- /dev/null +++ b/src/models/capabilities.test.ts @@ -0,0 +1,60 @@ +import { describe, it, expect } from 'vitest'; +import { supportsAudioInput } from './capabilities.js'; + +describe('supportsAudioInput', () => { + describe('audio-capable providers with modern models', () => { + it('returns true for gemini with a modern model', () => { + expect(supportsAudioInput('gemini', 'gemini-1.5-pro')).toBe(true); + }); + + it('returns true for openai with a modern model', () => { + expect(supportsAudioInput('openai', 'gpt-4o')).toBe(true); + }); + + it('returns true for github with a modern model', () => { + expect(supportsAudioInput('github', 'gpt-4o')).toBe(true); + }); + }); + + describe('non-audio providers return false', () => { + const nonAudioProviders = [ + 'anthropic', + 'bedrock', + 'ollama', + 'llamacpp', + 'openrouter', + 'zhipuai', + 'xai', + ] as const; + + for (const provider of nonAudioProviders) { + it(`returns false for ${provider}`, () => { + expect(supportsAudioInput(provider, 'some-model')).toBe(false); + }); + } + }); + + describe('model-specific exclusions', () => { + const excludedModels = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo']; + + for (const model of excludedModels) { + it(`returns false for openai/${model} despite provider being capable`, () => { + expect(supportsAudioInput('openai', model)).toBe(false); + }); + + it(`returns false for github/${model} despite provider being capable`, () => { + expect(supportsAudioInput('github', model)).toBe(false); + }); + } + }); + + describe('unknown provider', () => { + it('returns false for a completely unknown provider', () => { + expect(supportsAudioInput('unknown-provider', 'some-model')).toBe(false); + }); + + it('returns false for an empty string provider', () => { + expect(supportsAudioInput('', 'some-model')).toBe(false); + }); + }); +}); diff --git a/src/models/capabilities.ts b/src/models/capabilities.ts index 5836b28..70dc91f 100644 --- a/src/models/capabilities.ts +++ b/src/models/capabilities.ts @@ -31,7 +31,9 @@ const AUDIO_INCAPABLE_MODELS = new Set([ * Returns true if the model can receive raw audio data directly via its API, * false if audio must be transcribed to text before sending. */ -export function supportsAudioInput(provider: string, model: string): boolean { +export function supportsAudioInput(provider: string, model: string, override?: boolean): boolean { + if (override !== undefined) return override; + // Provider must be in the capable set if (!AUDIO_CAPABLE_PROVIDERS.has(provider)) { return false; diff --git a/src/models/media.test.ts b/src/models/media.test.ts index 5dcaff6..95e251b 100644 --- a/src/models/media.test.ts +++ b/src/models/media.test.ts @@ -6,11 +6,14 @@ import { isSupportedImage, isSupportedAudio, attachmentToImageSource, + attachmentToAudioSource, buildUserMessage, getMessageText, getMessageTextWithTools, normalizeMessagesForLocal, hasImages, + hasAudio, + stripAudioParts, transcribeAudio, buildUserMessageWithAudio, type AudioTranscriptionConfig, @@ -820,3 +823,212 @@ describe('normalizeMessagesForLocal', () => { ]); }); }); + +// --------------------------------------------------------------------------- +// 12. attachmentToAudioSource +// --------------------------------------------------------------------------- + +describe('attachmentToAudioSource', () => { + // Positive: supported audio type with data returns AudioSource. + it('returns AudioSource for supported audio type with data', () => { + const result = attachmentToAudioSource(oggAudioAttachment); + + expect(result).toEqual({ + media_type: 'audio/ogg', + data: 'AAAAAAAAAAAAAAAAAAAA', + }); + }); + + // Negative: unsupported MIME type returns null. + it('returns null for unsupported mime type', () => { + const result = attachmentToAudioSource(pdfAttachment); + + expect(result).toBeNull(); + }); + + // Negative: supported audio type but no data returns null. + it('returns null when no data present', () => { + const noDataAudio = makeAttachment({ + mimeType: 'audio/ogg', + filename: 'voice.ogg', + }); + + const result = attachmentToAudioSource(noDataAudio); + + expect(result).toBeNull(); + }); + + // Negative: image attachment returns null. + it('returns null for image attachment', () => { + const result = attachmentToAudioSource(jpegBase64Attachment); + + expect(result).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// 13. hasAudio +// --------------------------------------------------------------------------- + +describe('hasAudio', () => { + // Negative: string content never has audio. + it('returns false for string content messages', () => { + const msg: Message = { role: 'user', content: 'no audio here' }; + + expect(hasAudio(msg)).toBe(false); + }); + + // Negative: multimodal messages with only text parts have no audio. + it('returns false for multimodal messages with only text parts', () => { + const msg: Message = { + role: 'user', + content: [{ type: 'text', text: 'just text' }], + }; + + expect(hasAudio(msg)).toBe(false); + }); + + // Negative: multimodal messages with only image parts have no audio. + it('returns false for multimodal messages with only image parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } }, + ], + }; + + expect(hasAudio(msg)).toBe(false); + }); + + // Positive: multimodal messages with audio parts are detected. + it('returns true for multimodal messages with audio parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } }, + ], + }; + + expect(hasAudio(msg)).toBe(true); + }); + + // Positive: multimodal messages with mixed image + audio parts are detected. + it('returns true for multimodal messages with mixed image+audio parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'img' } }, + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } }, + ], + }; + + expect(hasAudio(msg)).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// 14. stripAudioParts +// --------------------------------------------------------------------------- + +describe('stripAudioParts', () => { + // String content passes through unchanged. + it('returns unchanged message for string content', () => { + const msg: Message = { role: 'user', content: 'plain text' }; + + const result = stripAudioParts(msg); + + expect(result).toEqual({ role: 'user', content: 'plain text' }); + }); + + // Audio part with transcript is replaced with transcript text. + it('replaces audio part with transcript text when transcript is present', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'text', text: 'Check this out' }, + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hello world' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(result.role).toBe('user'); + expect(Array.isArray(result.content)).toBe(true); + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(2); + expect(parts[0]).toEqual({ type: 'text', text: 'Check this out' }); + expect(parts[1]).toEqual({ type: 'text', text: '[Voice message]: Hello world' }); + }); + + // Audio part without transcript is replaced with placeholder. + it('replaces audio part with placeholder when no transcript', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'text', text: 'Listen' }, + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(Array.isArray(result.content)).toBe(true); + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(2); + expect(parts[0]).toEqual({ type: 'text', text: 'Listen' }); + expect(parts[1]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' }); + }); + + // Non-audio parts (text + image) are kept unchanged. + it('keeps non-audio parts unchanged', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'text', text: 'caption' }, + { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(result.content).toEqual([ + { type: 'text', text: 'caption' }, + { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } }, + ]); + }); + + // Simplifies to string content when only one text part remains after stripping. + it('simplifies to string content when only one text part remains after stripping', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'Hi there' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(result).toEqual({ role: 'user', content: '[Voice message]: Hi there' }); + }); + + // Handles message with multiple audio parts. + it('handles message with multiple audio parts', () => { + const msg: Message = { + role: 'user', + content: [ + { type: 'audio', source: { media_type: 'audio/ogg', data: 'AAAA', transcript: 'First message' } }, + { type: 'text', text: 'in between' }, + { type: 'audio', source: { media_type: 'audio/mpeg', data: 'BBBB' } }, + ], + }; + + const result = stripAudioParts(msg); + + expect(Array.isArray(result.content)).toBe(true); + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(3); + expect(parts[0]).toEqual({ type: 'text', text: '[Voice message]: First message' }); + expect(parts[1]).toEqual({ type: 'text', text: 'in between' }); + expect(parts[2]).toEqual({ type: 'text', text: '[Audio message received but no transcript available]' }); + }); +});