diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index 657d694..af3833c 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -1,6 +1,7 @@ import type { AudioTranscriptionConfig } from '../models/media.js'; import type { Attachment } from '../channels/types.js'; import { isSupportedAudio, transcribeAudio } from '../models/media.js'; +import { supportsAudioInput } from '../models/capabilities.js'; import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js'; import { OutboundAttachmentCollector } from '../backends/native/attachments.js'; import type { InboundMessage, OutboundMessage } from '../channels/index.js'; @@ -32,7 +33,6 @@ export function createMessageRouter(deps: { agentConfigRegistry?: AgentConfigRegistry; agentRouter?: AgentRouter; sandboxManager?: SandboxManager; - audioConfig?: AudioTranscriptionConfig; }): { handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise) => Promise; agents: Map; @@ -213,18 +213,55 @@ export function createMessageRouter(deps: { } try { - // Transcribe audio attachments before processing - let messageText = msg.text; - const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); - - if (audioAttachments.length > 0 && deps.audioConfig) { - for (const att of audioAttachments) { - const transcript = await transcribeAudio(att, deps.audioConfig); - messageText = `[Voice message]: ${transcript}\n\n${messageText}`; + // Determine if the active model supports native audio input + let effectiveTier: string = deps.config.agents.primary_tier ?? 
'default'; + if (msg.metadata?.modelTier) { + effectiveTier = msg.metadata.modelTier as string; + } else if (deps.agentRouter && deps.agentConfigRegistry) { + const agentName = deps.agentRouter.resolve(msg.channel, msg.senderId); + if (agentName) { + const agentCfg = deps.agentConfigRegistry.get(agentName); + if (agentCfg?.modelTier) { + effectiveTier = agentCfg.modelTier; + } } } - const response = await agent.process(messageText, msg.attachments); + // Look up provider/model for the effective tier + const modelsConfig = deps.config.models as Record; + const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default; + const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider; + const modelName = tierConfig?.model ?? deps.config.models.default.model; + const nativeAudioSupported = supportsAudioInput(modelProvider, modelName); + + let messageText = msg.text; + let attachments = msg.attachments; + const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); + + if (audioAttachments.length > 0 && !nativeAudioSupported) { + // Model doesn't support native audio — transcribe via Whisper and strip audio attachments + const audioConfig: AudioTranscriptionConfig | undefined = deps.config.audio?.enabled && deps.config.audio.provider + ? { + endpoint: deps.config.audio.provider.endpoint, + apiKey: deps.config.audio.provider.api_key, + model: deps.config.audio.provider.model, + } + : undefined; + + if (audioConfig?.endpoint) { + for (const att of audioAttachments) { + const transcript = await transcribeAudio(att, audioConfig); + messageText = `[Voice message]: ${transcript}\n\n${messageText}`; + } + } + // Remove audio attachments so buildUserMessage doesn't create audio content parts + attachments = (msg.attachments ?? 
[]).filter((a: Attachment) => !isSupportedAudio(a)); + if (attachments.length === 0) { attachments = undefined; } + } + // If native audio IS supported, we pass attachments through unchanged — + // buildUserMessage() in the agent will create native audio content parts + + const response = await agent.process(messageText, attachments); const outboundAttachments = collector.drain(); await reply({ text: response, diff --git a/src/models/capabilities.ts b/src/models/capabilities.ts new file mode 100644 index 0000000..5836b28 --- /dev/null +++ b/src/models/capabilities.ts @@ -0,0 +1,46 @@ +/** + * Model capability detection for native audio input support. + * + * Models that support native audio will receive raw audio data directly. + * Models that don't will receive a Whisper transcript as text instead. + */ + +export type ModelProvider = 'anthropic' | 'openai' | 'gemini' | 'bedrock' | 'github' | 'ollama' | 'llamacpp' | 'openrouter' | 'zhipuai' | 'xai'; + +/** Providers that support native audio input in their API. */ +const AUDIO_CAPABLE_PROVIDERS = new Set([ + 'gemini', + 'openai', + 'github', // GitHub Models uses OpenAI-compatible API +]); + +/** + * Models known NOT to support audio despite their provider supporting it. + * For example, older OpenAI models or specialized models. + */ +const AUDIO_INCAPABLE_MODELS = new Set([ + // Older OpenAI models that predate audio input support + 'gpt-3.5-turbo', + 'gpt-4', + 'gpt-4-turbo', +]); + +/** + * Check whether a provider+model combination supports native audio input. + * + * Returns true if the model can receive raw audio data directly via its API, + * false if audio must be transcribed to text before sending. 
+ */
+export function supportsAudioInput(provider: string, model: string): boolean {
+  // Parameters are plain strings, so normalise case and whitespace before the Set lookups.
+  const providerId = provider.trim().toLowerCase();
+  const modelId = model.trim().toLowerCase();
+  if (!AUDIO_CAPABLE_PROVIDERS.has(providerId)) return false;
+
+  // An exclusion also covers its suffixed variants (e.g. "gpt-4-0613"); requiring the "-"
+  // boundary keeps distinct families such as "gpt-4o" from matching the "gpt-4" entry.
+  for (const excluded of AUDIO_INCAPABLE_MODELS) {
+    if (modelId === excluded || modelId.startsWith(`${excluded}-`)) return false;
+  }
+  return true;
+}
diff --git a/src/models/index.ts b/src/models/index.ts
index 5850a63..72574a0 100644
--- a/src/models/index.ts
+++ b/src/models/index.ts
@@ -8,6 +8,7 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
 export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
+export { supportsAudioInput } from './capabilities.js';
 export {
   isSupportedImage,
   isSupportedAudio,