feat(audio): add smart routing for native vs transcribed audio

- Create capabilities.ts with supportsAudioInput() detection
- Gemini, OpenAI, and GitHub Models get native audio passthrough
- Anthropic, Bedrock, Ollama, llama.cpp fall back to Whisper transcription
- routing.ts now checks model capability before deciding to transcribe
- Audio attachments are stripped for non-native models (only the transcript text is passed to the agent)
- Remove deprecated audioConfig from createMessageRouter deps (transcription settings are now read from config.audio)
This commit is contained in:
William Valentin
2026-02-11 18:20:10 -08:00
parent 32e1a2724a
commit 32ac4df20a
3 changed files with 94 additions and 10 deletions
+47 -10
View File
@@ -1,6 +1,7 @@
import type { AudioTranscriptionConfig } from '../models/media.js';
import type { Attachment } from '../channels/types.js';
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
import { supportsAudioInput } from '../models/capabilities.js';
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
import type { InboundMessage, OutboundMessage } from '../channels/index.js';
@@ -32,7 +33,6 @@ export function createMessageRouter(deps: {
agentConfigRegistry?: AgentConfigRegistry;
agentRouter?: AgentRouter;
sandboxManager?: SandboxManager;
audioConfig?: AudioTranscriptionConfig;
}): {
handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>) => Promise<void>;
agents: Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>;
@@ -213,18 +213,55 @@ export function createMessageRouter(deps: {
}
try {
// Transcribe audio attachments before processing
let messageText = msg.text;
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
if (audioAttachments.length > 0 && deps.audioConfig) {
for (const att of audioAttachments) {
const transcript = await transcribeAudio(att, deps.audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
// Determine if the active model supports native audio input
let effectiveTier: string = deps.config.agents.primary_tier ?? 'default';
if (msg.metadata?.modelTier) {
effectiveTier = msg.metadata.modelTier as string;
} else if (deps.agentRouter && deps.agentConfigRegistry) {
const agentName = deps.agentRouter.resolve(msg.channel, msg.senderId);
if (agentName) {
const agentCfg = deps.agentConfigRegistry.get(agentName);
if (agentCfg?.modelTier) {
effectiveTier = agentCfg.modelTier;
}
}
}
const response = await agent.process(messageText, msg.attachments);
// Look up provider/model for the effective tier
const modelsConfig = deps.config.models as Record<string, { provider?: string; model?: string } | undefined>;
const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default;
const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider;
const modelName = tierConfig?.model ?? deps.config.models.default.model;
const nativeAudioSupported = supportsAudioInput(modelProvider, modelName);
let messageText = msg.text;
let attachments = msg.attachments;
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
if (audioAttachments.length > 0 && !nativeAudioSupported) {
// Model doesn't support native audio — transcribe via Whisper and strip audio attachments
const audioConfig: AudioTranscriptionConfig | undefined = deps.config.audio?.enabled && deps.config.audio.provider
? {
endpoint: deps.config.audio.provider.endpoint,
apiKey: deps.config.audio.provider.api_key,
model: deps.config.audio.provider.model,
}
: undefined;
if (audioConfig?.endpoint) {
for (const att of audioAttachments) {
const transcript = await transcribeAudio(att, audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
}
}
// Remove audio attachments so buildUserMessage doesn't create audio content parts
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
if (attachments.length === 0) { attachments = undefined; }
}
// If native audio IS supported, we pass attachments through unchanged —
// buildUserMessage() in the agent will create native audio content parts
const response = await agent.process(messageText, attachments);
const outboundAttachments = collector.drain();
await reply({
text: response,