feat(audio): add smart routing for native vs transcribed audio
- Create capabilities.ts with supportsAudioInput() detection
- Gemini, OpenAI, and GitHub Models get native audio passthrough
- Anthropic, Bedrock, Ollama, llama.cpp fall back to Whisper transcription
- routing.ts now checks model capability before deciding to transcribe
- Audio attachments are stripped for non-native models (only transcript text passed)
- Remove deprecated audioConfig from createMessageRouter deps (read from config.audio)
This commit is contained in:
+42
-5
@@ -1,6 +1,7 @@
|
|||||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||||
import type { Attachment } from '../channels/types.js';
|
import type { Attachment } from '../channels/types.js';
|
||||||
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
||||||
|
import { supportsAudioInput } from '../models/capabilities.js';
|
||||||
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
||||||
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
||||||
import type { InboundMessage, OutboundMessage } from '../channels/index.js';
|
import type { InboundMessage, OutboundMessage } from '../channels/index.js';
|
||||||
@@ -32,7 +33,6 @@ export function createMessageRouter(deps: {
|
|||||||
agentConfigRegistry?: AgentConfigRegistry;
|
agentConfigRegistry?: AgentConfigRegistry;
|
||||||
agentRouter?: AgentRouter;
|
agentRouter?: AgentRouter;
|
||||||
sandboxManager?: SandboxManager;
|
sandboxManager?: SandboxManager;
|
||||||
audioConfig?: AudioTranscriptionConfig;
|
|
||||||
}): {
|
}): {
|
||||||
handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>) => Promise<void>;
|
handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>) => Promise<void>;
|
||||||
agents: Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>;
|
agents: Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>;
|
||||||
@@ -213,18 +213,55 @@ export function createMessageRouter(deps: {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Transcribe audio attachments before processing
|
// Determine if the active model supports native audio input
|
||||||
|
let effectiveTier: string = deps.config.agents.primary_tier ?? 'default';
|
||||||
|
if (msg.metadata?.modelTier) {
|
||||||
|
effectiveTier = msg.metadata.modelTier as string;
|
||||||
|
} else if (deps.agentRouter && deps.agentConfigRegistry) {
|
||||||
|
const agentName = deps.agentRouter.resolve(msg.channel, msg.senderId);
|
||||||
|
if (agentName) {
|
||||||
|
const agentCfg = deps.agentConfigRegistry.get(agentName);
|
||||||
|
if (agentCfg?.modelTier) {
|
||||||
|
effectiveTier = agentCfg.modelTier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look up provider/model for the effective tier
|
||||||
|
const modelsConfig = deps.config.models as Record<string, { provider?: string; model?: string } | undefined>;
|
||||||
|
const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default;
|
||||||
|
const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider;
|
||||||
|
const modelName = tierConfig?.model ?? deps.config.models.default.model;
|
||||||
|
const nativeAudioSupported = supportsAudioInput(modelProvider, modelName);
|
||||||
|
|
||||||
let messageText = msg.text;
|
let messageText = msg.text;
|
||||||
|
let attachments = msg.attachments;
|
||||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||||
|
|
||||||
if (audioAttachments.length > 0 && deps.audioConfig) {
|
if (audioAttachments.length > 0 && !nativeAudioSupported) {
|
||||||
|
// Model doesn't support native audio — transcribe via Whisper and strip audio attachments
|
||||||
|
const audioConfig: AudioTranscriptionConfig | undefined = deps.config.audio?.enabled && deps.config.audio.provider
|
||||||
|
? {
|
||||||
|
endpoint: deps.config.audio.provider.endpoint,
|
||||||
|
apiKey: deps.config.audio.provider.api_key,
|
||||||
|
model: deps.config.audio.provider.model,
|
||||||
|
}
|
||||||
|
: undefined;
|
||||||
|
|
||||||
|
if (audioConfig?.endpoint) {
|
||||||
for (const att of audioAttachments) {
|
for (const att of audioAttachments) {
|
||||||
const transcript = await transcribeAudio(att, deps.audioConfig);
|
const transcript = await transcribeAudio(att, audioConfig);
|
||||||
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Remove audio attachments so buildUserMessage doesn't create audio content parts
|
||||||
|
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
|
||||||
|
if (attachments.length === 0) { attachments = undefined; }
|
||||||
|
}
|
||||||
|
// If native audio IS supported, we pass attachments through unchanged —
|
||||||
|
// buildUserMessage() in the agent will create native audio content parts
|
||||||
|
|
||||||
const response = await agent.process(messageText, msg.attachments);
|
const response = await agent.process(messageText, attachments);
|
||||||
const outboundAttachments = collector.drain();
|
const outboundAttachments = collector.drain();
|
||||||
await reply({
|
await reply({
|
||||||
text: response,
|
text: response,
|
||||||
|
|||||||
@@ -0,0 +1,46 @@
|
|||||||
|
/**
|
||||||
|
* Model capability detection for native audio input support.
|
||||||
|
*
|
||||||
|
* Models that support native audio will receive raw audio data directly.
|
||||||
|
* Models that don't will receive a Whisper transcript as text instead.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Provider identifiers used when describing a model's backend. */
export type ModelProvider = 'anthropic' | 'openai' | 'gemini' | 'bedrock' | 'github' | 'ollama' | 'llamacpp' | 'openrouter' | 'zhipuai' | 'xai';

/** Providers that support native audio input in their API (lowercase ids). */
const AUDIO_CAPABLE_PROVIDERS = new Set<string>([
  'gemini',
  'openai',
  'github', // GitHub Models uses OpenAI-compatible API
]);

/**
 * Model families known NOT to support audio despite their provider supporting it.
 * For example, older OpenAI models or specialized models.
 *
 * Entries are lowercase family names. A model matches an entry when it is
 * exactly that name or a hyphen-suffixed variant of it (e.g. `gpt-4-0613`),
 * so dated snapshots do not need to be listed individually.
 */
const AUDIO_INCAPABLE_MODELS = new Set<string>([
  // Older OpenAI models that predate audio input support
  'gpt-3.5-turbo',
  'gpt-4',
  'gpt-4-turbo',
]);

/**
 * Report whether a normalized (trimmed, lowercase, vendor-prefix-free) model
 * id belongs to one of the audio-incapable families.
 *
 * The `-` separator is required for variant matching so that distinct newer
 * families sharing a textual prefix (`gpt-4o`, `gpt-4.1`) are NOT excluded.
 */
function isAudioIncapableModel(model: string): boolean {
  for (const family of AUDIO_INCAPABLE_MODELS) {
    if (model === family || model.startsWith(`${family}-`)) {
      return true;
    }
  }
  return false;
}

/**
 * Check whether a provider+model combination supports native audio input.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. A vendor
 * prefix on the model id (`openai/gpt-4`) is stripped before the exclusion
 * list is consulted.
 *
 * @param provider - Provider identifier (e.g. `openai`, `gemini`).
 * @param model - Model identifier as configured for that provider.
 * @returns true if the model can receive raw audio data directly via its API,
 *   false if audio must be transcribed to text before sending.
 */
export function supportsAudioInput(provider: string, model: string): boolean {
  // Config values reach this function through casts, so guard at runtime:
  // a missing provider can never be audio-capable.
  if (typeof provider !== 'string') {
    return false;
  }

  // Provider must be in the capable set
  if (!AUDIO_CAPABLE_PROVIDERS.has(provider.trim().toLowerCase())) {
    return false;
  }

  // A missing model id cannot match any exclusion; defer to the provider.
  const normalizedModel = typeof model === 'string' ? model.trim().toLowerCase() : '';

  // Drop a vendor namespace such as "openai/" so the bare family is compared.
  const bareModel = normalizedModel.slice(normalizedModel.lastIndexOf('/') + 1);

  // Check model-specific exclusions
  return !isAudioIncapableModel(bareModel);
}
|
||||||
@@ -8,6 +8,7 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
|
|||||||
export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
|
export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
|
||||||
export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
|
export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
|
||||||
export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
|
export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
|
||||||
|
export { supportsAudioInput } from './capabilities.js';
|
||||||
export {
|
export {
|
||||||
isSupportedImage,
|
isSupportedImage,
|
||||||
isSupportedAudio,
|
isSupportedAudio,
|
||||||
|
|||||||
Reference in New Issue
Block a user