feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via configurable endpoint.
New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(),
buildUserMessageWithAudio(). Config schema gains an audio section with
transcription_endpoint, transcription_api_key, and transcription_model.
Daemon wires transcription
into the message router. Channel adapters extract audio from voice/audio
messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp
ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
This commit is contained in:
William Valentin
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
+49 -11
View File
@@ -1,11 +1,15 @@
import { Lifecycle } from './lifecycle.js';
import type { Config, ModelConfig } from '../config/index.js';
import type { AudioTranscriptionConfig } from '../models/media.js';
import type { Attachment } from '../channels/types.js';
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, GeminiClient, BedrockClient, GitHubModelsClient, ModelRouter, DEFAULT_RETRY_CONFIG } from '../models/index.js';
import type { ModelClient, RetryConfig } from '../models/index.js';
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
import { SessionStore, SessionManager } from '../session/index.js';
import { HookEngine } from '../hooks/index.js';
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool } from '../tools/index.js';
import type { Tool } from '../tools/types.js';
import { MemoryStore } from '../memory/index.js';
import { createMemoryTools } from '../tools/builtin/index.js';
@@ -204,11 +208,12 @@ function createMessageRouter(deps: {
agentConfigRegistry?: AgentConfigRegistry;
agentRouter?: AgentRouter;
sandboxManager?: SandboxManager;
audioConfig?: AudioTranscriptionConfig;
}) {
// Cache agents by session ID + agent config name to avoid recreating on every message
const agents = new Map<string, AgentOrchestrator>();
const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
function getOrCreateAgent(channel: string, senderId: string): AgentOrchestrator {
function getOrCreateAgent(channel: string, senderId: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
// Resolve agent config name via routing (sender → channel → default fallback)
const agentConfigName = deps.agentRouter?.resolve(channel, senderId);
const agentConfig = agentConfigName ? deps.agentConfigRegistry?.get(agentConfigName) : undefined;
@@ -218,8 +223,8 @@ function createMessageRouter(deps: {
? `${channel}:${senderId}:${agentConfigName}`
: `${channel}:${senderId}`;
let agent = agents.get(sessionId);
if (!agent) {
let entry = agents.get(sessionId);
if (!entry) {
const session = deps.sessionManager.getSession(channel, senderId);
// Use agent config overrides where available, falling back to global config
@@ -286,7 +291,14 @@ function createMessageRouter(deps: {
effectiveToolRegistry.replace(lazySandboxProcess);
}
agent = new AgentOrchestrator({
// Create an attachment collector for this agent session
const collector = new OutboundAttachmentCollector();
// Clone the tool registry to register the media.send tool bound to this collector
effectiveToolRegistry = effectiveToolRegistry.clone();
effectiveToolRegistry.register(createMediaSendTool(collector));
const orchestrator = new AgentOrchestrator({
modelRouter: deps.modelRouter,
systemPrompt: effectiveSystemPrompt,
session,
@@ -307,14 +319,16 @@ function createMessageRouter(deps: {
agent: effectiveTier,
provider: effectiveProvider,
},
attachmentCollector: collector,
});
agents.set(sessionId, agent);
entry = { orchestrator, collector };
agents.set(sessionId, entry);
}
return agent;
return entry;
}
return async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>): Promise<void> => {
const agent = getOrCreateAgent(msg.channel, msg.senderId);
const { orchestrator: agent, collector } = getOrCreateAgent(msg.channel, msg.senderId);
// Handle special commands
if (msg.metadata?.isCommand) {
@@ -367,8 +381,24 @@ function createMessageRouter(deps: {
}
try {
const response = await agent.process(msg.text, msg.attachments);
await reply({ text: response, replyTo: msg.id });
// Transcribe audio attachments before processing
let messageText = msg.text;
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
if (audioAttachments.length > 0 && deps.audioConfig) {
for (const att of audioAttachments) {
const transcript = await transcribeAudio(att, deps.audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
}
}
const response = await agent.process(messageText, msg.attachments);
const outboundAttachments = collector.drain();
await reply({
text: response,
replyTo: msg.id,
attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
});
} catch (error) {
console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
await reply({
@@ -539,6 +569,13 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
});
}
// Initialize audio transcription config
const audioConfig: AudioTranscriptionConfig = {
endpoint: config.audio.transcription_endpoint,
apiKey: config.audio.transcription_api_key,
model: config.audio.transcription_model,
};
// Initialize model router
const modelRouter = createModelRouter(config);
@@ -593,6 +630,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
agentConfigRegistry,
agentRouter,
sandboxManager,
audioConfig,
}));
// Register Telegram adapter