feat: add audio transcription pipeline for voice messages
Adds Whisper-compatible audio transcription via a configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). The config schema gains an audio section with transcription_endpoint, transcription_api_key, and transcription_model. The daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (up from 25), now covering all audio paths.
This commit is contained in:
+49
-11
@@ -1,11 +1,15 @@
|
||||
import { Lifecycle } from './lifecycle.js';
|
||||
import type { Config, ModelConfig } from '../config/index.js';
|
||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||
import type { Attachment } from '../channels/types.js';
|
||||
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
||||
import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, GeminiClient, BedrockClient, GitHubModelsClient, ModelRouter, DEFAULT_RETRY_CONFIG } from '../models/index.js';
|
||||
import type { ModelClient, RetryConfig } from '../models/index.js';
|
||||
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
||||
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
||||
import { SessionStore, SessionManager } from '../session/index.js';
|
||||
import { HookEngine } from '../hooks/index.js';
|
||||
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
|
||||
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool } from '../tools/index.js';
|
||||
import type { Tool } from '../tools/types.js';
|
||||
import { MemoryStore } from '../memory/index.js';
|
||||
import { createMemoryTools } from '../tools/builtin/index.js';
|
||||
@@ -204,11 +208,12 @@ function createMessageRouter(deps: {
|
||||
agentConfigRegistry?: AgentConfigRegistry;
|
||||
agentRouter?: AgentRouter;
|
||||
sandboxManager?: SandboxManager;
|
||||
audioConfig?: AudioTranscriptionConfig;
|
||||
}) {
|
||||
// Cache agents by session ID + agent config name to avoid recreating on every message
|
||||
const agents = new Map<string, AgentOrchestrator>();
|
||||
const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
|
||||
|
||||
function getOrCreateAgent(channel: string, senderId: string): AgentOrchestrator {
|
||||
function getOrCreateAgent(channel: string, senderId: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
|
||||
// Resolve agent config name via routing (sender → channel → default fallback)
|
||||
const agentConfigName = deps.agentRouter?.resolve(channel, senderId);
|
||||
const agentConfig = agentConfigName ? deps.agentConfigRegistry?.get(agentConfigName) : undefined;
|
||||
@@ -218,8 +223,8 @@ function createMessageRouter(deps: {
|
||||
? `${channel}:${senderId}:${agentConfigName}`
|
||||
: `${channel}:${senderId}`;
|
||||
|
||||
let agent = agents.get(sessionId);
|
||||
if (!agent) {
|
||||
let entry = agents.get(sessionId);
|
||||
if (!entry) {
|
||||
const session = deps.sessionManager.getSession(channel, senderId);
|
||||
|
||||
// Use agent config overrides where available, falling back to global config
|
||||
@@ -286,7 +291,14 @@ function createMessageRouter(deps: {
|
||||
effectiveToolRegistry.replace(lazySandboxProcess);
|
||||
}
|
||||
|
||||
agent = new AgentOrchestrator({
|
||||
// Create an attachment collector for this agent session
|
||||
const collector = new OutboundAttachmentCollector();
|
||||
|
||||
// Clone the tool registry to register the media.send tool bound to this collector
|
||||
effectiveToolRegistry = effectiveToolRegistry.clone();
|
||||
effectiveToolRegistry.register(createMediaSendTool(collector));
|
||||
|
||||
const orchestrator = new AgentOrchestrator({
|
||||
modelRouter: deps.modelRouter,
|
||||
systemPrompt: effectiveSystemPrompt,
|
||||
session,
|
||||
@@ -307,14 +319,16 @@ function createMessageRouter(deps: {
|
||||
agent: effectiveTier,
|
||||
provider: effectiveProvider,
|
||||
},
|
||||
attachmentCollector: collector,
|
||||
});
|
||||
agents.set(sessionId, agent);
|
||||
entry = { orchestrator, collector };
|
||||
agents.set(sessionId, entry);
|
||||
}
|
||||
return agent;
|
||||
return entry;
|
||||
}
|
||||
|
||||
return async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>): Promise<void> => {
|
||||
const agent = getOrCreateAgent(msg.channel, msg.senderId);
|
||||
const { orchestrator: agent, collector } = getOrCreateAgent(msg.channel, msg.senderId);
|
||||
|
||||
// Handle special commands
|
||||
if (msg.metadata?.isCommand) {
|
||||
@@ -367,8 +381,24 @@ function createMessageRouter(deps: {
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await agent.process(msg.text, msg.attachments);
|
||||
await reply({ text: response, replyTo: msg.id });
|
||||
// Transcribe audio attachments before processing
|
||||
let messageText = msg.text;
|
||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||
|
||||
if (audioAttachments.length > 0 && deps.audioConfig) {
|
||||
for (const att of audioAttachments) {
|
||||
const transcript = await transcribeAudio(att, deps.audioConfig);
|
||||
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
||||
}
|
||||
}
|
||||
|
||||
const response = await agent.process(messageText, msg.attachments);
|
||||
const outboundAttachments = collector.drain();
|
||||
await reply({
|
||||
text: response,
|
||||
replyTo: msg.id,
|
||||
attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
|
||||
await reply({
|
||||
@@ -539,6 +569,13 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
||||
});
|
||||
}
|
||||
|
||||
// Initialize audio transcription config
|
||||
const audioConfig: AudioTranscriptionConfig = {
|
||||
endpoint: config.audio.transcription_endpoint,
|
||||
apiKey: config.audio.transcription_api_key,
|
||||
model: config.audio.transcription_model,
|
||||
};
|
||||
|
||||
// Initialize model router
|
||||
const modelRouter = createModelRouter(config);
|
||||
|
||||
@@ -593,6 +630,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
||||
agentConfigRegistry,
|
||||
agentRouter,
|
||||
sandboxManager,
|
||||
audioConfig,
|
||||
}));
|
||||
|
||||
// Register Telegram adapter
|
||||
|
||||
Reference in New Issue
Block a user