feat: add audio transcription pipeline for voice messages

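Adds Whisper-compatible audio transcription via a configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). The config schema gains an audio section with transcription_endpoint, transcription_api_key, and transcription_model. The daemon wires transcription into the message router: supported audio attachments are transcribed and each transcript is prepended to the message text as "[Voice message]: ..." before the agent processes it. The router also pairs each cached agent with an OutboundAttachmentCollector, registers a collector-bound tool created by createMediaSendTool(), and includes any drained attachments in the reply. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (up from 25), covering all audio paths.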
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
@@ -1,11 +1,15 @@
 import { Lifecycle } from './lifecycle.js';
 import type { Config, ModelConfig } from '../config/index.js';
+import type { AudioTranscriptionConfig } from '../models/media.js';
+import type { Attachment } from '../channels/types.js';
+import { isSupportedAudio, transcribeAudio } from '../models/media.js';
 import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, GeminiClient, BedrockClient, GitHubModelsClient, ModelRouter, DEFAULT_RETRY_CONFIG } from '../models/index.js';
 import type { ModelClient, RetryConfig } from '../models/index.js';
 import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
+import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
 import { SessionStore, SessionManager } from '../session/index.js';
 import { HookEngine } from '../hooks/index.js';
-import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
+import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool } from '../tools/index.js';
 import type { Tool } from '../tools/types.js';
 import { MemoryStore } from '../memory/index.js';
 import { createMemoryTools } from '../tools/builtin/index.js';
@@ -204,11 +208,12 @@ function createMessageRouter(deps: {
  agentConfigRegistry?: AgentConfigRegistry;
  agentRouter?: AgentRouter;
  sandboxManager?: SandboxManager;
+  audioConfig?: AudioTranscriptionConfig;
 }) {
  // Cache agents by session ID + agent config name to avoid recreating on every message
-  const agents = new Map<string, AgentOrchestrator>();
+  const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();

-  function getOrCreateAgent(channel: string, senderId: string): AgentOrchestrator {
+  function getOrCreateAgent(channel: string, senderId: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
    // Resolve agent config name via routing (sender → channel → default fallback)
    const agentConfigName = deps.agentRouter?.resolve(channel, senderId);
    const agentConfig = agentConfigName ? deps.agentConfigRegistry?.get(agentConfigName) : undefined;
@@ -218,8 +223,8 @@ function createMessageRouter(deps: {
      ? `${channel}:${senderId}:${agentConfigName}`
      : `${channel}:${senderId}`;

-    let agent = agents.get(sessionId);
-    if (!agent) {
+    let entry = agents.get(sessionId);
+    if (!entry) {
      const session = deps.sessionManager.getSession(channel, senderId);

      // Use agent config overrides where available, falling back to global config
@@ -286,7 +291,14 @@ function createMessageRouter(deps: {
        effectiveToolRegistry.replace(lazySandboxProcess);
      }

-      agent = new AgentOrchestrator({
+      // Create an attachment collector for this agent session
+      const collector = new OutboundAttachmentCollector();
+
+      // Clone the tool registry to register the media.send tool bound to this collector
+      effectiveToolRegistry = effectiveToolRegistry.clone();
+      effectiveToolRegistry.register(createMediaSendTool(collector));
+
+      const orchestrator = new AgentOrchestrator({
        modelRouter: deps.modelRouter,
        systemPrompt: effectiveSystemPrompt,
        session,
@@ -307,14 +319,16 @@ function createMessageRouter(deps: {
          agent: effectiveTier,
          provider: effectiveProvider,
        },
+        attachmentCollector: collector,
      });
-      agents.set(sessionId, agent);
+      entry = { orchestrator, collector };
+      agents.set(sessionId, entry);
    }
-    return agent;
+    return entry;
  }

  return async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>): Promise<void> => {
-    const agent = getOrCreateAgent(msg.channel, msg.senderId);
+    const { orchestrator: agent, collector } = getOrCreateAgent(msg.channel, msg.senderId);

    // Handle special commands
    if (msg.metadata?.isCommand) {
@@ -367,8 +381,24 @@ function createMessageRouter(deps: {
    }

    try {
-      const response = await agent.process(msg.text, msg.attachments);
-      await reply({ text: response, replyTo: msg.id });
+      // Transcribe audio attachments before processing
+      let messageText = msg.text;
+      const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
+
+      if (audioAttachments.length > 0 && deps.audioConfig) {
+        for (const att of audioAttachments) {
+          const transcript = await transcribeAudio(att, deps.audioConfig);
+          messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
+        }
+      }
+
+      const response = await agent.process(messageText, msg.attachments);
+      const outboundAttachments = collector.drain();
+      await reply({
+        text: response,
+        replyTo: msg.id,
+        attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
+      });
    } catch (error) {
      console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
      await reply({
@@ -539,6 +569,13 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
    });
  }

+  // Initialize audio transcription config
+  const audioConfig: AudioTranscriptionConfig = {
+    endpoint: config.audio.transcription_endpoint,
+    apiKey: config.audio.transcription_api_key,
+    model: config.audio.transcription_model,
+  };
+
  // Initialize model router
  const modelRouter = createModelRouter(config);

@@ -593,6 +630,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
    agentConfigRegistry,
    agentRouter,
    sandboxManager,
+    audioConfig,
  }));

  // Register Telegram adapter