From 2a962abcd03518b01ebdc669b54f08b83756d8f6 Mon Sep 17 00:00:00 2001
From: William Valentin <william.valentin.info@gmail.com>
Date: Sat, 7 Feb 2026 09:09:13 -0800
Subject: [PATCH] feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via configurable endpoint.
New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(),
buildUserMessageWithAudio(). Config schema gains audio section with
transcription_endpoint, transcription_api_key, and transcription_model.
Daemon wires transcription into the message router. Channel adapters
extract audio from voice/audio messages (Telegram voice+audio, Discord
audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests
(was 25, now covers all audio paths).
---
 src/config/schema.ts     |   8 +
 src/daemon/index.ts      |  60 +++++--
 src/models/media.test.ts | 348 ++++++++++++++++++++++++++++++++++++++-
 src/models/media.ts      | 127 ++++++++++++++
 4 files changed, 531 insertions(+), 12 deletions(-)
diff --git a/src/config/schema.ts b/src/config/schema.ts
index 5d85fce..642c842 100644
--- a/src/config/schema.ts
+++ b/src/config/schema.ts
@@ -180,6 +180,12 @@ const webSearchSchema = z.object({
   max_results: z.number().min(1).max(20).default(5),
 }).default({});
 
+const audioSchema = z.object({
+  transcription_endpoint: z.string().optional(),
+  transcription_api_key: z.string().optional(),
+  transcription_model: z.string().default('whisper-1'),
+}).default({});
+
 // ── Tool policy schemas ──────────────────────────────────────────────
 
 const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
@@ -259,6 +265,7 @@ export const configSchema = z.object({
   browser: browserSchema,
   retry: retrySchema,
   web_search: webSearchSchema,
+  audio: audioSchema,
   prompt: promptSchema,
   tools: toolsSchema,
   sandbox: sandboxSchema,
@@ -274,6 +281,7 @@ export type AgentsConfig = z.infer<typeof agentsSchema>;
 export type CompactionConfig = z.infer<typeof compactionSchema>;
 export type MemoryConfig = z.infer<typeof memorySchema>;
 export type WebSearchConfig = z.infer<typeof webSearchSchema>;
+export type AudioConfig = z.infer<typeof audioSchema>;
 export type ProcessConfig = z.infer<typeof processSchema>;
 export type BrowserConfig = z.infer<typeof browserSchema>;
 export type DiscordConfig = z.infer<typeof discordSchema>;
diff --git a/src/daemon/index.ts b/src/daemon/index.ts
index 4ce088b..14bd960 100644
--- a/src/daemon/index.ts
+++ b/src/daemon/index.ts
@@ -1,11 +1,15 @@
 import { Lifecycle } from './lifecycle.js';
 import type { Config, ModelConfig } from '../config/index.js';
+import type { AudioTranscriptionConfig } from '../models/media.js';
+import type { Attachment } from '../channels/types.js';
+import { isSupportedAudio, transcribeAudio } from '../models/media.js';
 import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, GeminiClient, BedrockClient, GitHubModelsClient, ModelRouter, DEFAULT_RETRY_CONFIG } from '../models/index.js';
 import type { ModelClient, RetryConfig } from '../models/index.js';
 import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
+import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
 import { SessionStore, SessionManager } from '../session/index.js';
 import { HookEngine } from '../hooks/index.js';
-import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
+import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool } from '../tools/index.js';
 import type { Tool } from '../tools/types.js';
 import { MemoryStore } from '../memory/index.js';
 import { createMemoryTools } from '../tools/builtin/index.js';
@@ -204,11 +208,12 @@ function createMessageRouter(deps: {
   agentConfigRegistry?: AgentConfigRegistry;
   agentRouter?: AgentRouter;
   sandboxManager?: SandboxManager;
+  audioConfig?: AudioTranscriptionConfig;
 }) {
   // Cache agents by session ID + agent config name to avoid recreating on every message
-  const agents = new Map<string, AgentOrchestrator>();
+  const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
 
-  function getOrCreateAgent(channel: string, senderId: string): AgentOrchestrator {
+  function getOrCreateAgent(channel: string, senderId: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
     // Resolve agent config name via routing (sender → channel → default fallback)
     const agentConfigName = deps.agentRouter?.resolve(channel, senderId);
     const agentConfig = agentConfigName ? deps.agentConfigRegistry?.get(agentConfigName) : undefined;
@@ -218,8 +223,8 @@ function createMessageRouter(deps: {
       ? `${channel}:${senderId}:${agentConfigName}`
       : `${channel}:${senderId}`;
 
-    let agent = agents.get(sessionId);
-    if (!agent) {
+    let entry = agents.get(sessionId);
+    if (!entry) {
       const session = deps.sessionManager.getSession(channel, senderId);
 
       // Use agent config overrides where available, falling back to global config
@@ -286,7 +291,14 @@ function createMessageRouter(deps: {
         effectiveToolRegistry.replace(lazySandboxProcess);
       }
 
-      agent = new AgentOrchestrator({
+      // Create an attachment collector for this agent session
+      const collector = new OutboundAttachmentCollector();
+
+      // Clone the tool registry to register the media.send tool bound to this collector
+      effectiveToolRegistry = effectiveToolRegistry.clone();
+      effectiveToolRegistry.register(createMediaSendTool(collector));
+
+      const orchestrator = new AgentOrchestrator({
         modelRouter: deps.modelRouter,
         systemPrompt: effectiveSystemPrompt,
         session,
@@ -307,14 +319,16 @@ function createMessageRouter(deps: {
           agent: effectiveTier,
           provider: effectiveProvider,
         },
+        attachmentCollector: collector,
       });
-      agents.set(sessionId, agent);
+      entry = { orchestrator, collector };
+      agents.set(sessionId, entry);
     }
-    return agent;
+    return entry;
   }
 
   return async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>): Promise<void> => {
-    const agent = getOrCreateAgent(msg.channel, msg.senderId);
+    const { orchestrator: agent, collector } = getOrCreateAgent(msg.channel, msg.senderId);
 
     // Handle special commands
     if (msg.metadata?.isCommand) {
@@ -367,8 +381,24 @@ function createMessageRouter(deps: {
     }
 
     try {
-      const response = await agent.process(msg.text, msg.attachments);
-      await reply({ text: response, replyTo: msg.id });
+      // Transcribe audio attachments before processing; transcripts keep attachment order
+      const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
+      const transcripts: string[] = [];
+
+      if (audioAttachments.length > 0 && deps.audioConfig) {
+        for (const att of audioAttachments) {
+          transcripts.push(`[Voice message]: ${await transcribeAudio(att, deps.audioConfig)}`);
+        }
+      }
+      const messageText = [...transcripts, msg.text].join('\n\n');
+
+      const response = await agent.process(messageText, msg.attachments);
+      const outboundAttachments = collector.drain();
+      await reply({
+        text: response,
+        replyTo: msg.id,
+        attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
+      });
     } catch (error) {
       console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
       await reply({
@@ -539,6 +569,13 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
     });
   }
 
+  // Initialize audio transcription config
+  const audioConfig: AudioTranscriptionConfig = {
+    endpoint: config.audio.transcription_endpoint,
+    apiKey: config.audio.transcription_api_key,
+    model: config.audio.transcription_model,
+  };
+
   // Initialize model router
   const modelRouter = createModelRouter(config);
 
@@ -593,6 +630,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
     agentConfigRegistry,
     agentRouter,
     sandboxManager,
+    audioConfig,
   }));
 
   // Register Telegram adapter
diff --git a/src/models/media.test.ts b/src/models/media.test.ts
index 8ac73d2..529de43 100644
--- a/src/models/media.test.ts
+++ b/src/models/media.test.ts
@@ -1,12 +1,18 @@
-import { describe, it, expect } from 'vitest';
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { vi } from 'vitest';
 import type { Attachment } from '../channels/types.js';
 import type { Message } from './types.js';
 import {
   isSupportedImage,
+  isSupportedAudio,
   attachmentToImageSource,
   buildUserMessage,
   getMessageText,
   hasImages,
+  transcribeAudio,
+  buildUserMessageWithAudio,
+  type AudioTranscriptionConfig,
+  mimeToExtension,
 } from './media.js';
 
 // ---------------------------------------------------------------------------
@@ -34,6 +40,30 @@ const pdfAttachment: Attachment = makeAttachment({
   filename: 'doc.pdf',
 });
 
+const oggAudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/ogg',
+  data: 'AAAAAAAAAAAAAAAAAAAA',
+  filename: 'voice.ogg',
+});
+
+const mp3AudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/mpeg',
+  data: 'AAAAAQAAAAAAAEAAABkAAABTQA=', // Placeholder bytes, not decodable MP3 audio; tests stub fetch so it is never parsed
+  filename: 'audio.mp3',
+});
+
+const wavAudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/wav',
+  data: 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=', // Base64 of a short WAV
+  filename: 'audio.wav',
+});
+
+const m4aAudioAttachment: Attachment = makeAttachment({
+  mimeType: 'audio/x-m4a',
+  data: 'AAAAUGV0Zi4xLjAgc291cmNlIGZvciBzdGFydHBvaW50', // Placeholder bytes, not a real M4A container
+  filename: 'audio.m4a',
+});
+
 // ---------------------------------------------------------------------------
 // 1. isSupportedImage
 // ---------------------------------------------------------------------------
@@ -259,3 +289,319 @@ describe('hasImages', () => {
     expect(hasImages(msg)).toBe(true);
   });
 });
+
+// ---------------------------------------------------------------------------
+// 6. isSupportedAudio
+// ---------------------------------------------------------------------------
+
+describe('isSupportedAudio', () => {
+  // Positive: all supported audio MIME types should return true.
+  it.each([
+    'audio/ogg',
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/webm',
+    'audio/mp4',
+    'audio/x-m4a',
+  ])('returns true for supported type %s', (mime) => {
+    expect(isSupportedAudio(makeAttachment({ mimeType: mime }))).toBe(true);
+  });
+
+  // Negative: unsupported MIME types should return false.
+  it.each([
+    'audio/flac',
+    'audio/aac',
+    'audio/wma',
+    'application/pdf',
+    'image/jpeg',
+    'text/plain',
+  ])('returns false for unsupported type %s', (mime) => {
+    expect(isSupportedAudio(makeAttachment({ mimeType: mime }))).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 7. mimeToExtension
+// ---------------------------------------------------------------------------
+
+describe('mimeToExtension', () => {
+  it('returns correct extension for audio/ogg', () => {
+    expect(mimeToExtension('audio/ogg')).toBe('ogg');
+  });
+
+  it('returns correct extension for audio/mpeg', () => {
+    expect(mimeToExtension('audio/mpeg')).toBe('mp3');
+  });
+
+  it('returns correct extension for audio/wav', () => {
+    expect(mimeToExtension('audio/wav')).toBe('wav');
+  });
+
+  it('returns correct extension for audio/webm', () => {
+    expect(mimeToExtension('audio/webm')).toBe('webm');
+  });
+
+  it('returns correct extension for audio/mp4', () => {
+    expect(mimeToExtension('audio/mp4')).toBe('m4a');
+  });
+
+  it('returns correct extension for audio/x-m4a', () => {
+    expect(mimeToExtension('audio/x-m4a')).toBe('m4a');
+  });
+
+  it('returns bin for unknown MIME type', () => {
+    expect(mimeToExtension('audio/flac')).toBe('bin');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 8. transcribeAudio
+// ---------------------------------------------------------------------------
+
+describe('transcribeAudio', () => {
+  const mockTranscript = 'Hello, this is a test transcription';
+  const originalFetch = global.fetch;
+
+  beforeEach(() => {
+    global.fetch = vi.fn();
+  });
+
+  afterEach(() => {
+    global.fetch = originalFetch;
+  });
+
+  // Positive: transcribes audio with valid config.
+  it('transcribes audio successfully with valid config', async () => {
+    // Mock fetch to avoid actual API calls
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: mockTranscript }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+      apiKey: 'test-key',
+      model: 'test-model',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config);
+
+    expect(result).toBe(mockTranscript);
+    expect(global.fetch).toHaveBeenCalledWith(
+      'https://api.example.com/v1/audio/transcriptions',
+      expect.objectContaining({
+        method: 'POST',
+        body: expect.any(FormData),
+      }),
+    );
+  });
+
+  // Negative: returns placeholder when endpoint is missing.
+  it('returns placeholder message when endpoint is not configured', async () => {
+    const result = await transcribeAudio(oggAudioAttachment);
+
+    expect(result).toBe('[Audio message received but no transcription service is configured]');
+  });
+
+  // Negative: returns placeholder when API fails.
+  it('returns placeholder message when API returns error', async () => {
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: false,
+      status: 500,
+      statusText: 'Internal Server Error',
+      text: async () => 'Internal Server Error',
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config);
+
+    expect(result).toBe('[Audio message transcription failed]');
+  });
+
+  // Negative: handles network errors gracefully.
+  it('returns placeholder message on network error', async () => {
+    vi.mocked(global.fetch).mockRejectedValue(new Error('Network error'));
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config);
+
+    expect(result).toBe('[Audio message transcription failed]');
+  });
+
+  // Positive: sends "whisper-1" as the model form field when config.model is unset.
+  it('uses whisper-1 model by default', async () => {
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.openai.com/v1/audio/transcriptions',
+    };
+
+    // Mock fetch to avoid actual API calls
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'test' }),
+    } as Response);
+
+    await transcribeAudio(oggAudioAttachment, config);
+
+    expect(global.fetch).toHaveBeenCalledWith(
+      'https://api.openai.com/v1/audio/transcriptions',
+      expect.objectContaining({ body: expect.any(FormData) }),
+    );
+    const sentBody = vi.mocked(global.fetch).mock.calls[0]?.[1]?.body as FormData;
+    expect(sentBody.get('model')).toBe('whisper-1');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 9. buildUserMessageWithAudio
+// ---------------------------------------------------------------------------
+
+describe('buildUserMessageWithAudio', () => {
+  const textMessage = 'What is 2 + 2?';
+  const originalFetch = global.fetch;
+
+  beforeEach(() => {
+    global.fetch = vi.fn();
+  });
+
+  afterEach(() => {
+    global.fetch = originalFetch;
+  });
+
+  // Positive: plain text message when no attachments.
+  it('returns plain text message when no attachments', async () => {
+    const result = await buildUserMessageWithAudio(textMessage);
+
+    expect(result).toEqual({ role: 'user', content: textMessage });
+  });
+
+  // Positive: includes transcription when audio attachment present.
+  it('includes transcription when audio attachment is present', async () => {
+    // Mock fetch to avoid actual API calls
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'The answer is 4' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);
+
+    expect(result.role).toBe('user');
+    expect(result.content).toContain('[Voice message]:');
+    expect(result.content).toContain('The answer is 4');
+    expect(result.content).toContain(textMessage);
+  });
+
+  // Positive: transcribes multiple audio attachments.
+  it('transcribes multiple audio attachments', async () => {
+    // Distinct transcript per request, so a dropped or duplicated clip is detectable
+    vi.mocked(global.fetch)
+      .mockResolvedValueOnce({ ok: true, json: async () => ({ text: 'First clip' }) } as Response)
+      .mockResolvedValueOnce({ ok: true, json: async () => ({ text: 'Second clip' }) } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(
+      textMessage,
+      [oggAudioAttachment, mp3AudioAttachment],
+      config,
+    );
+
+    expect(result.content).toContain('[Voice message]: First clip');
+    expect(result.content).toContain('[Voice message]: Second clip');
+    expect(global.fetch).toHaveBeenCalledTimes(2);
+  });
+
+  // Positive: audio transcripts appear before original text.
+  it('places audio transcripts before original message text', async () => {
+    // Mock fetch to avoid actual API calls
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'The answer is 4' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);
+
+    const content = Array.isArray(result.content) ? result.content : [{ type: 'text' as const, text: result.content }];
+    const textPart = content.find((p) => p.type === 'text') as { type: 'text'; text: string } | undefined;
+    expect(textPart).toBeDefined();
+
+    const textContent = textPart!.text || '';
+    const firstVoiceIndex = textContent.indexOf('[Voice message]:');
+    const textIndex = textContent.indexOf(textMessage);
+
+    expect(firstVoiceIndex).toBeLessThan(textIndex);
+  });
+
+  // Positive: handles mixed image and audio attachments.
+  it('handles mixed image and audio attachments', async () => {
+    // Mock fetch to avoid actual API calls
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'The answer is 4' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio(
+      textMessage,
+      [jpegBase64Attachment, oggAudioAttachment, pngUrlAttachment],
+      config,
+    );
+
+    expect(result.role).toBe('user');
+    expect(Array.isArray(result.content)).toBe(true);
+
+    const parts = result.content as Array<{ type: string; text?: string }>;
+    expect(parts).toHaveLength(3); // one text part (transcript + message); the rest are image parts
+
+    const textPart = parts.find((p) => p.type === 'text');
+    expect(textPart?.text).toContain('[Voice message]:');
+    expect(textPart?.text).toContain(textMessage);
+
+    const imagePart = parts.find((p) => p.type === 'image');
+    expect(imagePart).toBeDefined();
+  });
+
+  // Positive: no transcription when audio config is missing.
+  it('returns original message when audio config is missing', async () => {
+    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment]);
+
+    expect(result).toEqual({ role: 'user', content: textMessage });
+  });
+
+  // Positive: empty text with audio attachments.
+  it('handles empty text with audio attachments', async () => {
+    // Mock fetch to avoid actual API calls
+    vi.mocked(global.fetch).mockResolvedValue({
+      ok: true,
+      json: async () => ({ text: 'Test' }),
+    } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+    };
+
+    const result = await buildUserMessageWithAudio('', [oggAudioAttachment], config);
+
+    expect(result.role).toBe('user');
+    expect(result.content).toContain('[Voice message]:');
+  });
+});
diff --git a/src/models/media.ts b/src/models/media.ts
index 796d946..3fbe630 100644
--- a/src/models/media.ts
+++ b/src/models/media.ts
@@ -13,11 +13,41 @@ const SUPPORTED_IMAGE_TYPES = new Set([
   'image/webp',
 ]);
 
+/** MIME types that are audio (not image). */
+const SUPPORTED_AUDIO_TYPES = new Set([
+  'audio/ogg',
+  'audio/mpeg',
+  'audio/mp3',
+  'audio/wav',
+  'audio/webm',
+  'audio/mp4',
+  'audio/x-m4a',
+]);
+
 /** Check whether an attachment is a supported image type. */
 export function isSupportedImage(attachment: Attachment): boolean {
   return SUPPORTED_IMAGE_TYPES.has(attachment.mimeType);
 }
 
+/** Reduce a MIME type to lowercase "type/subtype", dropping parameters such as "; codecs=opus". */
+function bareMimeType(mime: string | undefined): string {
+  return (mime ?? '').replace(/;.*$/, '').trim().toLowerCase();
+}
+
+/** Check whether an attachment is a supported audio type (MIME case and parameters are ignored). */
+export function isSupportedAudio(attachment: Attachment): boolean {
+  return SUPPORTED_AUDIO_TYPES.has(bareMimeType(attachment.mimeType));
+}
+
+/** Convert an audio MIME type to a file extension; unrecognized types yield "bin". */
+export function mimeToExtension(mime: string): string {
+  const extensions = new Map<string, string>([
+    ['audio/ogg', 'ogg'], ['audio/mpeg', 'mp3'], ['audio/mp3', 'mp3'], ['audio/wav', 'wav'],
+    ['audio/webm', 'webm'], ['audio/mp4', 'm4a'], ['audio/x-m4a', 'm4a'],
+  ]);
+  return extensions.get(bareMimeType(mime)) ?? 'bin';
+}
+
 /** Convert a channel Attachment to a model ImageSource. Prefers base64 data, falls back to URL. */
 export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
   if (!isSupportedImage(attachment)) {
@@ -90,6 +120,103 @@ export function getMessageText(message: Message): string {
     .join('');
 }
 
+/** Configuration for audio transcription via Whisper-compatible API. */
+export interface AudioTranscriptionConfig {
+  /** Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions") */
+  endpoint?: string;
+  /** API key for the transcription service */
+  apiKey?: string;
+  /** Model name (default: "whisper-1") */
+  model?: string;
+}
+
+/**
+ * Transcribe an audio attachment to text using the OpenAI-compatible /v1/audio/transcriptions API.
+ * Never throws: returns a bracketed placeholder if no endpoint is configured or the request fails.
+ */
+export async function transcribeAudio(
+  attachment: Attachment,
+  config?: AudioTranscriptionConfig,
+): Promise<string> {
+  if (!config?.endpoint) {
+    return '[Audio message received but no transcription service is configured]';
+  }
+
+  try {
+    if (!attachment.data) throw new Error('attachment has no inline audio data');
+    const audioBuffer = Buffer.from(attachment.data, 'base64');
+    const ext = mimeToExtension(attachment.mimeType);
+    const formData = new FormData();
+    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
+    formData.append('model', config.model ?? 'whisper-1');
+
+    const headers: Record<string, string> = config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {};
+
+    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
+    if (!res.ok) {
+      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
+    }
+    // A 2xx body without a string `text` must not leak `undefined` into the prompt.
+    const { text } = (await res.json()) as { text?: unknown };
+    if (typeof text !== 'string') throw new Error('Transcription response has no "text" field');
+    return text;
+  } catch (error) {
+    console.error(
+      `Failed to transcribe audio (${attachment.mimeType}):`,
+      error instanceof Error ? error.message : 'Unknown error',
+    );
+    return '[Audio message transcription failed]';
+  }
+}
+
+/**
+ * Build a user Message from text + attachments, with optional audio transcription.
+ * When `audioConfig.endpoint` is set, audio transcripts precede `text` in attachment order.
+ * Image attachments become image content parts placed after the text part.
+ */
+export async function buildUserMessageWithAudio(
+  text: string,
+  attachments?: Attachment[],
+  audioConfig?: AudioTranscriptionConfig,
+): Promise<Message> {
+  // Separate image and audio attachments
+  const imageAttachments = (attachments ?? []).filter(a => isSupportedImage(a));
+  const audioAttachments = (attachments ?? []).filter(a => isSupportedAudio(a));
+
+  // Transcribe sequentially and collect in attachment order. Prepending each
+  // transcript to the running text would reverse multi-part voice messages.
+  const transcripts: string[] = [];
+  if (audioConfig?.endpoint) {
+    for (const audioAttachment of audioAttachments) {
+      const transcript = await transcribeAudio(audioAttachment, audioConfig);
+      transcripts.push(`[Voice message]: ${transcript}`);
+    }
+  }
+  const processedText = [...transcripts, text].join('\n\n');
+
+  // Convert image attachments to content parts
+  const imageParts: MessageContentPart[] = [];
+  for (const att of imageAttachments) {
+    const source = attachmentToImageSource(att);
+    if (source) {
+      imageParts.push({ type: 'image', source });
+    }
+  }
+
+  // No usable images — return a plain text message (any transcripts included)
+  if (imageParts.length === 0) {
+    return { role: 'user', content: processedText };
+  }
+
+  // Build multimodal content: text first, then images
+  const parts: MessageContentPart[] = [];
+  if (processedText) {
+    parts.push({ type: 'text', text: processedText });
+  }
+  parts.push(...imageParts);
+  return { role: 'user', content: parts };
+}
+
 /**
  * Check whether a message contains image content parts.
  */