feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). Config schema gains audio section with transcription_endpoint, api_key, and model. Daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
@@ -13,11 +13,41 @@ const SUPPORTED_IMAGE_TYPES = new Set([
  'image/webp',
 ]);

+/**
+ * MIME types treated as audio attachments (as opposed to images).
+ * Membership is an exact string match; see isSupportedAudio().
+ */
+const SUPPORTED_AUDIO_TYPES: Set<string> = new Set([
+  'audio/ogg', 'audio/mpeg', 'audio/mp3', 'audio/wav',
+  'audio/webm', 'audio/mp4', 'audio/x-m4a',
+]);
+
 /**
  * Report whether the attachment's MIME type is one of SUPPORTED_IMAGE_TYPES.
  *
  * @param attachment - Channel attachment to inspect; only `mimeType` is read.
  * @returns `true` when the MIME type is in the supported image set.
  */
 export function isSupportedImage(attachment: Attachment): boolean {
   const { mimeType } = attachment;
   return SUPPORTED_IMAGE_TYPES.has(mimeType);
 }

+/**
+ * Report whether the attachment's MIME type is one of SUPPORTED_AUDIO_TYPES.
+ *
+ * @param attachment - Channel attachment to inspect; only `mimeType` is read.
+ * @returns `true` when the MIME type is in the supported audio set.
+ */
+export function isSupportedAudio(attachment: Attachment): boolean {
+  const { mimeType } = attachment;
+  return SUPPORTED_AUDIO_TYPES.has(mimeType);
+}
+
+/** Audio MIME type → file extension, keyed by the lowercase bare type (no parameters). */
+const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map([
+  ['audio/ogg', 'ogg'],
+  ['audio/mpeg', 'mp3'],
+  ['audio/mp3', 'mp3'],
+  ['audio/wav', 'wav'],
+  ['audio/webm', 'webm'],
+  ['audio/mp4', 'm4a'],
+  ['audio/x-m4a', 'm4a'],
+]);
+
+/**
+ * Convert an audio MIME type to a file extension (without the leading dot).
+ *
+ * The lookup ignores letter case, surrounding whitespace, and any parameters
+ * after `;`, so `audio/ogg; codecs=opus` resolves to `ogg`.
+ *
+ * @param mime - MIME type string, e.g. `audio/mpeg`.
+ * @returns The mapped extension, or `'bin'` for anything not in the table.
+ */
+export function mimeToExtension(mime: string): string {
+  // A Map has no inherited keys, so inputs such as 'constructor' or
+  // 'toString' fall through to 'bin' instead of leaking a non-string value
+  // from Object.prototype the way a plain-object lookup would.
+  const [rawType = ''] = mime.split(';', 1);
+  const bareType = rawType.trim().toLowerCase();
+  return AUDIO_EXTENSION_BY_MIME.get(bareType) ?? 'bin';
+}
+
 /** Convert a channel Attachment to a model ImageSource. Prefers base64 data, falls back to URL. */
 export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
  if (!isSupportedImage(attachment)) {
@@ -90,6 +120,103 @@ export function getMessageText(message: Message): string {
    .join('');
 }

+/**
+ * Configuration for audio transcription via Whisper-compatible API.
+ * Every field is optional; passed as the second argument to transcribeAudio().
+ */
+export interface AudioTranscriptionConfig {
+  /**
+   * Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions").
+   * When unset, transcribeAudio() makes no request and returns a placeholder string.
+   */
+  endpoint?: string;
+  /**
+   * API key for the transcription service.
+   * When set, it is sent as an `Authorization: Bearer <key>` header; when unset, no
+   * Authorization header is added.
+   */
+  apiKey?: string;
+  /** Model name sent as the `model` form field (default: "whisper-1") */
+  model?: string;
+}
+
+/**
+ * Transcribe an audio attachment to text using the OpenAI-compatible /v1/audio/transcriptions API.
+ *
+ * The attachment's base64 `data` is decoded and POSTed as multipart form data
+ * (`file` + `model`) to `config.endpoint`. This function does not reject:
+ * every failure is logged via `console.error` and mapped to a placeholder.
+ *
+ * @param attachment - Audio attachment; `data` must hold the base64-encoded audio bytes.
+ * @param config - Transcription settings. Without `endpoint`, no request is made.
+ * @returns The transcript text, or one of two bracketed placeholder strings:
+ *   one when no endpoint is configured, one when transcription fails.
+ */
+export async function transcribeAudio(
+  attachment: Attachment,
+  config?: AudioTranscriptionConfig,
+): Promise<string> {
+  if (!config?.endpoint) {
+    return '[Audio message received but no transcription service is configured]';
+  }
+
+  try {
+    // Fail with a readable reason instead of letting Buffer.from(undefined)
+    // raise an opaque TypeError when the attachment carries no inline data.
+    if (!attachment.data) {
+      throw new Error('Attachment has no inline audio data');
+    }
+    const audioBuffer = Buffer.from(attachment.data, 'base64');
+    const ext = mimeToExtension(attachment.mimeType);
+    const formData = new FormData();
+    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
+    formData.append('model', config.model ?? 'whisper-1');
+
+    // Content-Type is deliberately not set: fetch derives the multipart
+    // boundary from the FormData body.
+    const headers: Record<string, string> = {};
+    if (config.apiKey) {
+      headers['Authorization'] = `Bearer ${config.apiKey}`;
+    }
+
+    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
+    if (!res.ok) {
+      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
+    }
+
+    // Validate the payload rather than asserting its shape: a 2xx response
+    // without a string `text` would otherwise make this function return
+    // `undefined` while claiming to return a string.
+    const payload: unknown = await res.json();
+    const transcript =
+      typeof payload === 'object' && payload !== null && 'text' in payload
+        ? payload.text
+        : undefined;
+    if (typeof transcript !== 'string') {
+      throw new Error('Transcription response did not contain a "text" string');
+    }
+    return transcript;
+  } catch (error: unknown) {
+    console.error(
+      `Failed to transcribe audio (${attachment.mimeType}):`,
+      error instanceof Error ? error.message : 'Unknown error',
+    );
+    return '[Audio message transcription failed]';
+  }
+}
+
+/**
+ * Build a multimodal Message from text + attachments, with optional audio transcription.
+ *
+ * - Audio attachments are transcribed only when `audioConfig.endpoint` is set;
+ *   otherwise they are ignored. Each transcript becomes a
+ *   `[Voice message]: …` paragraph placed before `text`, in attachment order.
+ * - Supported image attachments are converted to image content parts.
+ * - Attachments that are neither supported audio nor supported images are ignored.
+ *
+ * @param text - The user's typed text (may be empty).
+ * @param attachments - Channel attachments accompanying the message.
+ * @param audioConfig - Transcription settings forwarded to transcribeAudio().
+ * @returns A user Message whose content is a plain string when there are no
+ *   image parts, or an array of parts (text first, then images) otherwise.
+ */
+export async function buildUserMessageWithAudio(
+  text: string,
+  attachments?: Attachment[],
+  audioConfig?: AudioTranscriptionConfig,
+): Promise<Message> {
+  const allAttachments = attachments ?? [];
+
+  // Transcribe audio attachments (only if an endpoint is configured).
+  // Requests are issued one at a time, and the transcripts are collected first
+  // and joined afterwards so that several voice messages keep their original
+  // order instead of being reversed by repeated prepending.
+  let processedText = text;
+  if (audioConfig?.endpoint) {
+    const voiceParagraphs: string[] = [];
+    for (const att of allAttachments) {
+      if (!isSupportedAudio(att)) {
+        continue;
+      }
+      const transcript = await transcribeAudio(att, audioConfig);
+      voiceParagraphs.push(`[Voice message]: ${transcript}\n\n`);
+    }
+    processedText = voiceParagraphs.join('') + text;
+  }
+
+  // Convert image attachments to content parts
+  const imageParts: MessageContentPart[] = [];
+  for (const att of allAttachments) {
+    if (!isSupportedImage(att)) {
+      continue;
+    }
+    const source = attachmentToImageSource(att);
+    if (source) {
+      imageParts.push({ type: 'image', source });
+    }
+  }
+
+  // No image parts — return a plain text message (any transcripts are already
+  // folded into processedText).
+  if (imageParts.length === 0) {
+    return { role: 'user', content: processedText };
+  }
+
+  // Build multimodal content: text first (omitted when empty), then images
+  const parts: MessageContentPart[] = [];
+  if (processedText) {
+    parts.push({ type: 'text', text: processedText });
+  }
+  parts.push(...imageParts);
+
+  return { role: 'user', content: parts };
+}
+
 /**
 * Check whether a message contains image content parts.
 */