feat(audio): add native audio support to type system and model clients

- Add AudioSource interface and 'audio' variant to MessageContentPart union
- Update buildUserMessage() to create audio content parts from attachments
- Add attachmentToAudioSource(), hasAudio(), stripAudioParts() helpers
- Gemini: native audio via inlineData (same format as images)
- OpenAI/GitHub: native audio via input_audio content parts
- Anthropic/Bedrock: graceful fallback to transcript text
- Update getMessageTextWithTools() to handle audio blocks for local models
2026-02-11 18:17:33 -08:00
parent a875bcc4ae
commit 32e1a2724a
8 changed files with 169 additions and 22 deletions
@@ -3,7 +3,7 @@
 */

 import type { Attachment } from '../channels/types.js';
-import type { MessageContentPart, ImageSource, Message } from './types.js';
+import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';

 /** MIME types that vision models generally accept. */
 const SUPPORTED_IMAGE_TYPES = new Set([
@@ -73,34 +73,55 @@ export function attachmentToImageSource(attachment: Attachment): ImageSource | n
  return null;
 }

+/**
+ * Convert a channel Attachment to a model AudioSource. Only base64 data is supported.
+ *
+ * Returns null when `isSupportedAudio(attachment)` is false or when the
+ * attachment has no `data`; otherwise copies `mimeType` (as `media_type`) and
+ * `data` into the result.
+ *
+ * NOTE(review): no `transcript` is set on the returned source, although other
+ * helpers in this file read `source.transcript` from audio parts — confirm it
+ * is populated elsewhere.
+ *
+ * @param attachment - Channel attachment to convert.
+ * @returns The audio source, or null if the attachment cannot be used as audio.
+ */
+export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
+  if (!isSupportedAudio(attachment)) {
+    return null;
+  }
+
+  if (!attachment.data) { // nothing inline to forward
+    return null;
+  }
+
+  return {
+    media_type: attachment.mimeType,
+    data: attachment.data,
+  };
+}
+
 /**
 * Build a multimodal Message from text + attachments.
- * If there are no image attachments, returns a plain text Message.
- * If there are image attachments, returns a Message with structured content parts.
+ * If there are no image or audio attachments, returns a plain text Message.
+ * If there are image/audio attachments, returns a Message with structured content parts.
+ *
+ * Each attachment is tried as an image first and as audio only if that yields
+ * nothing; attachments that convert to neither are skipped. In the structured
+ * form the text part (omitted when `text` is empty) comes first, followed by
+ * the media parts in attachment order.
+ *
+ * @param text - User text for the message.
+ * @param attachments - Optional channel attachments to convert into media parts.
+ * @returns A user-role Message with string content or an array of content parts.
 */
 export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
-  const imageParts: MessageContentPart[] = [];
+  const mediaParts: MessageContentPart[] = [];

  if (attachments) {
    for (const att of attachments) {
-      const source = attachmentToImageSource(att);
-      if (source) {
-        imageParts.push({ type: 'image', source });
+      const imageSource = attachmentToImageSource(att);
+      if (imageSource) {
+        mediaParts.push({ type: 'image', source: imageSource });
+        continue; // handled as an image; skip the audio conversion
+      }
+      const audioSource = attachmentToAudioSource(att);
+      if (audioSource) {
+        mediaParts.push({ type: 'audio', source: audioSource });
      }
    }
  }

-  // No images — return simple text message (preserves backward compat)
-  if (imageParts.length === 0) {
+  // No media — return simple text message (preserves backward compat)
+  if (mediaParts.length === 0) {
    return { role: 'user', content: text };
  }

-  // Build multimodal content: text first, then images
+  // Build multimodal content: text first, then media
  const parts: MessageContentPart[] = [];
  if (text) { // an empty string contributes no text part
    parts.push({ type: 'text', text });
  }
-  parts.push(...imageParts);
+  parts.push(...mediaParts);

  return { role: 'user', content: parts };
 }
@@ -148,6 +169,13 @@ export function getMessageTextWithTools(message: Message): string {
      const content = (block.content as string) ?? '';
      const isError = block.is_error ? ' (error)' : '';
      parts.push(`[Tool result${isError}: ${content}]`);
+    } else if (block.type === 'audio') {
+      const source = block.source as Record<string, unknown>;
+      if (source?.transcript) {
+        parts.push(`[Voice message]: ${source.transcript}`);
+      } else {
+        parts.push('[Audio attachment]');
+      }
    }
  }
  return parts.join('\n');
@@ -298,3 +326,44 @@ export function hasImages(message: Message): boolean {
  }
  return message.content.some(p => p.type === 'image');
 }
+
+/**
+ * Check whether a message contains audio content parts.
+ *
+ * A message whose content is a plain string has no parts and yields false.
+ *
+ * @param message - Message to inspect.
+ * @returns True if at least one content part has type 'audio'.
+ */
+export function hasAudio(message: Message): boolean {
+  if (typeof message.content === 'string') {
+    return false;
+  }
+  return message.content.some(p => p.type === 'audio');
+}
+
+/**
+ * Strip audio parts from a message, replacing them with their transcripts as text.
+ * Used for model providers that don't support native audio input (Anthropic, Bedrock, local).
+ *
+ * A message with string content is returned as-is. Otherwise each audio part
+ * becomes a text part: `[Voice message]: <transcript>` when the source has a
+ * truthy `transcript`, or a fixed placeholder when it does not. Non-audio
+ * parts are kept in their original order. The input message is not mutated;
+ * a shallow copy with the new content is returned.
+ *
+ * @param message - Message that may contain audio parts.
+ * @returns The message with audio parts replaced by text; content is collapsed
+ *   to a plain string when exactly one text part remains.
+ */
+export function stripAudioParts(message: Message): Message {
+  if (typeof message.content === 'string') {
+    return message;
+  }
+
+  const newParts: MessageContentPart[] = [];
+  for (const part of message.content) {
+    if (part.type === 'audio') {
+      // Replace audio with transcript text if available
+      if (part.source.transcript) {
+        newParts.push({ type: 'text', text: `[Voice message]: ${part.source.transcript}` });
+      } else {
+        newParts.push({ type: 'text', text: '[Audio message received but no transcript available]' });
+      }
+    } else {
+      newParts.push(part);
+    }
+  }
+
+  // If all that's left is a single text part, simplify to string content
+  if (newParts.length === 1 && newParts[0].type === 'text') {
+    return { ...message, content: newParts[0].text };
+  }
+
+  return { ...message, content: newParts };
+}