diff --git a/src/models/anthropic.ts b/src/models/anthropic.ts
index 47a5356..a35a324 100644
--- a/src/models/anthropic.ts
+++ b/src/models/anthropic.ts
@@ -41,6 +41,13 @@ function toAnthropicContent(content: string | MessageContentPart[]): string | un
         },
       };
     }
+    // Audio — Anthropic doesn't support native audio input; use transcript fallback
+    if (part.type === 'audio') {
+      if (part.source.transcript) {
+        return { type: 'text', text: `[Voice message]: ${part.source.transcript}` };
+      }
+      return { type: 'text', text: '[Audio message received but no transcript available]' };
+    }
     return part;
   });
 }
diff --git a/src/models/bedrock.ts b/src/models/bedrock.ts
index a15825b..c2e0a26 100644
--- a/src/models/bedrock.ts
+++ b/src/models/bedrock.ts
@@ -170,17 +170,27 @@ function convertMessages(messages: Message[]): BedrockMessage[] {
       if (part.type === 'text') {
         return { text: part.text } as ContentBlock;
       }
-      // Image part — Bedrock uses { image: { format, source: { bytes } } }
-      if (part.source.type === 'base64' && part.source.data) {
-        return {
-          image: {
-            format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
-            source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
-          },
-        } as unknown as ContentBlock;
+      if (part.type === 'image') {
+        // Image part — Bedrock uses { image: { format, source: { bytes } } }
+        if (part.source.type === 'base64' && part.source.data) {
+          return {
+            image: {
+              format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
+              source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
+            },
+          } as unknown as ContentBlock;
+        }
+        // URL images not natively supported by Bedrock — fall back to text placeholder
+        return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
       }
-      // URL images not natively supported by Bedrock — fall back to text placeholder
-      return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
+      // Audio — Bedrock doesn't support native audio input; use transcript fallback
+      if (part.type === 'audio') {
+        if (part.source.transcript) {
+          return { text: `[Voice message]: ${part.source.transcript}` } as ContentBlock;
+        }
+        return { text: '[Audio message received but no transcript available]' } as ContentBlock;
+      }
+      return { text: JSON.stringify(part) } as ContentBlock;
     });
 
     return { role, content: blocks };
diff --git a/src/models/gemini.ts b/src/models/gemini.ts
index bc7b63e..e055a14 100644
--- a/src/models/gemini.ts
+++ b/src/models/gemini.ts
@@ -188,6 +188,15 @@ function convertMessages(messages: Message[]): Content[] {
         // so we pass as a text description. In production, you'd want to fetch + base64 encode.
         return { text: `[Image: ${part.source.url ?? 'unavailable'}]` };
       }
+      // Audio part — Gemini supports native audio via inlineData (same format as images)
+      if (part.type === 'audio') {
+        return {
+          inlineData: {
+            mimeType: part.source.media_type,
+            data: part.source.data,
+          },
+        };
+      }
       return { text: JSON.stringify(part) };
     });
 
diff --git a/src/models/github.ts b/src/models/github.ts
index 38ffeab..65d3d38 100644
--- a/src/models/github.ts
+++ b/src/models/github.ts
@@ -36,6 +36,23 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
         : part.source.url!;
       return { type: 'image_url', image_url: { url } };
     }
+    if (part.type === 'audio') {
+      // GitHub Models uses OpenAI-compatible API — native audio via input_audio
+      const formatMap: Record<string, string> = {
+        'audio/wav': 'wav',
+        'audio/mpeg': 'mp3',
+        'audio/mp3': 'mp3',
+        'audio/ogg': 'ogg',
+        'audio/webm': 'webm',
+        'audio/mp4': 'mp4',
+        'audio/x-m4a': 'mp4',
+      };
+      const format = formatMap[part.source.media_type] ?? 'wav';
+      return {
+        type: 'input_audio',
+        input_audio: { data: part.source.data, format },
+      } as unknown as OpenAI.ChatCompletionContentPart;
+    }
     // Fallback — shouldn't happen
     return { type: 'text', text: JSON.stringify(part) };
   });
diff --git a/src/models/index.ts b/src/models/index.ts
index 3dfac1a..5850a63 100644
--- a/src/models/index.ts
+++ b/src/models/index.ts
@@ -10,15 +10,20 @@ export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from '
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
 export {
   isSupportedImage,
+  isSupportedAudio,
   attachmentToImageSource,
+  attachmentToAudioSource,
   buildUserMessage,
   getMessageText,
   hasImages,
+  hasAudio,
+  stripAudioParts,
 } from './media.js';
 export type {
   Message,
   MessageContentPart,
   ImageSource,
+  AudioSource,
   ChatRequest,
   ChatResponse,
   ChatStreamEvent,
diff --git a/src/models/media.ts b/src/models/media.ts
index 888d079..64a97b9 100644
--- a/src/models/media.ts
+++ b/src/models/media.ts
@@ -3,7 +3,7 @@
  */
 
 import type { Attachment } from '../channels/types.js';
-import type { MessageContentPart, ImageSource, Message } from './types.js';
+import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';
 
 /** MIME types that vision models generally accept. */
 const SUPPORTED_IMAGE_TYPES = new Set([
@@ -73,34 +73,55 @@ export function attachmentToImageSource(attachment: Attachment): ImageSource | n
   return null;
 }
 
+/**
+ * Convert a channel Attachment to a model AudioSource.
+ * Returns null unless it is supported audio that carries data; only base64 data is supported.
+ */
+export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
+  // Reject unsupported types first, then attachments lacking a data payload.
+  if (!isSupportedAudio(attachment) || !attachment.data) {
+    return null;
+  }
+
+  return {
+    media_type: attachment.mimeType,
+    data: attachment.data,
+  };
+}
+
 /**
  * Build a multimodal Message from text + attachments.
- * If there are no image attachments, returns a plain text Message.
- * If there are image attachments, returns a Message with structured content parts.
+ * If no attachment converts to an image or audio source, returns a plain text Message.
+ * Otherwise returns structured parts: the text (when non-empty), then media in attachment order.
  */
 export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
-  const imageParts: MessageContentPart[] = [];
+  const mediaParts: MessageContentPart[] = [];
 
   if (attachments) {
     for (const att of attachments) {
-      const source = attachmentToImageSource(att);
-      if (source) {
-        imageParts.push({ type: 'image', source });
+      const imageSource = attachmentToImageSource(att);
+      if (imageSource) {
+        mediaParts.push({ type: 'image', source: imageSource });
+        continue;
+      }
+      const audioSource = attachmentToAudioSource(att);
+      if (audioSource) {
+        mediaParts.push({ type: 'audio', source: audioSource });
       }
     }
   }
 
-  // No images — return simple text message (preserves backward compat)
-  if (imageParts.length === 0) {
+  // No media — return simple text message (preserves backward compat)
+  if (mediaParts.length === 0) {
     return { role: 'user', content: text };
   }
 
-  // Build multimodal content: text first, then images
+  // Build multimodal content: text first, then media
   const parts: MessageContentPart[] = [];
   if (text) {
     parts.push({ type: 'text', text });
   }
-  parts.push(...imageParts);
+  parts.push(...mediaParts);
 
   return { role: 'user', content: parts };
 }
@@ -148,6 +169,13 @@ export function getMessageTextWithTools(message: Message): string {
       const content = (block.content as string) ?? '';
       const isError = block.is_error ? ' (error)' : '';
       parts.push(`[Tool result${isError}: ${content}]`);
+    } else if (block.type === 'audio') {
+      const source = block.source as Record<string, unknown>;
+      if (source?.transcript) {
+        parts.push(`[Voice message]: ${source.transcript}`);
+      } else {
+        parts.push('[Audio attachment]');
+      }
     }
   }
   return parts.join('\n');
@@ -298,3 +326,44 @@ export function hasImages(message: Message): boolean {
   }
   return message.content.some(p => p.type === 'image');
 }
+
+/**
+ * Report whether a message carries at least one audio content part.
+ * Plain-string content never does.
+ */
+export function hasAudio(message: Message): boolean {
+  const { content } = message;
+  // Only structured (array) content can hold audio parts.
+  return typeof content !== 'string' && content.some(part => part.type === 'audio');
+}
+
+/**
+ * Replace every audio part of a message with a text part holding its transcript.
+ * Intended for providers without native audio input (Anthropic, Bedrock, local).
+ * String content is returned untouched; a result that collapses to one text part
+ * is flattened back to plain string content.
+ */
+export function stripAudioParts(message: Message): Message {
+  const { content } = message;
+  if (typeof content === 'string') {
+    return message;
+  }
+
+  const replaced = content.map((part): MessageContentPart => {
+    if (part.type !== 'audio') {
+      return part;
+    }
+    // Fall back to a fixed placeholder when no transcript was attached.
+    const text = part.source.transcript
+      ? `[Voice message]: ${part.source.transcript}`
+      : '[Audio message received but no transcript available]';
+    return { type: 'text', text };
+  });
+
+  // A lone text part is equivalent to plain string content — simplify.
+  if (replaced.length === 1 && replaced[0].type === 'text') {
+    return { ...message, content: replaced[0].text };
+  }
+
+  return { ...message, content: replaced };
+}
diff --git a/src/models/openai.ts b/src/models/openai.ts
index c5f527e..92db24b 100644
--- a/src/models/openai.ts
+++ b/src/models/openai.ts
@@ -28,6 +28,24 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
         : part.source.url!;
       return { type: 'image_url', image_url: { url } };
     }
+    if (part.type === 'audio') {
+      // OpenAI native audio input via input_audio content part
+      // Map MIME type to format. NOTE(review): OpenAI documents input_audio.format as 'wav' | 'mp3' only — confirm ogg/webm/mp4 are accepted
+      const formatMap: Record<string, string> = {
+        'audio/wav': 'wav',
+        'audio/mpeg': 'mp3',
+        'audio/mp3': 'mp3',
+        'audio/ogg': 'ogg',
+        'audio/webm': 'webm',
+        'audio/mp4': 'mp4',
+        'audio/x-m4a': 'mp4',
+      };
+      const format = formatMap[part.source.media_type] ?? 'wav';
+      return {
+        type: 'input_audio',
+        input_audio: { data: part.source.data, format },
+      } as unknown as OpenAI.ChatCompletionContentPart;
+    }
     // Fallback — shouldn't happen
     return { type: 'text', text: JSON.stringify(part) };
   });
diff --git a/src/models/types.ts b/src/models/types.ts
index b7a6079..462e1a1 100644
--- a/src/models/types.ts
+++ b/src/models/types.ts
@@ -9,10 +9,21 @@ export interface ImageSource {
   url?: string;
 }
 
+/** Audio payload of an 'audio' content part: inline data plus an optional transcript. */
+export interface AudioSource {
+  /** MIME type of `data` (e.g. "audio/ogg", "audio/mpeg", "audio/wav", "audio/webm"). */
+  media_type: string;
+  /** Base64-encoded audio data. */
+  data: string;
+  /** Optional transcript (from Whisper) — sent as text by providers that lack native audio input. */
+  transcript?: string;
+}
+
 /** Individual content part within a multimodal message. */
 export type MessageContentPart =
   | { type: 'text'; text: string }
-  | { type: 'image'; source: ImageSource };
+  | { type: 'image'; source: ImageSource }
+  | { type: 'audio'; source: AudioSource };
 
 export interface Message {
   role: 'user' | 'assistant';
@@ -43,6 +54,7 @@ export interface ModelToolCall {
 export type ContentBlock =
   | { type: 'text'; text: string }
   | { type: 'image'; source: ImageSource }
+  | { type: 'audio'; source: AudioSource }
   | { type: 'tool_use'; id: string; name: string; input: unknown };
 
 // Tool result fed back into conversation