diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts
index 657d694..af3833c 100644
--- a/src/daemon/routing.ts
+++ b/src/daemon/routing.ts
@@ -1,6 +1,7 @@
 import type { AudioTranscriptionConfig } from '../models/media.js';
 import type { Attachment } from '../channels/types.js';
 import { isSupportedAudio, transcribeAudio } from '../models/media.js';
+import { supportsAudioInput } from '../models/capabilities.js';
 import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
 import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
 import type { InboundMessage, OutboundMessage } from '../channels/index.js';
@@ -32,7 +33,6 @@ export function createMessageRouter(deps: {
   agentConfigRegistry?: AgentConfigRegistry;
   agentRouter?: AgentRouter;
   sandboxManager?: SandboxManager;
-  audioConfig?: AudioTranscriptionConfig;
 }): {
   handler: (msg: InboundMessage, reply: (response: OutboundMessage) => Promise<void>) => Promise<void>;
   agents: Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>;
@@ -213,18 +213,55 @@ export function createMessageRouter(deps: {
     }
 
     try {
-      // Transcribe audio attachments before processing
-      let messageText = msg.text;
-      const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
-
-      if (audioAttachments.length > 0 && deps.audioConfig) {
-        for (const att of audioAttachments) {
-          const transcript = await transcribeAudio(att, deps.audioConfig);
-          messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
+      // Determine if the active model supports native audio input
+      let effectiveTier: string = deps.config.agents.primary_tier ?? 'default';
+      if (msg.metadata?.modelTier) {
+        effectiveTier = msg.metadata.modelTier as string;
+      } else if (deps.agentRouter && deps.agentConfigRegistry) {
+        const agentName = deps.agentRouter.resolve(msg.channel, msg.senderId);
+        if (agentName) {
+          const agentCfg = deps.agentConfigRegistry.get(agentName);
+          if (agentCfg?.modelTier) {
+            effectiveTier = agentCfg.modelTier;
+          }
         }
       }
 
-      const response = await agent.process(messageText, msg.attachments);
+      // Look up provider/model for the effective tier
+      const modelsConfig = deps.config.models as Record<string, { provider?: string; model?: string } | undefined>;
+      const tierConfig = modelsConfig[effectiveTier] ?? deps.config.models.default;
+      const modelProvider = tierConfig?.provider ?? deps.config.models.default.provider;
+      const modelName = tierConfig?.model ?? deps.config.models.default.model;
+      const nativeAudioSupported = supportsAudioInput(modelProvider, modelName);
+
+      let messageText = msg.text;
+      let attachments = msg.attachments;
+      const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
+
+      if (audioAttachments.length > 0 && !nativeAudioSupported) {
+        // Model doesn't support native audio — transcribe via Whisper and strip audio attachments
+        const audioConfig: AudioTranscriptionConfig | undefined = deps.config.audio?.enabled && deps.config.audio.provider
+          ? {
+            endpoint: deps.config.audio.provider.endpoint,
+            apiKey: deps.config.audio.provider.api_key,
+            model: deps.config.audio.provider.model,
+          }
+          : undefined;
+
+        if (audioConfig?.endpoint) {
+          for (const att of audioAttachments) {
+            const transcript = await transcribeAudio(att, audioConfig);
+            messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
+          }
+        }
+        // Remove audio attachments so buildUserMessage doesn't create audio content parts
+        attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
+        if (attachments.length === 0) { attachments = undefined; }
+      }
+      // If native audio IS supported, we pass attachments through unchanged —
+      // buildUserMessage() in the agent will create native audio content parts
+
+      const response = await agent.process(messageText, attachments);
       const outboundAttachments = collector.drain();
       await reply({
         text: response,
diff --git a/src/models/capabilities.ts b/src/models/capabilities.ts
new file mode 100644
index 0000000..5836b28
--- /dev/null
+++ b/src/models/capabilities.ts
@@ -0,0 +1,46 @@
+/**
+ * Model capability detection for native audio input support.
+ * 
+ * Models that support native audio will receive raw audio data directly.
+ * Models that don't will receive a Whisper transcript as text instead.
+ */
+/** Union of provider id literals. Not referenced elsewhere in this module: supportsAudioInput() takes a plain `string`. */
+export type ModelProvider = 'anthropic' | 'openai' | 'gemini' | 'bedrock' | 'github' | 'ollama' | 'llamacpp' | 'openrouter' | 'zhipuai' | 'xai';
+
+/** Provider ids treated as accepting native audio input; supportsAudioInput() returns false for any provider not listed. */
+const AUDIO_CAPABLE_PROVIDERS = new Set<string>([
+  'gemini',
+  'openai',
+  'github',  // GitHub Models uses OpenAI-compatible API — NOTE(review): confirm it accepts audio parts
+]);
+
+/**
+ * Deny-list consulted by supportsAudioInput(): model ids rejected even though their provider
+ * is audio-capable. NOTE(review): models it does not cover default to capable — confirm coverage.
+ */
+const AUDIO_INCAPABLE_MODELS = new Set<string>([
+  // Older OpenAI models that predate audio input support
+  'gpt-3.5-turbo',
+  'gpt-4',
+  'gpt-4-turbo',
+]);
+
+/**
+ * Check whether a provider+model combination supports native audio input.
+ * Returns true if raw audio can be sent to the model directly via its API, false
+ * if it must be transcribed to text first. A model is rejected when it equals an
+ * AUDIO_INCAPABLE_MODELS entry or extends one with a `-` suffix (e.g. `gpt-4-0613`).
+ */
+export function supportsAudioInput(provider: string, model: string): boolean {
+  // Provider-level gate: anything outside the capable set is never sent audio.
+  if (!AUDIO_CAPABLE_PROVIDERS.has(provider)) {
+    return false;
+  }
+
+  // Exact Set lookup missed suffixed ids of excluded models (dated snapshots,
+  // previews), so match each entry as a whole id or as a `-`-delimited prefix.
+  for (const excluded of AUDIO_INCAPABLE_MODELS) {
+    if (model === excluded || model.startsWith(`${excluded}-`)) return false;
+  }
+  return true;
+}
diff --git a/src/models/index.ts b/src/models/index.ts
index 5850a63..72574a0 100644
--- a/src/models/index.ts
+++ b/src/models/index.ts
@@ -8,6 +8,7 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
 export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
+export { supportsAudioInput } from './capabilities.js';
 export {
   isSupportedImage,
   isSupportedAudio,