diff --git a/src/models/anthropic.ts b/src/models/anthropic.ts
index 47a5356..a35a324 100644
--- a/src/models/anthropic.ts
+++ b/src/models/anthropic.ts
@@ -41,6 +41,13 @@ function toAnthropicContent(content: string | MessageContentPart[]): string | un
         },
       };
     }
+    // Audio — Anthropic doesn't support native audio input; use transcript fallback
+    if (part.type === 'audio') {
+      if (part.source.transcript) {
+        return { type: 'text', text: `[Voice message]: ${part.source.transcript}` };
+      }
+      return { type: 'text', text: '[Audio message received but no transcript available]' };
+    }
     return part;
   });
 }
diff --git a/src/models/bedrock.ts b/src/models/bedrock.ts
index a15825b..c2e0a26 100644
--- a/src/models/bedrock.ts
+++ b/src/models/bedrock.ts
@@ -170,17 +170,27 @@ function convertMessages(messages: Message[]): BedrockMessage[] {
       if (part.type === 'text') {
         return { text: part.text } as ContentBlock;
       }
-      // Image part — Bedrock uses { image: { format, source: { bytes } } }
-      if (part.source.type === 'base64' && part.source.data) {
-        return {
-          image: {
-            format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
-            source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
-          },
-        } as unknown as ContentBlock;
+      if (part.type === 'image') {
+        // Image part — Bedrock uses { image: { format, source: { bytes } } }
+        if (part.source.type === 'base64' && part.source.data) {
+          return {
+            image: {
+              format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
+              source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
+            },
+          } as unknown as ContentBlock;
+        }
+        // URL images not natively supported by Bedrock — fall back to text placeholder
+        return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
       }
-      // URL images not natively supported by Bedrock — fall back to text placeholder
-      return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
+      // Audio — Bedrock doesn't support native audio input; use transcript fallback
+      if (part.type === 'audio') {
+        if (part.source.transcript) {
+          return { text: `[Voice message]: ${part.source.transcript}` } as ContentBlock;
+        }
+        return { text: '[Audio message received but no transcript available]' } as ContentBlock;
+      }
+      return { text: JSON.stringify(part) } as ContentBlock;
     });
 
     return { role, content: blocks };
diff --git a/src/models/gemini.ts b/src/models/gemini.ts
index bc7b63e..e055a14 100644
--- a/src/models/gemini.ts
+++ b/src/models/gemini.ts
@@ -188,6 +188,15 @@ function convertMessages(messages: Message[]): Content[] {
         // so we pass as a text description. In production, you'd want to fetch + base64 encode.
         return { text: `[Image: ${part.source.url ?? 'unavailable'}]` };
       }
+      // Audio part — Gemini supports native audio via inlineData (same format as images)
+      if (part.type === 'audio') {
+        return {
+          inlineData: {
+            mimeType: part.source.media_type,
+            data: part.source.data,
+          },
+        };
+      }
       return { text: JSON.stringify(part) };
     });
 
diff --git a/src/models/github.ts b/src/models/github.ts
index 38ffeab..65d3d38 100644
--- a/src/models/github.ts
+++ b/src/models/github.ts
@@ -36,6 +36,27 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
         : part.source.url!;
       return { type: 'image_url', image_url: { url } };
     }
+    if (part.type === 'audio') {
+      // GitHub Models uses OpenAI-compatible API — native audio via input_audio.
+      // input_audio.format accepts only 'wav' and 'mp3' (NOTE(review): confirm for every
+      // GitHub-hosted model); other containers fall back to the transcript text.
+      const formatMap: Record<string, 'wav' | 'mp3'> = {
+        'audio/wav': 'wav',
+        'audio/mpeg': 'mp3',
+        'audio/mp3': 'mp3',
+      };
+      const format = formatMap[part.source.media_type];
+      if (!format) {
+        const text = part.source.transcript
+          ? `[Voice message]: ${part.source.transcript}`
+          : '[Audio message received but no transcript available]';
+        return { type: 'text', text };
+      }
+      return {
+        type: 'input_audio',
+        input_audio: { data: part.source.data, format },
+      } as unknown as OpenAI.ChatCompletionContentPart;
+    }
     // Fallback — shouldn't happen
     return { type: 'text', text: JSON.stringify(part) };
   });
diff --git a/src/models/index.ts b/src/models/index.ts
index 3dfac1a..5850a63 100644
--- a/src/models/index.ts
+++ b/src/models/index.ts
@@ -10,15 +10,20 @@ export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from '
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
 export {
   isSupportedImage,
+  isSupportedAudio,
   attachmentToImageSource,
+  attachmentToAudioSource,
   buildUserMessage,
   getMessageText,
   hasImages,
+  hasAudio,
+  stripAudioParts,
 } from './media.js';
 export type {
   Message,
   MessageContentPart,
   ImageSource,
+  AudioSource,
   ChatRequest,
   ChatResponse,
   ChatStreamEvent,
diff --git a/src/models/media.ts b/src/models/media.ts
index 888d079..64a97b9 100644
--- a/src/models/media.ts
+++ b/src/models/media.ts
@@ -3,7 +3,7 @@
  */
 
 import type { Attachment } from '../channels/types.js';
-import type { MessageContentPart, ImageSource, Message } from './types.js';
+import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';
 
 /** MIME types that vision models generally accept. */
 const SUPPORTED_IMAGE_TYPES = new Set([
@@ -73,34 +73,55 @@ export function attachmentToImageSource(attachment: Attachment): ImageSource | n
   return null;
 }
 
+/** Convert a channel Attachment to a model AudioSource. Only base64 data is supported. */
+export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
+  if (!isSupportedAudio(attachment)) {
+    return null;
+  }
+
+  if (!attachment.data) {
+    return null;
+  }
+
+  return {
+    media_type: attachment.mimeType,
+    data: attachment.data,
+  };
+}
+
 /**
  * Build a multimodal Message from text + attachments.
- * If there are no image attachments, returns a plain text Message.
- * If there are image attachments, returns a Message with structured content parts.
+ * If there are no image or audio attachments, returns a plain text Message.
+ * If there are image/audio attachments, returns a Message with structured content parts.
  */
 export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
-  const imageParts: MessageContentPart[] = [];
+  const mediaParts: MessageContentPart[] = [];
 
   if (attachments) {
     for (const att of attachments) {
-      const source = attachmentToImageSource(att);
-      if (source) {
-        imageParts.push({ type: 'image', source });
+      const imageSource = attachmentToImageSource(att);
+      if (imageSource) {
+        mediaParts.push({ type: 'image', source: imageSource });
+        continue;
+      }
+      const audioSource = attachmentToAudioSource(att);
+      if (audioSource) {
+        mediaParts.push({ type: 'audio', source: audioSource });
       }
     }
   }
 
-  // No images — return simple text message (preserves backward compat)
-  if (imageParts.length === 0) {
+  // No media — return simple text message (preserves backward compat)
+  if (mediaParts.length === 0) {
     return { role: 'user', content: text };
   }
 
-  // Build multimodal content: text first, then images
+  // Build multimodal content: text first, then media
   const parts: MessageContentPart[] = [];
   if (text) {
     parts.push({ type: 'text', text });
   }
-  parts.push(...imageParts);
+  parts.push(...mediaParts);
 
   return { role: 'user', content: parts };
 }
@@ -148,6 +169,13 @@ export function getMessageTextWithTools(message: Message): string {
       const content = (block.content as string) ?? '';
       const isError = block.is_error ? ' (error)' : '';
       parts.push(`[Tool result${isError}: ${content}]`);
+    } else if (block.type === 'audio') {
+      const source = block.source as Record<string, unknown>;
+      if (source?.transcript) {
+        parts.push(`[Voice message]: ${source.transcript}`);
+      } else {
+        parts.push('[Audio attachment]');
+      }
     }
   }
   return parts.join('\n');
@@ -298,3 +326,44 @@ export function hasImages(message: Message): boolean {
   }
   return message.content.some(p => p.type === 'image');
 }
+
+/**
+ * Check whether a message contains audio content parts.
+ */
+export function hasAudio(message: Message): boolean {
+  if (typeof message.content === 'string') {
+    return false;
+  }
+  return message.content.some(p => p.type === 'audio');
+}
+
+/**
+ * Strip audio parts from a message, replacing them with their transcripts as text.
+ * Used for model providers that don't support native audio input (Anthropic, Bedrock, local).
+ */
+export function stripAudioParts(message: Message): Message {
+  if (typeof message.content === 'string') {
+    return message;
+  }
+
+  const newParts: MessageContentPart[] = [];
+  for (const part of message.content) {
+    if (part.type === 'audio') {
+      // Replace audio with transcript text if available
+      if (part.source.transcript) {
+        newParts.push({ type: 'text', text: `[Voice message]: ${part.source.transcript}` });
+      } else {
+        newParts.push({ type: 'text', text: '[Audio message received but no transcript available]' });
+      }
+    } else {
+      newParts.push(part);
+    }
+  }
+
+  // If all that's left is a single text part, simplify to string content
+  if (newParts.length === 1 && newParts[0].type === 'text') {
+    return { ...message, content: newParts[0].text };
+  }
+
+  return { ...message, content: newParts };
+}
diff --git a/src/models/openai.ts b/src/models/openai.ts
index c5f527e..92db24b 100644
--- a/src/models/openai.ts
+++ b/src/models/openai.ts
@@ -28,6 +28,27 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
         : part.source.url!;
       return { type: 'image_url', image_url: { url } };
     }
+    if (part.type === 'audio') {
+      // OpenAI native audio input via input_audio content part.
+      // Chat Completions accepts only 'wav' and 'mp3' for input_audio.format, so any other
+      // container falls back to the transcript text instead of being sent mislabelled.
+      const formatMap: Record<string, 'wav' | 'mp3'> = {
+        'audio/wav': 'wav',
+        'audio/mpeg': 'mp3',
+        'audio/mp3': 'mp3',
+      };
+      const format = formatMap[part.source.media_type];
+      if (!format) {
+        const text = part.source.transcript
+          ? `[Voice message]: ${part.source.transcript}`
+          : '[Audio message received but no transcript available]';
+        return { type: 'text', text };
+      }
+      return {
+        type: 'input_audio',
+        input_audio: { data: part.source.data, format },
+      } as unknown as OpenAI.ChatCompletionContentPart;
+    }
     // Fallback — shouldn't happen
     return { type: 'text', text: JSON.stringify(part) };
   });
diff --git a/src/models/types.ts b/src/models/types.ts
index b7a6079..462e1a1 100644
--- a/src/models/types.ts
+++ b/src/models/types.ts
@@ -9,10 +9,21 @@ export interface ImageSource {
   url?: string;
 }
 
+/** Audio source for multimodal content blocks. */
+export interface AudioSource {
+  /** MIME type (e.g. "audio/ogg", "audio/mpeg", "audio/wav", "audio/webm"). */
+  media_type: string;
+  /** Base64-encoded audio data. */
+  data: string;
+  /** Optional transcript (from Whisper) — used when the model doesn't support native audio. */
+  transcript?: string;
+}
+
 /** Individual content part within a multimodal message. */
 export type MessageContentPart =
   | { type: 'text'; text: string }
-  | { type: 'image'; source: ImageSource };
+  | { type: 'image'; source: ImageSource }
+  | { type: 'audio'; source: AudioSource };
 
 export interface Message {
   role: 'user' | 'assistant';
@@ -43,6 +54,7 @@ export interface ModelToolCall {
 export type ContentBlock =
   | { type: 'text'; text: string }
   | { type: 'image'; source: ImageSource }
+  | { type: 'audio'; source: AudioSource }
   | { type: 'tool_use'; id: string; name: string; input: unknown };
 
 // Tool result fed back into conversation