feat(audio): add native audio support to type system and model clients
- Add AudioSource interface and 'audio' variant to MessageContentPart union
- Update buildUserMessage() to create audio content parts from attachments
- Add attachmentToAudioSource(), hasAudio(), stripAudioParts() helpers
- Gemini: native audio via inlineData (same format as images)
- OpenAI/GitHub: native audio via input_audio content parts
- Anthropic/Bedrock: graceful fallback to transcript text
- Update getMessageTextWithTools() to handle audio blocks for local models
This commit is contained in:
@@ -41,6 +41,13 @@ function toAnthropicContent(content: string | MessageContentPart[]): string | un
|
||||
},
|
||||
};
|
||||
}
|
||||
// Audio — Anthropic doesn't support native audio input; use transcript fallback
|
||||
if (part.type === 'audio') {
|
||||
if (part.source.transcript) {
|
||||
return { type: 'text', text: `[Voice message]: ${part.source.transcript}` };
|
||||
}
|
||||
return { type: 'text', text: '[Audio message received but no transcript available]' };
|
||||
}
|
||||
return part;
|
||||
});
|
||||
}
|
||||
|
||||
+20
-10
@@ -170,17 +170,27 @@ function convertMessages(messages: Message[]): BedrockMessage[] {
|
||||
if (part.type === 'text') {
|
||||
return { text: part.text } as ContentBlock;
|
||||
}
|
||||
// Image part — Bedrock uses { image: { format, source: { bytes } } }
|
||||
if (part.source.type === 'base64' && part.source.data) {
|
||||
return {
|
||||
image: {
|
||||
format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
|
||||
source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
|
||||
},
|
||||
} as unknown as ContentBlock;
|
||||
if (part.type === 'image') {
|
||||
// Image part — Bedrock uses { image: { format, source: { bytes } } }
|
||||
if (part.source.type === 'base64' && part.source.data) {
|
||||
return {
|
||||
image: {
|
||||
format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp',
|
||||
source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) },
|
||||
},
|
||||
} as unknown as ContentBlock;
|
||||
}
|
||||
// URL images not natively supported by Bedrock — fall back to text placeholder
|
||||
return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
|
||||
}
|
||||
// URL images not natively supported by Bedrock — fall back to text placeholder
|
||||
return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock;
|
||||
// Audio — Bedrock doesn't support native audio input; use transcript fallback
|
||||
if (part.type === 'audio') {
|
||||
if (part.source.transcript) {
|
||||
return { text: `[Voice message]: ${part.source.transcript}` } as ContentBlock;
|
||||
}
|
||||
return { text: '[Audio message received but no transcript available]' } as ContentBlock;
|
||||
}
|
||||
return { text: JSON.stringify(part) } as ContentBlock;
|
||||
});
|
||||
|
||||
return { role, content: blocks };
|
||||
|
||||
@@ -188,6 +188,15 @@ function convertMessages(messages: Message[]): Content[] {
|
||||
// so we pass as a text description. In production, you'd want to fetch + base64 encode.
|
||||
return { text: `[Image: ${part.source.url ?? 'unavailable'}]` };
|
||||
}
|
||||
// Audio part — Gemini supports native audio via inlineData (same format as images)
|
||||
if (part.type === 'audio') {
|
||||
return {
|
||||
inlineData: {
|
||||
mimeType: part.source.media_type,
|
||||
data: part.source.data,
|
||||
},
|
||||
};
|
||||
}
|
||||
return { text: JSON.stringify(part) };
|
||||
});
|
||||
|
||||
|
||||
@@ -36,6 +36,23 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
|
||||
: part.source.url!;
|
||||
return { type: 'image_url', image_url: { url } };
|
||||
}
|
||||
if (part.type === 'audio') {
|
||||
// GitHub Models uses OpenAI-compatible API — native audio via input_audio
|
||||
const formatMap: Record<string, string> = {
|
||||
'audio/wav': 'wav',
|
||||
'audio/mpeg': 'mp3',
|
||||
'audio/mp3': 'mp3',
|
||||
'audio/ogg': 'ogg',
|
||||
'audio/webm': 'webm',
|
||||
'audio/mp4': 'mp4',
|
||||
'audio/x-m4a': 'mp4',
|
||||
};
|
||||
const format = formatMap[part.source.media_type] ?? 'wav';
|
||||
return {
|
||||
type: 'input_audio',
|
||||
input_audio: { data: part.source.data, format },
|
||||
} as unknown as OpenAI.ChatCompletionContentPart;
|
||||
}
|
||||
// Fallback — shouldn't happen
|
||||
return { type: 'text', text: JSON.stringify(part) };
|
||||
});
|
||||
|
||||
@@ -10,15 +10,20 @@ export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from '
|
||||
export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
|
||||
export {
|
||||
isSupportedImage,
|
||||
isSupportedAudio,
|
||||
attachmentToImageSource,
|
||||
attachmentToAudioSource,
|
||||
buildUserMessage,
|
||||
getMessageText,
|
||||
hasImages,
|
||||
hasAudio,
|
||||
stripAudioParts,
|
||||
} from './media.js';
|
||||
export type {
|
||||
Message,
|
||||
MessageContentPart,
|
||||
ImageSource,
|
||||
AudioSource,
|
||||
ChatRequest,
|
||||
ChatResponse,
|
||||
ChatStreamEvent,
|
||||
|
||||
+80
-11
@@ -3,7 +3,7 @@
|
||||
*/
|
||||
|
||||
import type { Attachment } from '../channels/types.js';
|
||||
import type { MessageContentPart, ImageSource, Message } from './types.js';
|
||||
import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';
|
||||
|
||||
/** MIME types that vision models generally accept. */
|
||||
const SUPPORTED_IMAGE_TYPES = new Set([
|
||||
@@ -73,34 +73,55 @@ export function attachmentToImageSource(attachment: Attachment): ImageSource | n
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Convert a channel Attachment to a model AudioSource.
 *
 * Only inline base64 data is supported: an attachment without `data` cannot be
 * converted.
 *
 * @param attachment - Channel attachment to convert; its `mimeType` and `data`
 *   fields are copied into the result.
 * @returns An AudioSource, or `null` when `isSupportedAudio` rejects the
 *   attachment or the attachment carries no (or empty) `data`.
 */
export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
  // Both rejection paths yield the same result, so they share one guard.
  if (!isSupportedAudio(attachment) || !attachment.data) {
    return null;
  }

  return {
    media_type: attachment.mimeType,
    data: attachment.data,
  };
}
|
||||
|
||||
/**
 * Build a multimodal Message from text + attachments.
 *
 * If there are no image or audio attachments, returns a plain text Message.
 * If there are image/audio attachments, returns a Message with structured content parts.
 *
 * @param text - User text. When media parts exist and `text` is empty, no text
 *   part is emitted.
 * @param attachments - Optional channel attachments. Each one is offered to
 *   `attachmentToImageSource` first and, only if that yields nothing, to
 *   `attachmentToAudioSource`; attachments neither converter accepts are dropped.
 * @returns A `user` Message whose content is either the raw `text` string or
 *   an array of parts ordered text first, then media in attachment order.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const mediaParts: MessageContentPart[] = [];

  for (const att of attachments ?? []) {
    // Image conversion wins: an attachment accepted as an image is never also added as audio.
    const imageSource = attachmentToImageSource(att);
    if (imageSource) {
      mediaParts.push({ type: 'image', source: imageSource });
      continue;
    }

    const audioSource = attachmentToAudioSource(att);
    if (audioSource) {
      mediaParts.push({ type: 'audio', source: audioSource });
    }
  }

  // No media — return simple text message (preserves backward compat)
  if (mediaParts.length === 0) {
    return { role: 'user', content: text };
  }

  // Build multimodal content: text first, then media
  const parts: MessageContentPart[] = [];
  if (text) {
    parts.push({ type: 'text', text });
  }
  parts.push(...mediaParts);

  return { role: 'user', content: parts };
}
|
||||
@@ -148,6 +169,13 @@ export function getMessageTextWithTools(message: Message): string {
|
||||
const content = (block.content as string) ?? '';
|
||||
const isError = block.is_error ? ' (error)' : '';
|
||||
parts.push(`[Tool result${isError}: ${content}]`);
|
||||
} else if (block.type === 'audio') {
|
||||
const source = block.source as Record<string, unknown>;
|
||||
if (source?.transcript) {
|
||||
parts.push(`[Voice message]: ${source.transcript}`);
|
||||
} else {
|
||||
parts.push('[Audio attachment]');
|
||||
}
|
||||
}
|
||||
}
|
||||
return parts.join('\n');
|
||||
@@ -298,3 +326,44 @@ export function hasImages(message: Message): boolean {
|
||||
}
|
||||
return message.content.some(p => p.type === 'image');
|
||||
}
|
||||
|
||||
/**
 * Check whether a message contains audio content parts.
 *
 * @param message - Message to inspect.
 * @returns `true` when the content is a parts array holding at least one
 *   `audio` part; `false` for plain string content or when no part is audio.
 */
export function hasAudio(message: Message): boolean {
  const { content } = message;
  // String content has no structured parts, so it can never carry audio.
  return typeof content !== 'string' && content.some(part => part.type === 'audio');
}
|
||||
|
||||
/**
 * Strip audio parts from a message, replacing them with their transcripts as text.
 * Used for model providers that don't support native audio input (Anthropic, Bedrock, local).
 *
 * @param message - Message to convert. It is not mutated.
 * @returns The same `message` object when its content is already a string.
 *   Otherwise a shallow copy in which every `audio` part is replaced, in
 *   place, by a text part: the transcript prefixed with `[Voice message]: `,
 *   or a fixed placeholder when the transcript is missing or empty. If exactly
 *   one text part remains, the content is collapsed to that part's string.
 */
export function stripAudioParts(message: Message): Message {
  const { content } = message;
  if (typeof content === 'string') {
    return message;
  }

  const newParts = content.map((part): MessageContentPart => {
    if (part.type !== 'audio') {
      return part;
    }
    const { transcript } = part.source;
    return {
      type: 'text',
      text: transcript
        ? `[Voice message]: ${transcript}`
        : '[Audio message received but no transcript available]',
    };
  });

  // If all that's left is a single text part, simplify to string content
  const sole = newParts.length === 1 ? newParts[0] : undefined;
  if (sole?.type === 'text') {
    return { ...message, content: sole.text };
  }

  return { ...message, content: newParts };
}
|
||||
|
||||
@@ -28,6 +28,24 @@ function toOpenAIContent(content: string | MessageContentPart[]): string | OpenA
|
||||
: part.source.url!;
|
||||
return { type: 'image_url', image_url: { url } };
|
||||
}
|
||||
if (part.type === 'audio') {
|
||||
// OpenAI native audio input via input_audio content part
|
||||
// Determine format from MIME type (OpenAI supports: wav, mp3, flac, opus, ogg, webm)
|
||||
const formatMap: Record<string, string> = {
|
||||
'audio/wav': 'wav',
|
||||
'audio/mpeg': 'mp3',
|
||||
'audio/mp3': 'mp3',
|
||||
'audio/ogg': 'ogg',
|
||||
'audio/webm': 'webm',
|
||||
'audio/mp4': 'mp4',
|
||||
'audio/x-m4a': 'mp4',
|
||||
};
|
||||
const format = formatMap[part.source.media_type] ?? 'wav';
|
||||
return {
|
||||
type: 'input_audio',
|
||||
input_audio: { data: part.source.data, format },
|
||||
} as unknown as OpenAI.ChatCompletionContentPart;
|
||||
}
|
||||
// Fallback — shouldn't happen
|
||||
return { type: 'text', text: JSON.stringify(part) };
|
||||
});
|
||||
|
||||
+13
-1
@@ -9,10 +9,21 @@ export interface ImageSource {
|
||||
url?: string;
|
||||
}
|
||||
|
||||
/** Audio source for multimodal content blocks. */
export interface AudioSource {
  /** MIME type (e.g. "audio/ogg", "audio/mpeg", "audio/wav", "audio/webm"). */
  media_type: string;
  /** Base64-encoded audio data. */
  data: string;
  /** Optional transcript (from Whisper) — used when the model doesn't support native audio. */
  transcript?: string;
}
|
||||
|
||||
/**
 * Individual content part within a multimodal message.
 *
 * A discriminated union on `type`: `text` carries a string, `image` carries an
 * ImageSource, and `audio` carries an AudioSource.
 */
export type MessageContentPart =
  | { type: 'text'; text: string }
  | { type: 'image'; source: ImageSource }
  | { type: 'audio'; source: AudioSource };
|
||||
|
||||
export interface Message {
|
||||
role: 'user' | 'assistant';
|
||||
@@ -43,6 +54,7 @@ export interface ModelToolCall {
|
||||
/**
 * Structured content block, discriminated on `type`.
 *
 * Besides the `text`, `image` and `audio` variants, a `tool_use` block carries
 * the call's `id`, the tool `name`, and its `input` (typed `unknown`, so it
 * must be narrowed before use).
 */
export type ContentBlock =
  | { type: 'text'; text: string }
  | { type: 'image'; source: ImageSource }
  | { type: 'audio'; source: AudioSource }
  | { type: 'tool_use'; id: string; name: string; input: unknown };
|
||||
|
||||
// Tool result fed back into conversation
|
||||
|
||||
Reference in New Issue
Block a user