/**
 * Media utilities for converting channel attachments to model content parts.
 */

import type { Attachment } from '../channels/types.js';
import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';

/** MIME types that vision models generally accept. */
const SUPPORTED_IMAGE_TYPES = new Set([
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
]);

/** MIME types that are audio (not image). */
const SUPPORTED_AUDIO_TYPES = new Set([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);

/**
 * File extension for each known audio MIME type.
 * A Map (rather than an object literal) so that inherited keys such as
 * "constructor" or "toString" can never be returned as an extension.
 */
const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/** Narrow an unknown value to a non-null object that can be probed by key. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/**
 * JSON-serialize a value for display, falling back to String() when the value
 * cannot be serialized (circular structure) or serializes to `undefined`.
 */
function safeJsonStringify(value: unknown): string {
  try {
    return JSON.stringify(value) ?? String(value);
  } catch {
    return String(value);
  }
}

/**
 * Flatten the `content` of a tool_result block to plain text.
 * Strings pass through unchanged; null/undefined become ''. For arrays, text
 * blocks contribute their text and other typed blocks become a short
 * `[type]` placeholder so large payloads are not dumped into the prompt.
 */
function toolResultContentToText(content: unknown): string {
  if (content === null || content === undefined) {
    return '';
  }
  if (typeof content === 'string') {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .map((item: unknown) => {
        if (typeof item === 'string') {
          return item;
        }
        if (isRecord(item) && typeof item.type === 'string') {
          return item.type === 'text' && typeof item.text === 'string'
            ? item.text
            : `[${item.type}]`;
        }
        return safeJsonStringify(item);
      })
      .join('\n');
  }
  return safeJsonStringify(content);
}

/** Check whether an attachment is a supported image type. */
export function isSupportedImage(attachment: Attachment): boolean {
  return SUPPORTED_IMAGE_TYPES.has(attachment.mimeType);
}

/** Check whether an attachment is a supported audio type. */
export function isSupportedAudio(attachment: Attachment): boolean {
  return SUPPORTED_AUDIO_TYPES.has(attachment.mimeType);
}

/**
 * Convert a MIME type to a file extension.
 *
 * @param mime - MIME type to look up (exact match; only audio types are mapped).
 * @returns The mapped extension without a leading dot, or 'bin' when unknown.
 */
export function mimeToExtension(mime: string): string {
  return AUDIO_EXTENSION_BY_MIME.get(mime) ?? 'bin';
}

/**
 * Convert a channel Attachment to a model ImageSource.
 * Prefers base64 data, falls back to URL.
 *
 * @returns The image source, or null when the MIME type is not a supported
 *   image type or the attachment carries neither data nor a URL.
 */
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
  if (!isSupportedImage(attachment)) {
    return null;
  }

  if (attachment.data) {
    return {
      type: 'base64',
      media_type: attachment.mimeType,
      data: attachment.data,
    };
  }

  if (attachment.url) {
    return {
      type: 'url',
      media_type: attachment.mimeType,
      url: attachment.url,
    };
  }

  return null;
}

/**
 * Convert a channel Attachment to a model AudioSource.
 * Only base64 data is supported.
 *
 * @returns The audio source, or null when the MIME type is not a supported
 *   audio type or the attachment has no inline data.
 */
export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
  if (!isSupportedAudio(attachment)) {
    return null;
  }

  if (!attachment.data) {
    return null;
  }

  return {
    media_type: attachment.mimeType,
    data: attachment.data,
  };
}

/**
 * Build a multimodal Message from text + attachments.
 * If there are no image or audio attachments, returns a plain text Message.
 * If there are image/audio attachments, returns a Message with structured content parts.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const mediaParts: MessageContentPart[] = [];

  for (const att of attachments ?? []) {
    const imageSource = attachmentToImageSource(att);
    if (imageSource) {
      mediaParts.push({ type: 'image', source: imageSource });
      continue;
    }

    const audioSource = attachmentToAudioSource(att);
    if (audioSource) {
      mediaParts.push({ type: 'audio', source: audioSource });
    }
  }

  // No media — return simple text message (preserves backward compat)
  if (mediaParts.length === 0) {
    return { role: 'user', content: text };
  }

  // Build multimodal content: text first, then media
  const parts: MessageContentPart[] = [];
  if (text) {
    parts.push({ type: 'text', text });
  }
  parts.push(...mediaParts);

  return { role: 'user', content: parts };
}

/**
 * Extract the text content from a Message regardless of content format.
 * For string content, returns the string directly.
 * For array content, concatenates all text parts (no separator).
 */
export function getMessageText(message: Message): string {
  if (typeof message.content === 'string') {
    return message.content;
  }

  return message.content
    .filter((p): p is { type: 'text'; text: string } => p.type === 'text')
    .map(p => p.text)
    .join('');
}

/**
 * Serialize a message's content to a plain string, including tool_use and
 * tool_result structured blocks that getMessageText() would discard.
 * This is needed for local model backends (llama.cpp, Ollama) whose chat
 * templates don't understand Anthropic-style structured content blocks.
 *
 * Blocks are joined with a single newline. Block types other than text,
 * tool_use, tool_result and audio (e.g. image) are skipped.
 */
export function getMessageTextWithTools(message: Message): string {
  if (typeof message.content === 'string') {
    return message.content;
  }

  const parts: string[] = [];

  for (const block of message.content as Record<string, unknown>[]) {
    if (block.type === 'text' && typeof block.text === 'string') {
      parts.push(block.text);
    } else if (block.type === 'tool_use') {
      const name = block.name as string;
      parts.push(`[Calling tool: ${name}(${safeJsonStringify(block.input)})]`);
    } else if (block.type === 'tool_result') {
      // Content may be a plain string or an array of content blocks.
      const content = toolResultContentToText(block.content);
      const isError = block.is_error ? ' (error)' : '';
      parts.push(`[Tool result${isError}: ${content}]`);
    } else if (block.type === 'audio') {
      const source = block.source as Record<string, unknown> | undefined;
      if (source?.transcript) {
        parts.push(`[Voice message]: ${source.transcript}`);
      } else {
        parts.push('[Audio attachment]');
      }
    }
  }

  return parts.join('\n');
}

interface SimpleMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

/**
 * Normalize a message array for local model backends that require strict
 * role alternation (system? -> user -> assistant -> user -> ...).
 *
 * 1. Serializes structured tool_use/tool_result content blocks to text
 * 2. Drops empty messages
 * 3. Merges consecutive same-role messages, separated by a blank line
 *
 * @param system - Optional system prompt; emitted first when non-empty.
 * @param messages - Conversation messages in order.
 * @returns A new array of string-content messages; the inputs are not mutated.
 */
export function normalizeMessagesForLocal(
  system: string | undefined,
  messages: Message[],
): SimpleMessage[] {
  const result: SimpleMessage[] = [];

  if (system) {
    result.push({ role: 'system', content: system });
  }

  for (const msg of messages) {
    const text = getMessageTextWithTools(msg);
    if (!text) {
      continue; // drop empty messages
    }

    const last = result.length > 0 ? result[result.length - 1] : undefined;
    if (last && last.role === msg.role) {
      // Merge consecutive same-role messages
      last.content += '\n\n' + text;
    } else {
      result.push({ role: msg.role, content: text });
    }
  }

  return result;
}

/** Configuration for audio transcription via Whisper-compatible API. */
export interface AudioTranscriptionConfig {
  /** Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions") */
  endpoint?: string;
  /** API key for the transcription service */
  apiKey?: string;
  /** Model name (default: "whisper-1") */
  model?: string;
}

/**
 * Transcribe an audio attachment to text using the OpenAI-compatible /v1/audio/transcriptions API.
 * Falls back to a placeholder message if no transcription endpoint is configured.
 *
 * Never rejects: a missing payload, a non-2xx response, a response without a
 * string `text` field, or a network error all resolve to
 * '[Audio message transcription failed]' (request failures are also logged).
 *
 * @param attachment - Audio attachment; `data` is decoded as base64.
 * @param config - Transcription endpoint settings.
 * @returns The transcript, or a bracketed placeholder string.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }

  if (!attachment.data) {
    return '[Audio message transcription failed]';
  }

  try {
    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);

    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // Validate the payload instead of trusting its shape: returning `undefined`
    // here would surface downstream as the literal text "undefined".
    const json: unknown = await res.json();
    const transcript = isRecord(json) ? json.text : undefined;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a text field');
    }
    return transcript;
  } catch (error) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : 'Unknown error',
    );
    return '[Audio message transcription failed]';
  }
}

/**
 * Build a multimodal Message from text + attachments, with optional audio transcription.
 * Audio attachments are transcribed to text and prepended to the message, in
 * attachment order. Image attachments are converted to content parts as before.
 *
 * When `audioConfig.endpoint` is not set, audio attachments are ignored.
 *
 * @returns A plain-text Message when there are no usable images, otherwise a
 *   Message whose content is the (non-empty) text part followed by image parts.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const all = attachments ?? [];

  // Transcribe audio attachments and prepend to text (only if config is provided)
  let processedText = text;
  if (audioConfig?.endpoint) {
    const transcripts: string[] = [];
    for (const audioAttachment of all.filter(a => isSupportedAudio(a))) {
      transcripts.push(await transcribeAudio(audioAttachment, audioConfig));
    }
    // Build the prefix in one pass so the first voice message stays first.
    const prefix = transcripts.map(t => `[Voice message]: ${t}\n\n`).join('');
    processedText = prefix + text;
  }

  // Convert image attachments to content parts
  // (attachmentToImageSource returns null for anything that is not a usable image).
  const imageParts: MessageContentPart[] = [];
  for (const att of all) {
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No images — return simple text message
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first, then images
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}

/**
 * Check whether a message contains image content parts.
 */
export function hasImages(message: Message): boolean {
  if (typeof message.content === 'string') {
    return false;
  }
  return message.content.some(p => p.type === 'image');
}

/**
 * Check whether a message contains audio content parts.
 */
export function hasAudio(message: Message): boolean {
  if (typeof message.content === 'string') {
    return false;
  }
  return message.content.some(p => p.type === 'audio');
}

/**
 * Strip audio parts from a message, replacing them with their transcripts as text.
 * Used for model providers that don't support native audio input (Anthropic, Bedrock, local).
 *
 * @returns The same message object when content is already a string; otherwise
 *   a shallow copy whose content is a string (if exactly one text part
 *   remains) or the rewritten parts array.
 */
export function stripAudioParts(message: Message): Message {
  if (typeof message.content === 'string') {
    return message;
  }

  const newParts: MessageContentPart[] = [];

  for (const part of message.content) {
    if (part.type === 'audio') {
      // Replace audio with transcript text if available
      if (part.source.transcript) {
        newParts.push({ type: 'text', text: `[Voice message]: ${part.source.transcript}` });
      } else {
        newParts.push({ type: 'text', text: '[Audio message received but no transcript available]' });
      }
    } else {
      newParts.push(part);
    }
  }

  // If all that's left is a single text part, simplify to string content
  const [only] = newParts;
  if (newParts.length === 1 && only !== undefined && only.type === 'text') {
    return { ...message, content: only.text };
  }

  return { ...message, content: newParts };
}