373 lines
11 KiB
TypeScript
373 lines
11 KiB
TypeScript
/**
|
|
* Media utilities for converting channel attachments to model content parts.
|
|
*/
|
|
|
|
import type { Attachment } from '../channels/types.js';
|
|
import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';
|
|
|
|
/**
 * Image MIME types accepted as model image input.
 * Attachments whose type is not listed here are not converted to image parts.
 */
const SUPPORTED_IMAGE_TYPES = new Set<string>(['image/jpeg', 'image/png', 'image/gif', 'image/webp']);
|
|
|
|
/**
 * Audio MIME types recognised as voice/audio attachments.
 * Both "audio/mpeg" and the non-standard "audio/mp3" spelling are listed,
 * as are the two common M4A spellings.
 */
const SUPPORTED_AUDIO_TYPES = new Set<string>([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);
|
|
|
|
/**
 * Tell whether an attachment's MIME type is one of the image formats listed
 * in SUPPORTED_IMAGE_TYPES.
 *
 * @param attachment - Channel attachment to inspect; only `mimeType` is read.
 * @returns `true` when the MIME type is in the supported image set.
 */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}
|
|
|
|
/**
 * Tell whether an attachment's MIME type is one of the audio formats listed
 * in SUPPORTED_AUDIO_TYPES.
 *
 * @param attachment - Channel attachment to inspect; only `mimeType` is read.
 * @returns `true` when the MIME type is in the supported audio set.
 */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}
|
|
|
|
/**
 * Audio MIME type -> file extension lookup.
 *
 * Held in a Map rather than an object literal so that arbitrary input such as
 * "constructor" or "__proto__" can never resolve to an inherited
 * Object.prototype member.
 */
const AUDIO_MIME_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert an audio MIME type to a file extension (without the leading dot).
 *
 * Matching ignores letter case, surrounding whitespace and any parameters
 * after ";" (so "audio/ogg; codecs=opus" maps like "audio/ogg"), because MIME
 * types are case-insensitive and may carry parameters.
 *
 * @param mime - MIME type string, e.g. "audio/ogg".
 * @returns The mapped extension, or "bin" when the type is not in the table.
 */
export function mimeToExtension(mime: string): string {
  // Keep only the "type/subtype" portion; parameters never affect the extension.
  const paramStart = mime.indexOf(';');
  const essence = (paramStart === -1 ? mime : mime.slice(0, paramStart)).trim().toLowerCase();
  return AUDIO_MIME_EXTENSIONS.get(essence) ?? 'bin';
}
|
|
|
|
/**
 * Turn a channel attachment into a model ImageSource.
 *
 * Inline base64 `data` wins over `url` when both are present.
 *
 * @param attachment - Channel attachment to convert.
 * @returns A base64 or URL image source, or `null` when the attachment is not
 *   a supported image or carries neither `data` nor `url`.
 */
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
  if (isSupportedImage(attachment)) {
    const { mimeType, data, url } = attachment;
    if (data) {
      return { type: 'base64', media_type: mimeType, data };
    }
    if (url) {
      return { type: 'url', media_type: mimeType, url };
    }
  }
  return null;
}
|
|
|
|
/**
 * Turn a channel attachment into a model AudioSource.
 *
 * Only inline base64 `data` is accepted; a URL-only audio attachment yields `null`.
 *
 * @param attachment - Channel attachment to convert.
 * @returns The audio source, or `null` when the attachment is not a supported
 *   audio type or has no `data`.
 */
export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
  if (isSupportedAudio(attachment)) {
    const { mimeType, data } = attachment;
    if (data) {
      return { media_type: mimeType, data };
    }
  }
  return null;
}
|
|
|
|
/**
 * Assemble a user Message from text plus optional attachments.
 *
 * Each attachment is tried as an image first and as audio second; attachments
 * that convert to neither are ignored. When nothing converts, the result is a
 * plain string-content message. Otherwise the content is an array holding the
 * text part (omitted when `text` is empty) followed by the media parts in
 * attachment order.
 *
 * @param text - Message text.
 * @param attachments - Optional channel attachments.
 * @returns A user-role Message.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const media: MessageContentPart[] = [];

  for (const attachment of attachments ?? []) {
    const image = attachmentToImageSource(attachment);
    if (image) {
      media.push({ type: 'image', source: image });
    } else {
      const audio = attachmentToAudioSource(attachment);
      if (audio) {
        media.push({ type: 'audio', source: audio });
      }
    }
  }

  // Without media, keep the simple string form for backward compatibility.
  if (media.length === 0) {
    return { role: 'user', content: text };
  }

  // Text goes first, media after it.
  const leading: MessageContentPart[] = text ? [{ type: 'text', text }] : [];
  return { role: 'user', content: [...leading, ...media] };
}
|
|
|
|
/**
 * Return the textual content of a Message whatever its content format.
 *
 * String content is returned as-is. For array content, the `text` of every
 * text part is concatenated with no separator; other part types are skipped.
 *
 * @param message - Message to read.
 * @returns The message text (empty string when there are no text parts).
 */
export function getMessageText(message: Message): string {
  const { content } = message;
  if (typeof content === 'string') {
    return content;
  }

  const pieces: string[] = [];
  for (const part of content) {
    if (part.type === 'text') {
      pieces.push(part.text);
    }
  }
  return pieces.join('');
}
|
|
|
|
/** JSON-encode a value, falling back to String() when JSON cannot represent it. */
function stringifyForPrompt(value: unknown): string {
  try {
    // JSON.stringify returns undefined for undefined/functions/symbols.
    return JSON.stringify(value) ?? String(value);
  } catch {
    // e.g. circular structures or BigInt values
    return String(value);
  }
}

/**
 * Render the `content` of a tool_result block as plain text.
 *
 * Strings pass through. Arrays of content blocks are flattened: text blocks
 * contribute their text, image blocks become "[Image]" (so no raw image data
 * is dumped into the prompt), anything else is JSON-encoded. Items are joined
 * with newlines.
 */
function toolResultContentToText(content: unknown): string {
  if (content === undefined || content === null) {
    return '';
  }
  if (typeof content === 'string') {
    return content;
  }
  if (!Array.isArray(content)) {
    return stringifyForPrompt(content);
  }
  return content
    .map((item: unknown): string => {
      if (typeof item === 'string') {
        return item;
      }
      if (typeof item === 'object' && item !== null) {
        const block = item as Record<string, unknown>;
        if (block.type === 'text' && typeof block.text === 'string') {
          return block.text;
        }
        if (block.type === 'image') {
          return '[Image]';
        }
      }
      return stringifyForPrompt(item);
    })
    .join('\n');
}

/**
 * Serialize a message's content to a plain string, including the tool_use and
 * tool_result structured blocks that getMessageText() would discard.
 *
 * This is needed for local model backends (llama.cpp, Ollama) whose chat
 * templates don't understand Anthropic-style structured content blocks.
 *
 * Rendering per block type (blocks are joined with "\n"):
 * - text: the text itself
 * - tool_use: `[Calling tool: name(jsonArgs)]`
 * - tool_result: `[Tool result: ...]`, or `[Tool result (error): ...]` when
 *   `is_error` is truthy; array content is flattened to text instead of being
 *   coerced to "[object Object]"
 * - audio: `[Voice message]: transcript`, or `[Audio attachment]` when the
 *   source has no transcript
 * - any other block type is skipped
 *
 * @param message - Message to serialize.
 * @returns The plain-text rendering; string content is returned unchanged.
 */
export function getMessageTextWithTools(message: Message): string {
  if (typeof message.content === 'string') {
    return message.content;
  }

  const parts: string[] = [];
  for (const block of message.content as Record<string, unknown>[]) {
    if (block.type === 'text' && typeof block.text === 'string') {
      parts.push(block.text);
    } else if (block.type === 'tool_use') {
      const name = block.name as string;
      parts.push(`[Calling tool: ${name}(${stringifyForPrompt(block.input)})]`);
    } else if (block.type === 'tool_result') {
      const content = toolResultContentToText(block.content);
      const isError = block.is_error ? ' (error)' : '';
      parts.push(`[Tool result${isError}: ${content}]`);
    } else if (block.type === 'audio') {
      const source = block.source as Record<string, unknown>;
      if (source?.transcript) {
        parts.push(`[Voice message]: ${source.transcript}`);
      } else {
        parts.push('[Audio attachment]');
      }
    }
  }
  return parts.join('\n');
}
|
|
|
|
/**
 * Flat chat message with string-only content, as produced by
 * normalizeMessagesForLocal().
 */
interface SimpleMessage {
  /** Who the message is attributed to. */
  role: 'system' | 'user' | 'assistant';
  /** Plain-text body of the message. */
  content: string;
}
|
|
|
|
/**
 * Flatten a conversation for local model backends that require strict role
 * alternation (system? -> user -> assistant -> user -> ...).
 *
 * Steps:
 * 1. An optional non-empty `system` prompt becomes the first entry.
 * 2. Each message is serialized to text via getMessageTextWithTools().
 * 3. Messages whose serialized text is empty are dropped.
 * 4. A message with the same role as the previous entry is appended to that
 *    entry, separated by a blank line ("\n\n").
 *
 * @param system - Optional system prompt.
 * @param messages - Conversation messages in order.
 * @returns Newly created string-content messages; the inputs are not modified.
 */
export function normalizeMessagesForLocal(
  system: string | undefined,
  messages: Message[],
): SimpleMessage[] {
  const normalized: SimpleMessage[] = system ? [{ role: 'system', content: system }] : [];

  for (const message of messages) {
    const body = getMessageTextWithTools(message);
    if (body) {
      // Indexing an empty array yields undefined, i.e. "no previous entry".
      const previous = normalized[normalized.length - 1];
      if (previous !== undefined && previous.role === message.role) {
        previous.content = `${previous.content}\n\n${body}`;
      } else {
        normalized.push({ role: message.role, content: body });
      }
    }
  }

  return normalized;
}
|
|
|
|
/**
 * Settings for transcribing audio through a Whisper-compatible HTTP API.
 * Every field is optional; without an `endpoint` no transcription request is made.
 */
export interface AudioTranscriptionConfig {
  /** Full URL of the transcription endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions"). */
  endpoint?: string;
  /** API key for the transcription service; sent as a Bearer token when set. */
  apiKey?: string;
  /** Transcription model name (default: "whisper-1"). */
  model?: string;
}
|
|
|
|
/**
 * Transcribe an audio attachment to text using an OpenAI-compatible
 * /v1/audio/transcriptions API.
 *
 * The attachment's base64 `data` is uploaded as multipart form data together
 * with the model name. This function never rejects; failures are reported
 * through placeholder strings:
 * - no `config.endpoint`: "[Audio message received but no transcription service is configured]"
 * - no attachment data, request/HTTP failure, or a response without a string
 *   `text` field: "[Audio message transcription failed]" (request and response
 *   failures are also logged with console.error)
 *
 * @param attachment - Audio attachment carrying base64 `data`.
 * @param config - Transcription endpoint settings.
 * @returns The transcript, or one of the placeholder strings above.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }
  if (!attachment.data) {
    return '[Audio message transcription failed]';
  }

  try {
    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // Validate the payload instead of trusting it: a body without a string
    // `text` would otherwise make this function resolve to undefined.
    const payload: unknown = await res.json();
    const transcript =
      typeof payload === 'object' && payload !== null
        ? (payload as { text?: unknown }).text
        : undefined;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a "text" string');
    }
    return transcript;
  } catch (error) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : 'Unknown error',
    );
    return '[Audio message transcription failed]';
  }
}
|
|
|
|
/**
 * Build a user Message from text + attachments, transcribing audio to text.
 *
 * - Audio attachments are transcribed only when `audioConfig.endpoint` is
 *   set; otherwise they are ignored. Each transcript becomes a
 *   "[Voice message]: ..." paragraph placed before `text`, in the same order
 *   as the attachments.
 * - Supported image attachments are converted to image content parts.
 * - With no image parts, the result is a plain string-content message;
 *   otherwise the content is the text part (omitted when empty) followed by
 *   the image parts.
 *
 * @param text - Message text.
 * @param attachments - Optional channel attachments.
 * @param audioConfig - Optional transcription settings.
 * @returns A user-role Message.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const all = attachments ?? [];

  // Separate image and audio attachments
  const imageAttachments = all.filter(a => isSupportedImage(a));
  const audioAttachments = all.filter(a => isSupportedAudio(a));

  // Collect transcripts in attachment order. Prepending each transcript to the
  // running text would reverse the order when there are several voice messages.
  const voiceParagraphs: string[] = [];
  if (audioConfig?.endpoint) {
    for (const audioAttachment of audioAttachments) {
      // Requests stay sequential so the service sees one upload at a time.
      const transcript = await transcribeAudio(audioAttachment, audioConfig);
      voiceParagraphs.push(`[Voice message]: ${transcript}`);
    }
  }
  const processedText =
    voiceParagraphs.length > 0 ? `${voiceParagraphs.join('\n\n')}\n\n${text}` : text;

  // Convert image attachments to content parts
  const imageParts: MessageContentPart[] = [];
  for (const att of imageAttachments) {
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No images: return a simple text message (audio is already folded into the text)
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first, then images
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}
|
|
|
|
/**
 * Report whether a message carries at least one image content part.
 *
 * @param message - Message to inspect.
 * @returns `false` for string content; otherwise whether any part has type "image".
 */
export function hasImages(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some(part => part.type === 'image');
}
|
|
|
|
/**
 * Report whether a message carries at least one audio content part.
 *
 * @param message - Message to inspect.
 * @returns `false` for string content; otherwise whether any part has type "audio".
 */
export function hasAudio(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some(part => part.type === 'audio');
}
|
|
|
|
/**
 * Replace every audio part of a message with a text part.
 *
 * Intended for model providers without native audio input (Anthropic,
 * Bedrock, local). An audio part with a transcript becomes
 * "[Voice message]: <transcript>"; one without becomes a fixed placeholder.
 * All other parts are kept as they are and in the same order.
 *
 * @param message - Message to convert.
 * @returns The same message object when its content is a string. Otherwise a
 *   copy whose content is the rewritten parts, collapsed to a plain string
 *   when exactly one text part remains.
 */
export function stripAudioParts(message: Message): Message {
  const { content } = message;
  if (typeof content === 'string') {
    return message;
  }

  const rewritten = content.map((part): MessageContentPart => {
    if (part.type !== 'audio') {
      return part;
    }
    const { transcript } = part.source;
    return {
      type: 'text',
      text: transcript
        ? `[Voice message]: ${transcript}`
        : '[Audio message received but no transcript available]',
    };
  });

  // A lone text part is simplified to string content.
  const only = rewritten[0];
  if (rewritten.length === 1 && only?.type === 'text') {
    return { ...message, content: only.text };
  }

  return { ...message, content: rewritten };
}
|