Files
flynn/src/models/media.ts
T
William Valentin 2a962abcd0 feat: add audio transcription pipeline for voice messages
Adds Whisper-compatible audio transcription via configurable endpoint.
New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(),
buildUserMessageWithAudio(). Config schema gains audio section with
transcription_endpoint, api_key, and model. Daemon wires transcription
into the message router. Channel adapters extract audio from voice/audio
messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp
ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
2026-02-07 09:09:13 -08:00

229 lines
6.7 KiB
TypeScript

/**
 * Media utilities for converting channel attachments to model content parts,
 * including transcription of audio attachments into message text.
 */
import type { Attachment } from '../channels/types.js';
import type { MessageContentPart, ImageSource, Message } from './types.js';
/** Image MIME types that vision models generally accept; anything else is not treated as an image. */
const SUPPORTED_IMAGE_TYPES: ReadonlySet<string> = new Set<string>([
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
]);
/** Audio MIME types recognised by this module (as opposed to image types). */
const SUPPORTED_AUDIO_TYPES: ReadonlySet<string> = new Set<string>([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);
/** Report whether the attachment's MIME type is one of the supported image types. */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}
/** Report whether the attachment's MIME type is one of the supported audio types. */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}
/** Lookup table from bare, lower-cased audio MIME type to the file extension used for it. */
const AUDIO_MIME_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert a MIME type to a file extension.
 *
 * The lookup ignores letter case and any MIME parameters, so
 * `"audio/ogg; codecs=opus"` and `"Audio/OGG"` both map to `"ogg"`.
 *
 * @param mime - MIME type string, optionally followed by `;`-separated parameters.
 * @returns The extension without a leading dot, or `"bin"` when the type is not known.
 */
export function mimeToExtension(mime: string): string {
  // Keep only the "type/subtype" part: drop parameters, surrounding whitespace and case.
  const essence = (mime.split(';', 1)[0] ?? '').trim().toLowerCase();
  // A Map (rather than a plain object) guarantees that inherited keys such as
  // "constructor" or "toString" are never mistaken for table entries.
  return AUDIO_MIME_EXTENSIONS.get(essence) ?? 'bin';
}
/**
 * Turn a channel Attachment into a model ImageSource.
 *
 * Inline base64 data wins over a URL when both are present. Returns null when
 * the attachment is not a supported image or carries neither data nor a URL.
 */
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
  if (!isSupportedImage(attachment)) return null;

  const { mimeType, data, url } = attachment;
  if (data) {
    return { type: 'base64', media_type: mimeType, data };
  }
  return url ? { type: 'url', media_type: mimeType, url } : null;
}
/**
 * Assemble a user Message from text and optional attachments.
 *
 * When no attachment yields an image source, the result is a plain string-content
 * Message (the backward-compatible shape). Otherwise the content is an array of
 * parts: the text part first (omitted when the text is empty), then the images.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const images = (attachments ?? []).flatMap((attachment): MessageContentPart[] => {
    const source = attachmentToImageSource(attachment);
    return source ? [{ type: 'image', source }] : [];
  });

  // Nothing visual to send: keep the simple text form.
  if (images.length === 0) {
    return { role: 'user', content: text };
  }

  // Multimodal form: text leads, images follow.
  const content: MessageContentPart[] = text ? [{ type: 'text', text }, ...images] : images;
  return { role: 'user', content };
}
/**
 * Return the textual content of a Message, whatever shape its content has.
 * String content is returned as-is; for part arrays the text parts are
 * concatenated in order with no separator and non-text parts are ignored.
 */
export function getMessageText(message: Message): string {
  const { content } = message;
  if (typeof content === 'string') {
    return content;
  }

  const pieces: string[] = [];
  for (const part of content) {
    if (part.type === 'text') {
      pieces.push(part.text);
    }
  }
  return pieces.join('');
}
/**
 * Configuration for audio transcription via a Whisper-compatible API.
 * Every field is optional; transcribeAudio() returns a placeholder string
 * instead of calling any service when `endpoint` is not set.
 */
export interface AudioTranscriptionConfig {
/**
 * Whisper-compatible API endpoint that receives the multipart POST
 * (e.g. "https://api.openai.com/v1/audio/transcriptions").
 */
endpoint?: string;
/** API key for the transcription service; sent as an `Authorization: Bearer` header when present. */
apiKey?: string;
/** Model name sent in the `model` form field (default: "whisper-1"). */
model?: string;
}
/**
 * Transcribe an audio attachment to text using an OpenAI-compatible
 * `/v1/audio/transcriptions` endpoint.
 *
 * The attachment's base64 `data` is uploaded as a multipart form (`file` and
 * `model` fields). Failures are reported in-band: the function logs the cause
 * and resolves to a bracketed placeholder instead of rejecting.
 *
 * @param attachment - Audio attachment; must carry inline base64 `data`.
 * @param config - Transcription settings. Without `config.endpoint` no request is made.
 * @returns The transcript text, or one of the placeholders
 *   `[Audio message received but no transcription service is configured]` /
 *   `[Audio message transcription failed]`.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }

  try {
    // Fail with a descriptive message rather than relying on Buffer.from(undefined) throwing.
    if (!attachment.data) {
      throw new Error('attachment has no inline base64 data');
    }
    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);

    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // The response body is untrusted: verify it really carries a string `text`
    // so callers never receive `undefined` typed as a string.
    const payload: unknown = await res.json();
    const transcript =
      typeof payload === 'object' && payload !== null
        ? (payload as { text?: unknown }).text
        : undefined;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a "text" string');
    }
    return transcript;
  } catch (error) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : String(error),
    );
    return '[Audio message transcription failed]';
  }
}
/**
 * Build a user Message from text + attachments, with optional audio transcription.
 *
 * When `audioConfig.endpoint` is set, every supported audio attachment is
 * transcribed and its transcript is placed ahead of the original text as a
 * `[Voice message]: …` paragraph, in the same order as the attachments.
 * Without an endpoint, audio attachments are ignored. Image attachments become
 * image content parts.
 *
 * @param text - The user's text; may be empty.
 * @param attachments - Channel attachments of any type; unsupported ones are skipped.
 * @param audioConfig - Transcription settings; transcription only runs when `endpoint` is set.
 * @returns A string-content Message when there are no images, otherwise a
 *   Message whose content is the (non-empty) text part followed by the image parts.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const allAttachments = attachments ?? [];

  // Transcribe audio attachments and place the transcripts before the text
  // (only if a transcription endpoint is configured).
  let processedText = text;
  if (audioConfig?.endpoint) {
    const audioAttachments = allAttachments.filter(a => isSupportedAudio(a));
    // Promise.all keeps results in attachment order, so the transcripts read
    // chronologically instead of being reversed by repeated prepending.
    const transcripts = await Promise.all(
      audioAttachments.map(a => transcribeAudio(a, audioConfig)),
    );
    const voiceParagraphs = transcripts.map(t => `[Voice message]: ${t}\n\n`).join('');
    processedText = `${voiceParagraphs}${text}`;
  }

  // Convert image attachments to content parts; attachmentToImageSource
  // returns null for anything that is not a usable image.
  const imageParts: MessageContentPart[] = [];
  for (const att of allAttachments) {
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No images: return a simple text message (transcripts, if any, are already in the text).
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first, then images.
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);
  return { role: 'user', content: parts };
}
/**
 * Report whether a message carries at least one image content part.
 * String-content messages never do.
 */
export function hasImages(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some(part => part.type === 'image');
}