feat: add audio transcription pipeline for voice messages
Adds Whisper-compatible audio transcription via a configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). The config schema gains an audio section with transcription_endpoint, api_key, and model. The daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (previously 25; the suite now covers all audio paths).
This commit is contained in:
@@ -13,11 +13,41 @@ const SUPPORTED_IMAGE_TYPES = new Set([
|
||||
'image/webp',
|
||||
]);
|
||||
|
||||
/**
 * Audio MIME types recognised by `isSupportedAudio`.
 * Lookups are exact string matches against these entries.
 */
const SUPPORTED_AUDIO_TYPES = new Set<string>([
  'audio/ogg', 'audio/mpeg', 'audio/mp3', 'audio/wav',
  'audio/webm', 'audio/mp4', 'audio/x-m4a',
]);
|
||||
|
||||
/**
 * Report whether an attachment's MIME type is one of the supported image types.
 *
 * @param attachment - Attachment whose `mimeType` is looked up.
 * @returns `true` when `attachment.mimeType` is a member of `SUPPORTED_IMAGE_TYPES`.
 */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}
|
||||
|
||||
/**
 * Report whether an attachment's MIME type is one of the supported audio types.
 *
 * @param attachment - Attachment whose `mimeType` is looked up.
 * @returns `true` when `attachment.mimeType` is a member of `SUPPORTED_AUDIO_TYPES`.
 */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}
|
||||
|
||||
/**
 * Extension lookup keyed by bare, lower-case audio MIME type.
 * A Map is used instead of an object literal so that inherited keys such as
 * "constructor" or "__proto__" can never be returned as a match.
 */
const AUDIO_MIME_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert an audio MIME type to a file extension (without a leading dot).
 *
 * The type is matched case-insensitively and any parameters after `;`
 * (for example `audio/ogg; codecs=opus`) are ignored.
 *
 * @param mime - MIME type string, optionally followed by parameters.
 * @returns The mapped extension, or `'bin'` when the type is not in the table.
 */
export function mimeToExtension(mime: string): string {
  // Keep only the "type/subtype" part: drop parameters, whitespace and case.
  const essence = (mime.split(';', 1)[0] ?? '').trim().toLowerCase();
  return AUDIO_MIME_EXTENSIONS.get(essence) ?? 'bin';
}
|
||||
|
||||
/** Convert a channel Attachment to a model ImageSource. Prefers base64 data, falls back to URL. */
|
||||
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
|
||||
if (!isSupportedImage(attachment)) {
|
||||
@@ -90,6 +120,103 @@ export function getMessageText(message: Message): string {
|
||||
.join('');
|
||||
}
|
||||
|
||||
/**
 * Settings for transcribing audio through a Whisper-compatible HTTP API.
 * All fields are optional.
 */
export interface AudioTranscriptionConfig {
  /**
   * URL of the Whisper-compatible transcription endpoint,
   * e.g. "https://api.openai.com/v1/audio/transcriptions".
   */
  endpoint?: string;

  /** API key for the transcription service. */
  apiKey?: string;

  /** Name of the model to request; defaults to "whisper-1". */
  model?: string;
}
|
||||
|
||||
/**
 * Transcribe an audio attachment to text using an OpenAI-compatible
 * `/v1/audio/transcriptions` endpoint.
 *
 * The attachment's base64 `data` is decoded and uploaded as multipart form
 * data (fields `file` and `model`); the `text` field of the JSON response is
 * returned. Failures are logged with `console.error` and reported through the
 * return value rather than by rejecting.
 *
 * @param attachment - Audio attachment; its `mimeType` and base64 `data` are used.
 * @param config - Transcription settings. `model` defaults to "whisper-1";
 *   `apiKey`, when set, is sent as a Bearer token.
 * @returns The transcript on success. Returns
 *   `'[Audio message received but no transcription service is configured]'`
 *   when no endpoint is configured, or `'[Audio message transcription failed]'`
 *   when the attachment has no data, the request fails, or the response has
 *   no string `text` field.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }

  try {
    // Guard explicitly instead of relying on Buffer.from() throwing on undefined.
    if (!attachment.data) {
      throw new Error('attachment has no inline base64 data');
    }
    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);

    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    // Content-Type is left to fetch so the multipart boundary is set correctly.
    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // Validate the payload: an unexpected body must not resolve to `undefined`.
    const payload: unknown = await res.json();
    const transcript = (payload as { text?: unknown } | null)?.text;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a "text" string');
    }
    return transcript;
  } catch (error: unknown) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : String(error),
    );
    return '[Audio message transcription failed]';
  }
}
|
||||
|
||||
/**
 * Build a user Message from text plus attachments, transcribing audio when
 * a transcription endpoint is configured.
 *
 * - Supported audio attachments are transcribed and placed before `text`,
 *   one `[Voice message]: …` paragraph per attachment, in attachment order.
 *   When `audioConfig.endpoint` is not set, audio attachments are ignored.
 * - Supported image attachments are converted to image content parts.
 * - Attachments that are neither supported images nor supported audio are ignored.
 *
 * @param text - The user's text; may be empty.
 * @param attachments - Optional attachments accompanying the message.
 * @param audioConfig - Optional transcription settings.
 * @returns A message whose content is a plain string when there are no image
 *   parts, otherwise an array with the text part (if non-empty) followed by
 *   the image parts.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const allAttachments = attachments ?? [];

  // Transcribe audio attachments (only if an endpoint is configured).
  let processedText = text;
  if (audioConfig?.endpoint) {
    const audioAttachments = allAttachments.filter((a) => isSupportedAudio(a));
    // The transcriptions are independent, so run them concurrently;
    // Promise.all keeps the results in attachment order.
    const transcripts = await Promise.all(
      audioAttachments.map((audio) => transcribeAudio(audio, audioConfig)),
    );
    // Join in order so the first voice message is read first.
    const voiceSections = transcripts
      .map((transcript) => `[Voice message]: ${transcript}\n\n`)
      .join('');
    processedText = `${voiceSections}${text}`;
  }

  // Convert image attachments to content parts, skipping any that cannot be converted.
  const imageParts: MessageContentPart[] = [];
  for (const att of allAttachments) {
    if (!isSupportedImage(att)) {
      continue;
    }
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No image parts: return a simple text message (including any transcripts).
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first (when non-empty), then images.
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}
|
||||
|
||||
/**
|
||||
* Check whether a message contains image content parts.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user