feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via configurable endpoint.
New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(),
buildUserMessageWithAudio(). Config schema gains audio section with
transcription_endpoint, api_key, and model. Daemon wires transcription
into the message router. Channel adapters extract audio from voice/audio
messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp
ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
This commit is contained in:
William Valentin
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
+347 -1
View File
@@ -1,12 +1,18 @@
import { describe, it, expect } from 'vitest';
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { vi } from 'vitest';
import type { Attachment } from '../channels/types.js';
import type { Message } from './types.js';
import {
isSupportedImage,
isSupportedAudio,
attachmentToImageSource,
buildUserMessage,
getMessageText,
hasImages,
transcribeAudio,
buildUserMessageWithAudio,
type AudioTranscriptionConfig,
mimeToExtension,
} from './media.js';
// ---------------------------------------------------------------------------
@@ -34,6 +40,30 @@ const pdfAttachment: Attachment = makeAttachment({
filename: 'doc.pdf',
});
// Audio fixtures, one per container format exercised below. The payloads are
// short base64 placeholders rather than real encoded recordings; only the WAV
// payload decodes to a recognisable (header-only, empty data chunk) file.
const oggAudioAttachment: Attachment = makeAttachment({
  filename: 'voice.ogg',
  mimeType: 'audio/ogg',
  data: 'AAAAAAAAAAAAAAAAAAAA',
});

const mp3AudioAttachment: Attachment = makeAttachment({
  filename: 'audio.mp3',
  mimeType: 'audio/mpeg',
  // Placeholder bytes labelled as MP3.
  data: 'AAAAAQAAAAAAAEAAABkAAABTQA=',
});

const wavAudioAttachment: Attachment = makeAttachment({
  filename: 'audio.wav',
  mimeType: 'audio/wav',
  // RIFF/WAVE header with an empty data chunk.
  data: 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=',
});

const m4aAudioAttachment: Attachment = makeAttachment({
  filename: 'audio.m4a',
  mimeType: 'audio/x-m4a',
  // Placeholder bytes labelled as M4A.
  data: 'AAAAUGV0Zi4xLjAgc291cmNlIGZvciBzdGFydHBvaW50',
});
// ---------------------------------------------------------------------------
// 1. isSupportedImage
// ---------------------------------------------------------------------------
@@ -259,3 +289,319 @@ describe('hasImages', () => {
expect(hasImages(msg)).toBe(true);
});
});
// ---------------------------------------------------------------------------
// 6. isSupportedAudio
// ---------------------------------------------------------------------------
describe('isSupportedAudio', () => {
  // MIME types on the audio allow-list.
  const acceptedMimeTypes = [
    'audio/ogg',
    'audio/mpeg',
    'audio/mp3',
    'audio/wav',
    'audio/webm',
    'audio/mp4',
    'audio/x-m4a',
  ];

  // Other audio formats plus a few non-audio types, none of which are on the list.
  const rejectedMimeTypes = [
    'audio/flac',
    'audio/aac',
    'audio/wma',
    'application/pdf',
    'image/jpeg',
    'text/plain',
  ];

  // Positive: every allow-listed type is accepted.
  it.each(acceptedMimeTypes)('returns true for supported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedAudio(attachment)).toBe(true);
  });

  // Negative: anything else is rejected.
  it.each(rejectedMimeTypes)('returns false for unsupported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedAudio(attachment)).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// 7. mimeToExtension
// ---------------------------------------------------------------------------
describe('mimeToExtension', () => {
  // One row per supported audio MIME type. 'audio/mp3' is included because it
  // is a distinct key in the mapping (an alias of 'audio/mpeg') and was
  // previously not covered.
  it.each([
    ['audio/ogg', 'ogg'],
    ['audio/mpeg', 'mp3'],
    ['audio/mp3', 'mp3'],
    ['audio/wav', 'wav'],
    ['audio/webm', 'webm'],
    ['audio/mp4', 'm4a'],
    ['audio/x-m4a', 'm4a'],
  ])('returns correct extension for %s', (mime, expectedExtension) => {
    expect(mimeToExtension(mime)).toBe(expectedExtension);
  });

  // Anything outside the mapping falls back to the generic 'bin' extension.
  it('returns bin for unknown MIME type', () => {
    expect(mimeToExtension('audio/flac')).toBe('bin');
  });
});
// ---------------------------------------------------------------------------
// 8. transcribeAudio
// ---------------------------------------------------------------------------
describe('transcribeAudio', () => {
  const mockTranscript = 'Hello, this is a test transcription';
  const endpoint = 'https://api.example.com/v1/audio/transcriptions';
  const originalFetch = global.fetch;

  // Replace the global fetch for every test so no real network call is made.
  beforeEach(() => {
    global.fetch = vi.fn();
  });

  afterEach(() => {
    global.fetch = originalFetch;
  });

  /** Make the mocked fetch resolve with a successful transcription payload. */
  function mockSuccess(text: string): void {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: true,
      json: async () => ({ text }),
    } as Response);
  }

  /** Return the multipart form passed to the most recent mocked fetch call. */
  function sentForm(): FormData {
    const { calls } = vi.mocked(global.fetch).mock;
    const body = calls[calls.length - 1]?.[1]?.body;
    if (!(body instanceof FormData)) {
      throw new Error('expected fetch to have been called with a FormData body');
    }
    return body;
  }

  // Positive: transcribes audio with valid config and forwards key + model.
  it('transcribes audio successfully with valid config', async () => {
    mockSuccess(mockTranscript);
    const config: AudioTranscriptionConfig = {
      endpoint,
      apiKey: 'test-key',
      model: 'test-model',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe(mockTranscript);
    expect(global.fetch).toHaveBeenCalledWith(
      endpoint,
      expect.objectContaining({
        method: 'POST',
        body: expect.any(FormData),
        headers: { Authorization: 'Bearer test-key' },
      }),
    );
    expect(sentForm().get('model')).toBe('test-model');
  });

  // Negative: returns placeholder when endpoint is missing, without any request.
  it('returns placeholder message when endpoint is not configured', async () => {
    const result = await transcribeAudio(oggAudioAttachment);

    expect(result).toBe('[Audio message received but no transcription service is configured]');
    expect(global.fetch).not.toHaveBeenCalled();
  });

  // Negative: returns placeholder when API fails.
  it('returns placeholder message when API returns error', async () => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: false,
      status: 500,
      statusText: 'Internal Server Error',
      text: async () => 'Internal Server Error',
    } as Response);
    const config: AudioTranscriptionConfig = { endpoint };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe('[Audio message transcription failed]');
  });

  // Negative: handles network errors gracefully.
  it('returns placeholder message on network error', async () => {
    vi.mocked(global.fetch).mockRejectedValue(new Error('Network error'));
    const config: AudioTranscriptionConfig = { endpoint };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe('[Audio message transcription failed]');
  });

  // Positive: uses Whisper-1 model by default.
  it('uses whisper-1 model by default', async () => {
    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.openai.com/v1/audio/transcriptions',
    };
    mockSuccess('test');

    await transcribeAudio(oggAudioAttachment, config);

    expect(global.fetch).toHaveBeenCalledWith(
      'https://api.openai.com/v1/audio/transcriptions',
      expect.objectContaining({
        body: expect.any(FormData),
      }),
    );
    // Inspect the submitted form: this is the assertion that pins the default.
    expect(sentForm().get('model')).toBe('whisper-1');
  });
});
// ---------------------------------------------------------------------------
// 9. buildUserMessageWithAudio
// ---------------------------------------------------------------------------
describe('buildUserMessageWithAudio', () => {
  const textMessage = 'What is 2 + 2?';
  const config: AudioTranscriptionConfig = {
    endpoint: 'https://api.example.com/v1/audio/transcriptions',
  };
  const originalFetch = global.fetch;

  // Replace the global fetch for every test so no real network call is made.
  beforeEach(() => {
    global.fetch = vi.fn();
  });

  afterEach(() => {
    global.fetch = originalFetch;
  });

  /** Build a minimal successful transcription response carrying `text`. */
  function transcriptionResponse(text: string): Response {
    return {
      ok: true,
      json: async () => ({ text }),
    } as Response;
  }

  /** Make every mocked fetch call resolve with the same transcript. */
  function mockTranscript(text: string): void {
    vi.mocked(global.fetch).mockResolvedValue(transcriptionResponse(text));
  }

  // Positive: plain text message when no attachments.
  it('returns plain text message when no attachments', async () => {
    const result = await buildUserMessageWithAudio(textMessage);

    expect(result).toEqual({ role: 'user', content: textMessage });
  });

  // Positive: includes transcription when audio attachment present.
  it('includes transcription when audio attachment is present', async () => {
    mockTranscript('The answer is 4');

    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);

    expect(result.role).toBe('user');
    expect(result.content).toContain('[Voice message]:');
    expect(result.content).toContain('The answer is 4');
    expect(result.content).toContain(textMessage);
  });

  // Positive: transcribes multiple audio attachments. Each request gets its own
  // transcript so the test can tell that both attachments were processed.
  it('transcribes multiple audio attachments', async () => {
    vi.mocked(global.fetch)
      .mockResolvedValueOnce(transcriptionResponse('First transcript'))
      .mockResolvedValueOnce(transcriptionResponse('Second transcript'));

    const result = await buildUserMessageWithAudio(
      textMessage,
      [oggAudioAttachment, mp3AudioAttachment],
      config,
    );

    expect(global.fetch).toHaveBeenCalledTimes(2);
    expect(result.content).toContain('[Voice message]: First transcript');
    expect(result.content).toContain('[Voice message]: Second transcript');
    expect(typeof result.content).toBe('string');
    const voiceMarkers = (result.content as string).match(/\[Voice message\]:/g) ?? [];
    expect(voiceMarkers).toHaveLength(2);
  });

  // Positive: audio transcripts appear before original text.
  it('places audio transcripts before original message text', async () => {
    mockTranscript('The answer is 4');

    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config);

    const content = Array.isArray(result.content)
      ? result.content
      : [{ type: 'text' as const, text: result.content }];
    const textPart = content.find((p) => p.type === 'text') as
      | { type: 'text'; text: string }
      | undefined;
    expect(textPart).toBeDefined();
    const textContent = textPart!.text || '';
    const firstVoiceIndex = textContent.indexOf('[Voice message]:');
    const textIndex = textContent.indexOf(textMessage);
    expect(firstVoiceIndex).toBeLessThan(textIndex);
  });

  // Positive: handles mixed image and audio attachments.
  it('handles mixed image and audio attachments', async () => {
    mockTranscript('The answer is 4');

    const result = await buildUserMessageWithAudio(
      textMessage,
      [jpegBase64Attachment, oggAudioAttachment, pngUrlAttachment],
      config,
    );

    expect(result.role).toBe('user');
    expect(Array.isArray(result.content)).toBe(true);
    const parts = result.content as Array<{ type: string; text?: string }>;
    // One text part (transcript + original message) followed by the two image parts.
    expect(parts).toHaveLength(3);
    expect(parts.map((p) => p.type)).toEqual(['text', 'image', 'image']);
    const textPart = parts.find((p) => p.type === 'text');
    expect(textPart?.text).toContain('[Voice message]:');
    expect(textPart?.text).toContain(textMessage);
    const imagePart = parts.find((p) => p.type === 'image');
    expect(imagePart).toBeDefined();
  });

  // Positive: no transcription (and no request) when audio config is missing.
  it('returns original message when audio config is missing', async () => {
    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment]);

    expect(result).toEqual({ role: 'user', content: textMessage });
    expect(global.fetch).not.toHaveBeenCalled();
  });

  // Positive: empty text with audio attachments.
  it('handles empty text with audio attachments', async () => {
    mockTranscript('Test');

    const result = await buildUserMessageWithAudio('', [oggAudioAttachment], config);

    expect(result.role).toBe('user');
    expect(result.content).toContain('[Voice message]:');
  });
});
+127
View File
@@ -13,11 +13,41 @@ const SUPPORTED_IMAGE_TYPES = new Set([
'image/webp',
]);
/**
 * Allow-list of audio MIME types (as opposed to the image allow-list).
 * Membership is checked by exact string match.
 */
const SUPPORTED_AUDIO_TYPES = new Set<string>([
  // Ogg container
  'audio/ogg',
  // MP3, under both its registered and its informal name
  'audio/mpeg',
  'audio/mp3',
  // WAV
  'audio/wav',
  // WebM
  'audio/webm',
  // MPEG-4 audio, under two common names
  'audio/mp4',
  'audio/x-m4a',
]);
/** Report whether the attachment's MIME type is in the supported image set. */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}
/** Report whether the attachment's MIME type is in the supported audio set. */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}
/** File extension for each known audio MIME type, keyed by bare lower-case type. */
const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert a MIME type to a file extension (without the leading dot).
 *
 * The lookup ignores case, surrounding whitespace and any parameters, so
 * `"audio/ogg; codecs=opus"` and `"Audio/OGG"` both resolve to `"ogg"`.
 *
 * @param mime - MIME type string, optionally with parameters.
 * @returns The mapped extension, or `"bin"` when the type is not a known audio type.
 */
export function mimeToExtension(mime: string): string {
  // Keep only the "type/subtype" portion; parameters follow the first ';'.
  const essence = mime.split(';', 1)[0]?.trim().toLowerCase() ?? '';
  // A Map has no inherited keys, so inputs such as "constructor" or "toString"
  // cannot resolve to Object.prototype members the way a plain-object lookup would.
  return AUDIO_EXTENSION_BY_MIME.get(essence) ?? 'bin';
}
/** Convert a channel Attachment to a model ImageSource. Prefers base64 data, falls back to URL. */
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
if (!isSupportedImage(attachment)) {
@@ -90,6 +120,103 @@ export function getMessageText(message: Message): string {
.join('');
}
/**
 * Configuration for audio transcription via Whisper-compatible API.
 * Every field is optional; when `endpoint` is not set, `transcribeAudio`
 * returns a placeholder string instead of sending a request.
 */
export interface AudioTranscriptionConfig {
/** Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions"); the audio is POSTed to this URL as-is. */
endpoint?: string;
/** API key for the transcription service; sent as an `Authorization: Bearer <key>` header when set, omitted otherwise. */
apiKey?: string;
/** Model name submitted in the `model` form field (default: "whisper-1") */
model?: string;
}
/**
 * Transcribe an audio attachment to text using the OpenAI-compatible
 * /v1/audio/transcriptions API.
 *
 * The attachment's base64 `data` is uploaded as a multipart form (`file` and
 * `model` fields). This function does not reject on request or response
 * failures: every failure is logged and reported through the return value.
 *
 * @param attachment - Audio attachment carrying base64-encoded `data`.
 * @param config - Transcription settings; `model` defaults to "whisper-1".
 * @returns The transcript text, or a bracketed placeholder string when no
 *   endpoint is configured or when the transcription attempt fails (missing
 *   data, network error, non-2xx status, or a response without a string `text`).
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }

  try {
    // Only inline base64 payloads can be uploaded. Fail with an explicit
    // message instead of relying on Buffer.from throwing on undefined.
    if (!attachment.data) {
      throw new Error('Attachment has no inline data to transcribe');
    }
    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);

    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    // The Authorization header is only sent when a key is configured.
    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // Validate the payload shape: a 2xx response without a string `text`
    // must not leak `undefined` to callers typed as receiving a string.
    const payload: unknown = await res.json();
    const transcript =
      typeof payload === 'object' && payload !== null && 'text' in payload
        ? (payload as { text: unknown }).text
        : undefined;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a "text" string');
    }
    return transcript;
  } catch (error) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : 'Unknown error',
    );
    return '[Audio message transcription failed]';
  }
}
/**
 * Build a multimodal Message from text + attachments, with optional audio transcription.
 *
 * When `audioConfig.endpoint` is set, each supported audio attachment is
 * transcribed and rendered as a `[Voice message]: <transcript>` paragraph.
 * The paragraphs appear in attachment order, followed by the original text
 * (if any), separated by blank lines. Without an endpoint, audio attachments
 * are ignored and the text is passed through unchanged.
 * Supported image attachments are converted to image content parts.
 *
 * @param text - The user's message text; may be empty.
 * @param attachments - Attachments received with the message.
 * @param audioConfig - Transcription settings; transcription is skipped when
 *   absent or when it has no `endpoint`.
 * @returns A user message whose content is a plain string when there are no
 *   usable images, otherwise an array with the text part (if non-empty)
 *   followed by the image parts.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const received = attachments ?? [];

  // Transcribe audio attachments and place them ahead of the text (only if config is provided).
  let processedText = text;
  if (audioConfig?.endpoint) {
    const audioAttachments = received.filter((a) => isSupportedAudio(a));
    // Promise.all keeps results in input order, so transcripts stay in the
    // order the attachments were received (prepending one at a time reversed them).
    const transcripts = await Promise.all(
      audioAttachments.map((a) => transcribeAudio(a, audioConfig)),
    );
    const paragraphs = transcripts.map((transcript) => `[Voice message]: ${transcript}`);
    // Skip an empty message so the result does not end in a dangling blank line.
    if (text) {
      paragraphs.push(text);
    }
    processedText = paragraphs.join('\n\n');
  }

  // Convert image attachments to content parts; unconvertible ones are dropped.
  const imageParts: MessageContentPart[] = [];
  for (const att of received) {
    if (!isSupportedImage(att)) {
      continue;
    }
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No usable images: return a simple text message (which may include transcripts).
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first, then images.
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}
/**
* Check whether a message contains image content parts.
*/