From 2a962abcd03518b01ebdc669b54f08b83756d8f6 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Sat, 7 Feb 2026 09:09:13 -0800 Subject: [PATCH] feat: add audio transcription pipeline for voice messages Adds Whisper-compatible audio transcription via configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). Config schema gains audio section with transcription_endpoint, transcription_api_key, and transcription_model. Daemon wires transcription into the message router and binds a per-session OutboundAttachmentCollector (media.send tool) so replies can carry outbound attachments. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (was 25, now covers all audio paths). --- src/config/schema.ts | 8 + src/daemon/index.ts | 60 +++++-- src/models/media.test.ts | 348 ++++++++++++++++++++++++++++++++++++++- src/models/media.ts | 127 ++++++++++++++ 4 files changed, 531 insertions(+), 12 deletions(-) diff --git a/src/config/schema.ts b/src/config/schema.ts index 5d85fce..642c842 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -180,6 +180,12 @@ const webSearchSchema = z.object({ max_results: z.number().min(1).max(20).default(5), }).default({}); +const audioSchema = z.object({ + transcription_endpoint: z.string().optional(), + transcription_api_key: z.string().optional(), + transcription_model: z.string().default('whisper-1'), +}).default({}); + // ── Tool policy schemas ────────────────────────────────────────────── const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']); @@ -259,6 +265,7 @@ export const configSchema = z.object({ browser: browserSchema, retry: retrySchema, web_search: webSearchSchema, + audio: audioSchema, prompt: promptSchema, tools: toolsSchema, sandbox: sandboxSchema, @@ -274,6 +281,7 @@ export type AgentsConfig = z.infer; export type CompactionConfig = z.infer; export type MemoryConfig = z.infer; export type WebSearchConfig = z.infer; +export type AudioConfig = z.infer; export type 
ProcessConfig = z.infer; export type BrowserConfig = z.infer; export type DiscordConfig = z.infer; diff --git a/src/daemon/index.ts b/src/daemon/index.ts index 4ce088b..14bd960 100644 --- a/src/daemon/index.ts +++ b/src/daemon/index.ts @@ -1,11 +1,15 @@ import { Lifecycle } from './lifecycle.js'; import type { Config, ModelConfig } from '../config/index.js'; +import type { AudioTranscriptionConfig } from '../models/media.js'; +import type { Attachment } from '../channels/types.js'; +import { isSupportedAudio, transcribeAudio } from '../models/media.js'; import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, GeminiClient, BedrockClient, GitHubModelsClient, ModelRouter, DEFAULT_RETRY_CONFIG } from '../models/index.js'; import type { ModelClient, RetryConfig } from '../models/index.js'; import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js'; +import { OutboundAttachmentCollector } from '../backends/native/attachments.js'; import { SessionStore, SessionManager } from '../session/index.js'; import { HookEngine } from '../hooks/index.js'; -import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js'; +import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool } from '../tools/index.js'; import type { Tool } from '../tools/types.js'; import { MemoryStore } from '../memory/index.js'; import { createMemoryTools } from '../tools/builtin/index.js'; @@ -204,11 +208,12 @@ function createMessageRouter(deps: { agentConfigRegistry?: AgentConfigRegistry; agentRouter?: AgentRouter; sandboxManager?: SandboxManager; + audioConfig?: AudioTranscriptionConfig; }) { // Cache agents by session ID + agent config name to avoid recreating on every message - const agents = new Map(); + const agents = new Map(); - 
function getOrCreateAgent(channel: string, senderId: string): AgentOrchestrator { + function getOrCreateAgent(channel: string, senderId: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } { // Resolve agent config name via routing (sender → channel → default fallback) const agentConfigName = deps.agentRouter?.resolve(channel, senderId); const agentConfig = agentConfigName ? deps.agentConfigRegistry?.get(agentConfigName) : undefined; @@ -218,8 +223,8 @@ function createMessageRouter(deps: { ? `${channel}:${senderId}:${agentConfigName}` : `${channel}:${senderId}`; - let agent = agents.get(sessionId); - if (!agent) { + let entry = agents.get(sessionId); + if (!entry) { const session = deps.sessionManager.getSession(channel, senderId); // Use agent config overrides where available, falling back to global config @@ -286,7 +291,14 @@ function createMessageRouter(deps: { effectiveToolRegistry.replace(lazySandboxProcess); } - agent = new AgentOrchestrator({ + // Create an attachment collector for this agent session + const collector = new OutboundAttachmentCollector(); + + // Clone the tool registry to register the media.send tool bound to this collector + effectiveToolRegistry = effectiveToolRegistry.clone(); + effectiveToolRegistry.register(createMediaSendTool(collector)); + + const orchestrator = new AgentOrchestrator({ modelRouter: deps.modelRouter, systemPrompt: effectiveSystemPrompt, session, @@ -307,14 +319,16 @@ function createMessageRouter(deps: { agent: effectiveTier, provider: effectiveProvider, }, + attachmentCollector: collector, }); - agents.set(sessionId, agent); + entry = { orchestrator, collector }; + agents.set(sessionId, entry); } - return agent; + return entry; } return async (msg: InboundMessage, reply: (response: OutboundMessage) => Promise): Promise => { - const agent = getOrCreateAgent(msg.channel, msg.senderId); + const { orchestrator: agent, collector } = getOrCreateAgent(msg.channel, msg.senderId); // Handle 
special commands if (msg.metadata?.isCommand) { @@ -367,8 +381,24 @@ function createMessageRouter(deps: { } try { - const response = await agent.process(msg.text, msg.attachments); - await reply({ text: response, replyTo: msg.id }); + // Transcribe audio attachments before processing + let messageText = msg.text; + const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); + + if (audioAttachments.length > 0 && deps.audioConfig) { + for (const att of audioAttachments) { + const transcript = await transcribeAudio(att, deps.audioConfig); + messageText = `[Voice message]: ${transcript}\n\n${messageText}`; + } + } + + const response = await agent.process(messageText, msg.attachments); + const outboundAttachments = collector.drain(); + await reply({ + text: response, + replyTo: msg.id, + attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined, + }); } catch (error) { console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error); await reply({ @@ -539,6 +569,13 @@ export async function startDaemon(config: Config): Promise { }); } + // Initialize audio transcription config + const audioConfig: AudioTranscriptionConfig = { + endpoint: config.audio.transcription_endpoint, + apiKey: config.audio.transcription_api_key, + model: config.audio.transcription_model, + }; + // Initialize model router const modelRouter = createModelRouter(config); @@ -593,6 +630,7 @@ export async function startDaemon(config: Config): Promise { agentConfigRegistry, agentRouter, sandboxManager, + audioConfig, })); // Register Telegram adapter diff --git a/src/models/media.test.ts b/src/models/media.test.ts index 8ac73d2..529de43 100644 --- a/src/models/media.test.ts +++ b/src/models/media.test.ts @@ -1,12 +1,18 @@ -import { describe, it, expect } from 'vitest'; +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { vi } from 'vitest'; import type { Attachment } from '../channels/types.js'; 
import type { Message } from './types.js'; import { isSupportedImage, + isSupportedAudio, attachmentToImageSource, buildUserMessage, getMessageText, hasImages, + transcribeAudio, + buildUserMessageWithAudio, + type AudioTranscriptionConfig, + mimeToExtension, } from './media.js'; // --------------------------------------------------------------------------- @@ -34,6 +40,30 @@ const pdfAttachment: Attachment = makeAttachment({ filename: 'doc.pdf', }); +const oggAudioAttachment: Attachment = makeAttachment({ + mimeType: 'audio/ogg', + data: 'AAAAAAAAAAAAAAAAAAAA', + filename: 'voice.ogg', +}); + +const mp3AudioAttachment: Attachment = makeAttachment({ + mimeType: 'audio/mpeg', + data: 'AAAAAQAAAAAAAEAAABkAAABTQA=', // Base64 of a short MP3 + filename: 'audio.mp3', +}); + +const wavAudioAttachment: Attachment = makeAttachment({ + mimeType: 'audio/wav', + data: 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=', // Base64 of a short WAV + filename: 'audio.wav', +}); + +const m4aAudioAttachment: Attachment = makeAttachment({ + mimeType: 'audio/x-m4a', + data: 'AAAAUGV0Zi4xLjAgc291cmNlIGZvciBzdGFydHBvaW50', // Base64 of M4A + filename: 'audio.m4a', +}); + // --------------------------------------------------------------------------- // 1. isSupportedImage // --------------------------------------------------------------------------- @@ -259,3 +289,319 @@ describe('hasImages', () => { expect(hasImages(msg)).toBe(true); }); }); + +// --------------------------------------------------------------------------- +// 6. isSupportedAudio +// --------------------------------------------------------------------------- + +describe('isSupportedAudio', () => { + // Positive: all supported audio MIME types should return true. 
+ it.each([ + 'audio/ogg', + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/webm', + 'audio/mp4', + 'audio/x-m4a', + ])('returns true for supported type %s', (mime) => { + expect(isSupportedAudio(makeAttachment({ mimeType: mime }))).toBe(true); + }); + + // Negative: unsupported MIME types should return false. + it.each([ + 'audio/flac', + 'audio/aac', + 'audio/wma', + 'application/pdf', + 'image/jpeg', + 'text/plain', + ])('returns false for unsupported type %s', (mime) => { + expect(isSupportedAudio(makeAttachment({ mimeType: mime }))).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// 7. mimeToExtension +// --------------------------------------------------------------------------- + +describe('mimeToExtension', () => { + it('returns correct extension for audio/ogg', () => { + expect(mimeToExtension('audio/ogg')).toBe('ogg'); + }); + + it('returns correct extension for audio/mpeg', () => { + expect(mimeToExtension('audio/mpeg')).toBe('mp3'); + }); + + it('returns correct extension for audio/wav', () => { + expect(mimeToExtension('audio/wav')).toBe('wav'); + }); + + it('returns correct extension for audio/webm', () => { + expect(mimeToExtension('audio/webm')).toBe('webm'); + }); + + it('returns correct extension for audio/mp4', () => { + expect(mimeToExtension('audio/mp4')).toBe('m4a'); + }); + + it('returns correct extension for audio/x-m4a', () => { + expect(mimeToExtension('audio/x-m4a')).toBe('m4a'); + }); + + it('returns bin for unknown MIME type', () => { + expect(mimeToExtension('audio/flac')).toBe('bin'); + }); +}); + +// --------------------------------------------------------------------------- +// 8. 
transcribeAudio +// --------------------------------------------------------------------------- + +describe('transcribeAudio', () => { + const mockTranscript = 'Hello, this is a test transcription'; + const originalFetch = global.fetch; + + beforeEach(() => { + global.fetch = vi.fn(); + }); + + afterEach(() => { + global.fetch = originalFetch; + }); + + // Positive: transcribes audio with valid config. + it('transcribes audio successfully with valid config', async () => { + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: mockTranscript }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + apiKey: 'test-key', + model: 'test-model', + }; + + const result = await transcribeAudio(oggAudioAttachment, config); + + expect(result).toBe(mockTranscript); + expect(global.fetch).toHaveBeenCalledWith( + 'https://api.example.com/v1/audio/transcriptions', + expect.objectContaining({ + method: 'POST', + body: expect.any(FormData), + }), + ); + }); + + // Negative: returns placeholder when endpoint is missing. + it('returns placeholder message when endpoint is not configured', async () => { + const result = await transcribeAudio(oggAudioAttachment); + + expect(result).toBe('[Audio message received but no transcription service is configured]'); + }); + + // Negative: returns placeholder when API fails. 
+ it('returns placeholder message when API returns error', async () => { + vi.mocked(global.fetch).mockResolvedValue({ + ok: false, + status: 500, + statusText: 'Internal Server Error', + text: async () => 'Internal Server Error', + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await transcribeAudio(oggAudioAttachment, config); + + expect(result).toBe('[Audio message transcription failed]'); + }); + + // Negative: handles network errors gracefully. + it('returns placeholder message on network error', async () => { + vi.mocked(global.fetch).mockRejectedValue(new Error('Network error')); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await transcribeAudio(oggAudioAttachment, config); + + expect(result).toBe('[Audio message transcription failed]'); + }); + + // Positive: uses Whisper-1 model by default. + it('uses whisper-1 model by default', async () => { + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.openai.com/v1/audio/transcriptions', + }; + + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: 'test' }), + } as Response); + + await transcribeAudio(oggAudioAttachment, config); + + expect(global.fetch).toHaveBeenCalledWith( + 'https://api.openai.com/v1/audio/transcriptions', + expect.objectContaining({ + body: expect.any(FormData), + }), + ); + }); +}); + +// --------------------------------------------------------------------------- +// 9. 
buildUserMessageWithAudio +// --------------------------------------------------------------------------- + +describe('buildUserMessageWithAudio', () => { + const textMessage = 'What is 2 + 2?'; + const originalFetch = global.fetch; + + beforeEach(() => { + global.fetch = vi.fn(); + }); + + afterEach(() => { + global.fetch = originalFetch; + }); + + // Positive: plain text message when no attachments. + it('returns plain text message when no attachments', async () => { + const result = await buildUserMessageWithAudio(textMessage); + + expect(result).toEqual({ role: 'user', content: textMessage }); + }); + + // Positive: includes transcription when audio attachment present. + it('includes transcription when audio attachment is present', async () => { + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: 'The answer is 4' }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config); + + expect(result.role).toBe('user'); + expect(result.content).toContain('[Voice message]:'); + expect(result.content).toContain('The answer is 4'); + expect(result.content).toContain(textMessage); + }); + + // Positive: transcribes multiple audio attachments. 
+ it('transcribes multiple audio attachments', async () => { + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: 'The answer is 4' }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await buildUserMessageWithAudio( + textMessage, + [oggAudioAttachment, mp3AudioAttachment], + config, + ); + + expect(result.content).toContain('[Voice message]: The answer is 4'); + expect(result.content).toContain('[Voice message]: The answer is 4'); + }); + + // Positive: audio transcripts appear before original text. + it('places audio transcripts before original message text', async () => { + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: 'The answer is 4' }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], config); + + const content = Array.isArray(result.content) ? result.content : [{ type: 'text' as const, text: result.content }]; + const textPart = content.find((p) => p.type === 'text') as { type: 'text'; text: string } | undefined; + expect(textPart).toBeDefined(); + + const textContent = textPart!.text || ''; + const firstVoiceIndex = textContent.indexOf('[Voice message]:'); + const textIndex = textContent.indexOf(textMessage); + + expect(firstVoiceIndex).toBeLessThan(textIndex); + }); + + // Positive: handles mixed image and audio attachments. 
+ it('handles mixed image and audio attachments', async () => { + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: 'The answer is 4' }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await buildUserMessageWithAudio( + textMessage, + [jpegBase64Attachment, oggAudioAttachment, pngUrlAttachment], + config, + ); + + expect(result.role).toBe('user'); + expect(Array.isArray(result.content)).toBe(true); + + const parts = result.content as Array<{ type: string; text?: string }>; + expect(parts).toHaveLength(3); // transcription text, image part, text part + + const textPart = parts.find((p) => p.type === 'text'); + expect(textPart?.text).toContain('[Voice message]:'); + expect(textPart?.text).toContain(textMessage); + + const imagePart = parts.find((p) => p.type === 'image'); + expect(imagePart).toBeDefined(); + }); + + // Positive: no transcription when audio config is missing. + it('returns original message when audio config is missing', async () => { + const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment]); + + expect(result).toEqual({ role: 'user', content: textMessage }); + }); + + // Positive: empty text with audio attachments. 
+ it('handles empty text with audio attachments', async () => { + // Mock fetch to avoid actual API calls + vi.mocked(global.fetch).mockResolvedValue({ + ok: true, + json: async () => ({ text: 'Test' }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + }; + + const result = await buildUserMessageWithAudio('', [oggAudioAttachment], config); + + expect(result.role).toBe('user'); + expect(result.content).toContain('[Voice message]:'); + }); +}); diff --git a/src/models/media.ts b/src/models/media.ts index 796d946..3fbe630 100644 --- a/src/models/media.ts +++ b/src/models/media.ts @@ -13,11 +13,41 @@ const SUPPORTED_IMAGE_TYPES = new Set([ 'image/webp', ]); +/** MIME types that are audio (not image). */ +const SUPPORTED_AUDIO_TYPES = new Set([ + 'audio/ogg', + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/webm', + 'audio/mp4', + 'audio/x-m4a', +]); + /** Check whether an attachment is a supported image type. */ export function isSupportedImage(attachment: Attachment): boolean { return SUPPORTED_IMAGE_TYPES.has(attachment.mimeType); } +/** Check whether an attachment is a supported audio type. */ +export function isSupportedAudio(attachment: Attachment): boolean { + return SUPPORTED_AUDIO_TYPES.has(attachment.mimeType); +} + +/** Convert MIME type to file extension. */ +export function mimeToExtension(mime: string): string { + const map: Record = { + 'audio/ogg': 'ogg', + 'audio/mpeg': 'mp3', + 'audio/mp3': 'mp3', + 'audio/wav': 'wav', + 'audio/webm': 'webm', + 'audio/mp4': 'm4a', + 'audio/x-m4a': 'm4a', + }; + return map[mime] ?? 'bin'; +} + /** Convert a channel Attachment to a model ImageSource. Prefers base64 data, falls back to URL. 
*/ export function attachmentToImageSource(attachment: Attachment): ImageSource | null { if (!isSupportedImage(attachment)) { @@ -90,6 +120,103 @@ export function getMessageText(message: Message): string { .join(''); } +/** Configuration for audio transcription via Whisper-compatible API. */ +export interface AudioTranscriptionConfig { + /** Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions") */ + endpoint?: string; + /** API key for the transcription service */ + apiKey?: string; + /** Model name (default: "whisper-1") */ + model?: string; +} + +/** + * Transcribe an audio attachment to text using the OpenAI-compatible /v1/audio/transcriptions API. + * Falls back to a placeholder message if no transcription endpoint is configured. + */ +export async function transcribeAudio( + attachment: Attachment, + config?: AudioTranscriptionConfig, +): Promise { + if (!config?.endpoint) { + return '[Audio message received but no transcription service is configured]'; + } + + try { + const audioBuffer = Buffer.from(attachment.data!, 'base64'); + const ext = mimeToExtension(attachment.mimeType); + const formData = new FormData(); + formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`); + formData.append('model', config.model ?? 'whisper-1'); + + const headers: Record = {}; + if (config.apiKey) { + headers['Authorization'] = `Bearer ${config.apiKey}`; + } + + const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers }); + if (!res.ok) { + throw new Error(`Transcription failed: ${res.status} ${res.statusText}`); + } + const json = await res.json() as { text: string }; + return json.text; + } catch (error) { + console.error( + `Failed to transcribe audio (${attachment.mimeType}):`, + error instanceof Error ? 
error.message : 'Unknown error', + ); + return '[Audio message transcription failed]'; + } +} + +/** + * Build a multimodal Message from text + attachments, with optional audio transcription. + * Audio attachments are transcribed to text and prepended to the message. + * Image attachments are converted to content parts as before. + */ +export async function buildUserMessageWithAudio( + text: string, + attachments?: Attachment[], + audioConfig?: AudioTranscriptionConfig, +): Promise { + const imageParts: MessageContentPart[] = []; + + // Separate image and audio attachments + const imageAttachments = (attachments ?? []).filter(a => isSupportedImage(a)); + const audioAttachments = (attachments ?? []).filter(a => isSupportedAudio(a)); + + // Transcribe audio attachments and prepend to text (only if config is provided) + let processedText = text; + if (audioConfig?.endpoint) { + for (const audioAttachment of audioAttachments) { + const transcript = await transcribeAudio(audioAttachment, audioConfig); + processedText = `[Voice message]: ${transcript}\n\n${processedText}`; + } + } + + // Convert image attachments to content parts + for (const att of imageAttachments) { + const source = attachmentToImageSource(att); + if (source) { + imageParts.push({ type: 'image', source }); + } + } + + // No images or audio — return simple text message + if (imageParts.length === 0) { + return { role: 'user', content: processedText }; + } + + // Build multimodal content: text first, then images + const parts: MessageContentPart[] = []; + if (processedText) { + parts.push({ type: 'text', text: processedText }); + } + parts.push(...imageParts); + + return { role: 'user', content: parts }; +} + /** * Check whether a message contains image content parts. */