feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via configurable endpoint.
New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(),
buildUserMessageWithAudio(). The config schema gains an audio section with
transcription_endpoint, transcription_api_key, and transcription_model. The daemon wires transcription
into the message router. Channel adapters extract audio from voice/audio
messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp
ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
This commit is contained in:
William Valentin
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
+8
View File
@@ -180,6 +180,12 @@ const webSearchSchema = z.object({
max_results: z.number().min(1).max(20).default(5),
}).default({});
/** Model name used for transcription when the config does not specify one. */
const DEFAULT_TRANSCRIPTION_MODEL = 'whisper-1';

/**
 * Schema for the `audio` config section (audio transcription settings).
 *
 * Both the endpoint and the API key may be omitted; only the model has a
 * default. The section as a whole falls back to `{}` when it is absent.
 */
const audioSchema = z
  .object({
    /** Transcription endpoint; optional. */
    transcription_endpoint: z.string().optional(),
    /** API key for the transcription endpoint; optional. */
    transcription_api_key: z.string().optional(),
    /** Transcription model name; defaults to `whisper-1`. */
    transcription_model: z.string().default(DEFAULT_TRANSCRIPTION_MODEL),
  })
  .default({});
// ── Tool policy schemas ──────────────────────────────────────────────
/** Zod enum of the recognised tool profile names. */
const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
@@ -259,6 +265,7 @@ export const configSchema = z.object({
browser: browserSchema,
retry: retrySchema,
web_search: webSearchSchema,
audio: audioSchema,
prompt: promptSchema,
tools: toolsSchema,
sandbox: sandboxSchema,
@@ -274,6 +281,7 @@ export type AgentsConfig = z.infer<typeof agentsSchema>;
// Static TypeScript types for individual config sections, each inferred from
// its Zod schema so the type and the validator cannot drift apart.
export type CompactionConfig = z.infer<typeof compactionSchema>;
export type MemoryConfig = z.infer<typeof memorySchema>;
export type WebSearchConfig = z.infer<typeof webSearchSchema>;
// Type of the `audio` section (transcription endpoint, API key, and model).
export type AudioConfig = z.infer<typeof audioSchema>;
export type ProcessConfig = z.infer<typeof processSchema>;
export type BrowserConfig = z.infer<typeof browserSchema>;
export type DiscordConfig = z.infer<typeof discordSchema>;