feat: add audio transcription pipeline for voice messages

Adds Whisper-compatible audio transcription via a configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). The config schema gains an audio section with transcription_endpoint, transcription_api_key, and transcription_model. The daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (up from 25; they now cover all audio paths).
2026-02-07 09:09:13 -08:00
parent e052778b0a
commit 2a962abcd0
4 changed files with 531 additions and 12 deletions
@@ -180,6 +180,12 @@ const webSearchSchema = z.object({
  max_results: z.number().min(1).max(20).default(5),
 }).default({});

+const audioSchema = z.object({ // `audio` config section: settings for the audio transcription pipeline
+  transcription_endpoint: z.string().optional(), // transcription endpoint (Whisper-compatible per the commit description); optional, no default
+  transcription_api_key: z.string().optional(), // API key for the transcription endpoint; optional, no default
+  transcription_model: z.string().default('whisper-1'), // model name; falls back to 'whisper-1' when not set
+}).default({}); // the whole section may be omitted from the config; it then defaults to {}
+
 // ── Tool policy schemas ──────────────────────────────────────────────

 const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
@@ -259,6 +265,7 @@ export const configSchema = z.object({
  browser: browserSchema,
  retry: retrySchema,
  web_search: webSearchSchema,
+  audio: audioSchema,
  prompt: promptSchema,
  tools: toolsSchema,
  sandbox: sandboxSchema,
@@ -274,6 +281,7 @@ export type AgentsConfig = z.infer<typeof agentsSchema>;
 export type CompactionConfig = z.infer<typeof compactionSchema>;
 export type MemoryConfig = z.infer<typeof memorySchema>;
 export type WebSearchConfig = z.infer<typeof webSearchSchema>;
+export type AudioConfig = z.infer<typeof audioSchema>;
 export type ProcessConfig = z.infer<typeof processSchema>;
 export type BrowserConfig = z.infer<typeof browserSchema>;
 export type DiscordConfig = z.infer<typeof discordSchema>;