feat: add audio transcription pipeline for voice messages
Adds Whisper-compatible audio transcription via a configurable endpoint. New functions: isSupportedAudio(), mimeToExtension(), transcribeAudio(), buildUserMessageWithAudio(). The config schema gains an audio section with transcription_endpoint, transcription_api_key, and transcription_model. The daemon wires transcription into the message router. Channel adapters extract audio from voice/audio messages (Telegram voice+audio, Discord audio/*, Slack audio/*, WhatsApp ptt+audio). Includes 57 media tests (was 25, now covers all audio paths).
This commit is contained in:
@@ -180,6 +180,12 @@ const webSearchSchema = z.object({
|
||||
max_results: z.number().min(1).max(20).default(5),
|
||||
}).default({});
|
||||
|
||||
// Audio transcription settings; used as the `audio` key of the top-level config schema.
// The whole section falls back to `{}` when it is omitted from the input.
const audioSchema = z.object({
|
||||
// Optional string; no default is supplied, so it may be undefined after parsing.
transcription_endpoint: z.string().optional(),
|
||||
// Optional string; no default is supplied, so it may be undefined after parsing.
transcription_api_key: z.string().optional(),
|
||||
// Model name string; defaults to 'whisper-1' when not provided.
transcription_model: z.string().default('whisper-1'),
|
||||
}).default({});
|
||||
|
||||
// ── Tool policy schemas ──────────────────────────────────────────────
|
||||
|
||||
const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
|
||||
@@ -259,6 +265,7 @@ export const configSchema = z.object({
|
||||
browser: browserSchema,
|
||||
retry: retrySchema,
|
||||
web_search: webSearchSchema,
|
||||
audio: audioSchema,
|
||||
prompt: promptSchema,
|
||||
tools: toolsSchema,
|
||||
sandbox: sandboxSchema,
|
||||
@@ -274,6 +281,7 @@ export type AgentsConfig = z.infer<typeof agentsSchema>;
|
||||
export type CompactionConfig = z.infer<typeof compactionSchema>;
|
||||
export type MemoryConfig = z.infer<typeof memorySchema>;
|
||||
export type WebSearchConfig = z.infer<typeof webSearchSchema>;
|
||||
export type AudioConfig = z.infer<typeof audioSchema>;
|
||||
export type ProcessConfig = z.infer<typeof processSchema>;
|
||||
export type BrowserConfig = z.infer<typeof browserSchema>;
|
||||
export type DiscordConfig = z.infer<typeof discordSchema>;
|
||||
|
||||
Reference in New Issue
Block a user