feat(audio): add audio.transcribe tool with Whisper-compatible API support

- Add createAudioTranscribeTool with OpenAI/Groq/Ollama/llama.cpp provider support
- Refactor audio config schema to nested audio.enabled + audio.provider structure
- Move audio tool registration to initTools() for conditional enablement
- Fix duplication bug in audio-transcribe.ts URL download handler
- Support base64 data and URL-based audio input with format detection
This commit is contained in:
William Valentin
2026-02-11 18:13:19 -08:00
parent 5491d5a82a
commit a875bcc4ae
6 changed files with 240 additions and 13 deletions
+1 -8
View File
@@ -5,7 +5,6 @@ import { mkdirSync } from 'fs';
// ── Config & Types ──
import type { Config } from '../config/index.js';
import type { AudioTranscriptionConfig } from '../models/media.js';
import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js';
import type { AgentConfigRegistry, AgentRouter } from '../agents/index.js';
import type { SandboxManager } from '../sandbox/index.js';
@@ -100,12 +99,6 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
const { skillRegistry, skillInstaller } = initSkills(config);
const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle });
// ── Model & Prompt ──
const audioConfig: AudioTranscriptionConfig = {
endpoint: config.audio.transcription_endpoint,
apiKey: config.audio.transcription_api_key,
model: config.audio.transcription_model,
};
const modelRouter = createModelRouter(config);
// Restore persisted model tier
@@ -133,7 +126,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
const messageRouter = createMessageRouter({
sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor,
config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, audioConfig,
config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager,
});
channelRegistry.setMessageHandler(messageRouter.handler);
channelAgents = messageRouter.agents;
+14 -1
View File
@@ -1,7 +1,8 @@
import type { Config } from '../config/index.js';
import type { Lifecycle } from './lifecycle.js';
import type { AudioTranscriptionConfig } from '../models/media.js';
import { HookEngine } from '../hooks/index.js';
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createAudioTranscribeTool } from '../tools/index.js';
export interface ToolsDeps {
config: Config;
@@ -52,6 +53,18 @@ export function initTools(deps: ToolsDeps): ToolsResult {
console.log('Process manager stopped');
});
// Register the audio transcription tool, but only when audio is enabled
// and a provider block is present in the config; otherwise nothing is
// registered by this step.
if (config.audio?.enabled && config.audio.provider) {
// Map the provider settings (snake_case `api_key`) onto the
// AudioTranscriptionConfig fields (camelCase `apiKey`).
const audioConfig: AudioTranscriptionConfig = {
endpoint: config.audio.provider.endpoint,
apiKey: config.audio.provider.api_key,
model: config.audio.provider.model,
};
const audioTool = createAudioTranscribeTool(audioConfig);
toolRegistry.register(audioTool);
// The log line reports only the provider type and endpoint; the API key
// is not part of the message.
console.log(`Audio transcription enabled (type=${config.audio.provider.type}, endpoint=${audioConfig.endpoint})`);
}
// Initialize browser manager and register browser tools (if enabled)
let browserManager: BrowserManager | undefined;
if (config.browser?.enabled) {