diff --git a/src/config/schema.ts b/src/config/schema.ts index a043e62..cded816 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -314,10 +314,16 @@ const webSearchSchema = z.object({ max_results: z.number().min(1).max(20).default(5), }).default({}); +const audioProviderSchema = z.object({ + type: z.enum(['openai', 'groq', 'ollama', 'llamacpp', 'custom']), + endpoint: z.string().optional(), + api_key: z.string().optional(), + model: z.string().optional(), +}); + const audioSchema = z.object({ - transcription_endpoint: z.string().optional(), - transcription_api_key: z.string().optional(), - transcription_model: z.string().default('whisper-1'), + enabled: z.boolean().default(false), + provider: audioProviderSchema.optional(), }).default({}); // ── Tool policy schemas ────────────────────────────────────────────── diff --git a/src/daemon/index.ts b/src/daemon/index.ts index 8737b06..8d40cbb 100644 --- a/src/daemon/index.ts +++ b/src/daemon/index.ts @@ -5,7 +5,6 @@ import { mkdirSync } from 'fs'; // ── Config & Types ── import type { Config } from '../config/index.js'; -import type { AudioTranscriptionConfig } from '../models/media.js'; import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js'; import type { AgentConfigRegistry, AgentRouter } from '../agents/index.js'; import type { SandboxManager } from '../sandbox/index.js'; @@ -100,12 +99,6 @@ export async function startDaemon(config: Config): Promise { const { skillRegistry, skillInstaller } = initSkills(config); const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle }); - // ── Model & Prompt ── - const audioConfig: AudioTranscriptionConfig = { - endpoint: config.audio.transcription_endpoint, - apiKey: config.audio.transcription_api_key, - model: config.audio.transcription_model, - }; const modelRouter = createModelRouter(config); // Restore persisted model tier @@ -133,7 +126,7 @@ export async function startDaemon(config: Config): 
Promise { const messageRouter = createMessageRouter({ sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor, - config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, audioConfig, + config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, }); channelRegistry.setMessageHandler(messageRouter.handler); channelAgents = messageRouter.agents; diff --git a/src/daemon/tools.ts b/src/daemon/tools.ts index 0c55d9a..ab81304 100644 --- a/src/daemon/tools.ts +++ b/src/daemon/tools.ts @@ -1,7 +1,8 @@ import type { Config } from '../config/index.js'; import type { Lifecycle } from './lifecycle.js'; +import type { AudioTranscriptionConfig } from '../models/media.js'; import { HookEngine } from '../hooks/index.js'; -import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js'; +import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createAudioTranscribeTool } from '../tools/index.js'; export interface ToolsDeps { config: Config; @@ -52,6 +53,18 @@ export function initTools(deps: ToolsDeps): ToolsResult { console.log('Process manager stopped'); }); + // Register audio transcription tool if configured + if (config.audio?.enabled && config.audio.provider) { + const audioConfig: AudioTranscriptionConfig = { + endpoint: config.audio.provider.endpoint, + apiKey: config.audio.provider.api_key, + model: config.audio.provider.model, + }; + const audioTool = createAudioTranscribeTool(audioConfig); + toolRegistry.register(audioTool); + console.log(`Audio transcription enabled (type=${config.audio.provider.type}, endpoint=${audioConfig.endpoint})`); + } + // Initialize browser manager and register browser tools (if enabled) let browserManager: BrowserManager | undefined; if (config.browser?.enabled) { diff --git 
// File: src/tools/builtin/audio-transcribe.ts (added by this change).
// `createAudioTranscribeTool` is re-exported through src/tools/builtin/index.ts
// and src/tools/index.ts alongside the other builtin tool factories.

import type { Tool, ToolResult } from '../types.js';

/** Arguments accepted by the `audio.transcribe` tool. */
interface AudioTranscribeArgs {
  data?: string;
  url?: string;
  mime_type?: string;
  language?: string;
  prompt?: string;
}

/** Connection settings for the transcription backend. */
interface AudioTranscriptionConfig {
  endpoint?: string;
  apiKey?: string;
  model?: string;
}

/** Audio payload ready to be attached to a multipart upload. */
interface PreparedAudio {
  blob: Blob;
  filename: string;
}

/** Supported MIME types mapped to the file extension used for the upload filename. */
const MIME_EXTENSIONS: Record<string, string> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

// Derived from the map above so the two lists cannot drift apart.
const SUPPORTED_MIME_TYPES = new Set<string>(Object.keys(MIME_EXTENSIONS));

const PROVIDER_ENDPOINTS: Record<string, string> = {
  openai: 'https://api.openai.com/v1/audio/transcriptions',
  groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
  ollama: 'http://localhost:11434/api/generate',
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};

const FALLBACK_MIME_TYPE = 'audio/wav';
const FALLBACK_EXTENSION = 'bin';

/**
 * Checks that exactly one audio source is supplied and that any MIME type is supported.
 *
 * Empty strings count as "not provided" for both `data` and `url`.
 *
 * @returns `{ valid: true }`, or `{ valid: false, error }` describing the first problem found.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  const hasData = args.data !== undefined && args.data !== '';
  const hasUrl = args.url !== undefined && args.url !== '';

  if (!hasData && !hasUrl) {
    return { valid: false, error: 'Either data or url must be provided' };
  }

  if (hasData && hasUrl) {
    return { valid: false, error: 'Only one of data or url can be provided' };
  }

  if (hasData && !args.mime_type) {
    return { valid: false, error: 'mime_type is required when using data' };
  }

  if (args.mime_type && !SUPPORTED_MIME_TYPES.has(args.mime_type)) {
    return { valid: false, error: `Unsupported MIME type: ${args.mime_type}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}` };
  }

  return { valid: true };
}

/**
 * Decodes base64 audio into a Blob that holds exactly the decoded bytes.
 *
 * @throws Error when the input decodes to zero bytes.
 */
function decodeAudioData(data: string, mimeType: string): PreparedAudio {
  const decoded = Buffer.from(data, 'base64');
  if (decoded.byteLength === 0) {
    throw new Error('Audio data is not valid base64 or is empty');
  }

  // Copy into a right-sized array. `decoded.buffer` may be Node's shared pool
  // slab, which is larger than the payload and contains unrelated bytes.
  const bytes = new Uint8Array(decoded);
  const ext = MIME_EXTENSIONS[mimeType] ?? FALLBACK_EXTENSION;
  return { blob: new Blob([bytes], { type: mimeType }), filename: `audio.${ext}` };
}

/**
 * Extracts the file extension from the last path segment of a URL.
 *
 * Query string and fragment are ignored. Returns `'bin'` when the URL cannot
 * be parsed or the path has no usable extension.
 */
function extensionFromUrl(rawUrl: string): string {
  let pathname: string;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    return FALLBACK_EXTENSION;
  }

  const lastSegment = pathname.slice(pathname.lastIndexOf('/') + 1);
  const dot = lastSegment.lastIndexOf('.');
  if (dot <= 0 || dot === lastSegment.length - 1) {
    return FALLBACK_EXTENSION;
  }

  const ext = lastSegment.slice(dot + 1).toLowerCase();
  return /^[a-z0-9]+$/.test(ext) ? ext : FALLBACK_EXTENSION;
}

/**
 * Picks the MIME type for downloaded audio: the caller-declared type wins,
 * then a supported `Content-Type` header (parameters stripped), else `undefined`.
 */
function resolveDownloadMimeType(declared: string | undefined, contentType: string | null): string | undefined {
  if (declared) {
    return declared;
  }
  const headerType = contentType?.split(';')[0]?.trim().toLowerCase();
  return headerType && SUPPORTED_MIME_TYPES.has(headerType) ? headerType : undefined;
}

/**
 * Downloads audio from `url` and wraps it for upload.
 *
 * NOTE(security): `url` comes from tool arguments and is fetched as-is, so it
 * can reach any host the process can reach. Consider restricting it if the
 * tool is exposed to untrusted callers.
 *
 * @throws Error when the download responds with a non-2xx status.
 */
async function downloadAudio(url: string, declaredMimeType: string | undefined): Promise<PreparedAudio> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download audio from ${url}: ${response.status} ${response.statusText}`);
  }
  const bytes = await response.arrayBuffer();

  const knownMimeType = resolveDownloadMimeType(declaredMimeType, response.headers.get('content-type'));
  let ext = extensionFromUrl(url);
  if (ext === FALLBACK_EXTENSION && knownMimeType) {
    ext = MIME_EXTENSIONS[knownMimeType] ?? FALLBACK_EXTENSION;
  }

  return {
    blob: new Blob([bytes], { type: knownMimeType ?? FALLBACK_MIME_TYPE }),
    filename: `audio.${ext}`,
  };
}

/**
 * Builds the `audio.transcribe` tool.
 *
 * The tool accepts either base64 audio (`data` + `mime_type`) or a `url`, and
 * posts it as multipart form data to `audioConfig.endpoint`. When the endpoint
 * equals the default Ollama endpoint it sends a JSON request instead. The tool
 * never throws: every failure is returned as `{ success: false, error }`.
 *
 * @param audioConfig Endpoint, optional bearer API key, and model (defaults to `whisper-1`).
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  const failure = (error: string | undefined): ToolResult => ({
    success: false,
    output: '',
    error,
  });

  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },

    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // Guard before touching properties so null/primitive input is reported, not thrown.
      if (typeof rawArgs !== 'object' || rawArgs === null) {
        return failure('Arguments must be an object');
      }
      const args = rawArgs as AudioTranscribeArgs;

      const validation = validateInput(args);
      if (!validation.valid) {
        return failure(validation.error);
      }

      const endpoint = audioConfig?.endpoint;
      if (!endpoint) {
        return failure('Audio transcription endpoint not configured. Set audio.provider.endpoint in config.yaml');
      }

      const model = audioConfig?.model ?? 'whisper-1';

      try {
        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          // Handled before any download: this path sends the raw base64 string
          // or the URL itself, never the downloaded bytes.
          // NOTE(review): confirm the Ollama endpoint accepts this payload shape.
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model,
              // `||` rather than `??`: an empty `data` string counts as absent.
              audio: args.data || args.url,
              stream: false,
            }),
          });

          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }

          const ollamaJson = (await ollamaResponse.json()) as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        // Validation guarantees exactly one of data/url is a non-empty string,
        // and that mime_type is present when data is used.
        const audio = args.data
          ? decodeAudioData(args.data, args.mime_type ?? FALLBACK_MIME_TYPE)
          : await downloadAudio(args.url ?? '', args.mime_type);

        const formData = new FormData();
        formData.append('file', audio.blob, audio.filename);
        formData.append('model', model);

        if (args.language) {
          formData.append('language', args.language);
        }

        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // No Content-Type header here: fetch must generate the multipart boundary itself.
        const fetchOptions: RequestInit = {
          method: 'POST',
          body: formData,
        };
        if (audioConfig?.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetch(endpoint, fetchOptions);

        if (!response.ok) {
          const errorText = await response.text();
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        const json = (await response.json()) as { text?: unknown };
        if (typeof json.text !== 'string') {
          throw new Error('Transcription response did not contain a text field');
        }

        return {
          success: true,
          output: json.text,
        };
      } catch (error) {
        return failure(error instanceof Error ? error.message : 'Unknown error occurred');
      }
    },
  };
}