feat(audio): add audio.transcribe tool with Whisper-compatible API support
- Add createAudioTranscribeTool with OpenAI/Groq/Ollama/llama.cpp provider support
- Refactor audio config schema to nested audio.enabled + audio.provider structure
- Move audio tool registration to initTools() for conditional enablement
- Fix duplication bug in audio-transcribe.ts URL download handler
- Support base64 data and URL-based audio input with format detection
This commit is contained in:
@@ -314,10 +314,16 @@ const webSearchSchema = z.object({
|
|||||||
max_results: z.number().min(1).max(20).default(5),
|
max_results: z.number().min(1).max(20).default(5),
|
||||||
}).default({});
|
}).default({});
|
||||||
|
|
||||||
|
// Backend kinds that may be named in the `type` field of the audio provider config.
const AUDIO_PROVIDER_TYPES = ['openai', 'groq', 'ollama', 'llamacpp', 'custom'] as const;

// Settings for the transcription provider: `type` is mandatory, while
// `endpoint`, `api_key` and `model` are optional strings with no defaults here.
const audioProviderSchema = z.object({
  type: z.enum(AUDIO_PROVIDER_TYPES),
  endpoint: z.string().optional(),
  api_key: z.string().optional(),
  model: z.string().optional(),
});
|
||||||
|
|
||||||
const audioSchema = z.object({
|
const audioSchema = z.object({
|
||||||
transcription_endpoint: z.string().optional(),
|
enabled: z.boolean().default(false),
|
||||||
transcription_api_key: z.string().optional(),
|
provider: audioProviderSchema.optional(),
|
||||||
transcription_model: z.string().default('whisper-1'),
|
|
||||||
}).default({});
|
}).default({});
|
||||||
|
|
||||||
// ── Tool policy schemas ──────────────────────────────────────────────
|
// ── Tool policy schemas ──────────────────────────────────────────────
|
||||||
|
|||||||
+1
-8
@@ -5,7 +5,6 @@ import { mkdirSync } from 'fs';
|
|||||||
|
|
||||||
// ── Config & Types ──
|
// ── Config & Types ──
|
||||||
import type { Config } from '../config/index.js';
|
import type { Config } from '../config/index.js';
|
||||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
|
||||||
import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js';
|
import type { ToolRegistry, ToolExecutor, BrowserManager } from '../tools/index.js';
|
||||||
import type { AgentConfigRegistry, AgentRouter } from '../agents/index.js';
|
import type { AgentConfigRegistry, AgentRouter } from '../agents/index.js';
|
||||||
import type { SandboxManager } from '../sandbox/index.js';
|
import type { SandboxManager } from '../sandbox/index.js';
|
||||||
@@ -100,12 +99,6 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
|||||||
const { skillRegistry, skillInstaller } = initSkills(config);
|
const { skillRegistry, skillInstaller } = initSkills(config);
|
||||||
const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle });
|
const { agentConfigRegistry, agentRouter, sandboxManager } = await initAgents({ config, lifecycle });
|
||||||
|
|
||||||
// ── Model & Prompt ──
|
|
||||||
const audioConfig: AudioTranscriptionConfig = {
|
|
||||||
endpoint: config.audio.transcription_endpoint,
|
|
||||||
apiKey: config.audio.transcription_api_key,
|
|
||||||
model: config.audio.transcription_model,
|
|
||||||
};
|
|
||||||
const modelRouter = createModelRouter(config);
|
const modelRouter = createModelRouter(config);
|
||||||
|
|
||||||
// Restore persisted model tier
|
// Restore persisted model tier
|
||||||
@@ -133,7 +126,7 @@ export async function startDaemon(config: Config): Promise<DaemonContext> {
|
|||||||
|
|
||||||
const messageRouter = createMessageRouter({
|
const messageRouter = createMessageRouter({
|
||||||
sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor,
|
sessionManager, modelRouter, systemPrompt, toolRegistry, toolExecutor,
|
||||||
config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager, audioConfig,
|
config, memoryStore, agentConfigRegistry, agentRouter, sandboxManager,
|
||||||
});
|
});
|
||||||
channelRegistry.setMessageHandler(messageRouter.handler);
|
channelRegistry.setMessageHandler(messageRouter.handler);
|
||||||
channelAgents = messageRouter.agents;
|
channelAgents = messageRouter.agents;
|
||||||
|
|||||||
+14
-1
@@ -1,7 +1,8 @@
|
|||||||
import type { Config } from '../config/index.js';
|
import type { Config } from '../config/index.js';
|
||||||
import type { Lifecycle } from './lifecycle.js';
|
import type { Lifecycle } from './lifecycle.js';
|
||||||
|
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||||
import { HookEngine } from '../hooks/index.js';
|
import { HookEngine } from '../hooks/index.js';
|
||||||
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools } from '../tools/index.js';
|
import { ToolRegistry, ToolExecutor, ToolPolicy, allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createAudioTranscribeTool } from '../tools/index.js';
|
||||||
|
|
||||||
export interface ToolsDeps {
|
export interface ToolsDeps {
|
||||||
config: Config;
|
config: Config;
|
||||||
@@ -52,6 +53,18 @@ export function initTools(deps: ToolsDeps): ToolsResult {
|
|||||||
console.log('Process manager stopped');
|
console.log('Process manager stopped');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Register audio transcription tool if configured
|
||||||
|
if (config.audio?.enabled && config.audio.provider) {
|
||||||
|
const audioConfig: AudioTranscriptionConfig = {
|
||||||
|
endpoint: config.audio.provider.endpoint,
|
||||||
|
apiKey: config.audio.provider.api_key,
|
||||||
|
model: config.audio.provider.model,
|
||||||
|
};
|
||||||
|
const audioTool = createAudioTranscribeTool(audioConfig);
|
||||||
|
toolRegistry.register(audioTool);
|
||||||
|
console.log(`Audio transcription enabled (type=${config.audio.provider.type}, endpoint=${audioConfig.endpoint})`);
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize browser manager and register browser tools (if enabled)
|
// Initialize browser manager and register browser tools (if enabled)
|
||||||
let browserManager: BrowserManager | undefined;
|
let browserManager: BrowserManager | undefined;
|
||||||
if (config.browser?.enabled) {
|
if (config.browser?.enabled) {
|
||||||
|
|||||||
@@ -0,0 +1,214 @@
|
|||||||
|
import type { Tool, ToolResult } from '../types.js';
|
||||||
|
|
||||||
|
/** Arguments accepted by the `audio.transcribe` tool. */
interface AudioTranscribeArgs {
  /** Base64-encoded audio payload; mutually exclusive with `url`. */
  data?: string;
  /** Location the audio is downloaded from; mutually exclusive with `data`. */
  url?: string;
  /** MIME type of the audio; required when `data` is used. */
  mime_type?: string;
  /** Optional language code forwarded to the transcription request. */
  language?: string;
  /** Optional prompt text forwarded to the transcription request. */
  prompt?: string;
}

/** MIME types accepted for the `mime_type` argument. */
const SUPPORTED_MIME_TYPES = new Set([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);

/** Transcription endpoint URLs keyed by provider name. */
const PROVIDER_ENDPOINTS: Record<string, string> = {
  openai: 'https://api.openai.com/v1/audio/transcriptions',
  groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
  ollama: 'http://localhost:11434/api/generate',
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};

/**
 * Checks that the tool arguments name exactly one audio source and, when a
 * MIME type is given, that it is one of `SUPPORTED_MIME_TYPES`.
 *
 * The arguments reach this function through an unchecked cast from `unknown`,
 * so the declared types are not trusted: `null` and `''` both count as
 * "not provided", and a provided `data` or `url` must actually be a string.
 *
 * @param args - Tool arguments as received from the caller.
 * @returns `{ valid: true }` on success, otherwise `{ valid: false, error }`
 *          with a human-readable reason.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  if (typeof args !== 'object' || args === null) {
    return { valid: false, error: 'Either data or url must be provided' };
  }

  // Re-view the fields as unknown so the runtime checks below are not optimised away by the declared types.
  const fields: { data?: unknown; url?: unknown; mime_type?: unknown } = args;
  const isProvided = (value: unknown): boolean =>
    value !== undefined && value !== null && value !== '';

  const hasData = isProvided(fields.data);
  const hasUrl = isProvided(fields.url);

  if (!hasData && !hasUrl) {
    return { valid: false, error: 'Either data or url must be provided' };
  }

  if (hasData && hasUrl) {
    return { valid: false, error: 'Only one of data or url can be provided' };
  }

  if (hasData && typeof fields.data !== 'string') {
    return { valid: false, error: 'data must be a base64-encoded string' };
  }

  if (hasUrl && typeof fields.url !== 'string') {
    return { valid: false, error: 'url must be a string' };
  }

  if (hasData && !fields.mime_type) {
    return { valid: false, error: 'mime_type is required when using data' };
  }

  if (fields.mime_type) {
    const supported =
      typeof fields.mime_type === 'string' && SUPPORTED_MIME_TYPES.has(fields.mime_type);
    if (!supported) {
      return {
        valid: false,
        error: `Unsupported MIME type: ${String(fields.mime_type)}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}`,
      };
    }
  }

  return { valid: true };
}
|
||||||
|
|
||||||
|
/** Connection settings for the transcription service used by the tool. */
interface AudioTranscriptionConfig {
  /** URL the transcription request is POSTed to. */
  endpoint?: string;
  /** When set, sent as a Bearer token in the Authorization header. */
  apiKey?: string;
  /** Model name sent with the request; `whisper-1` is used when omitted. */
  model?: string;
}

/** File extension used for the upload filename, keyed by MIME type. */
const MIME_EXTENSIONS: Record<string, string> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

/** Returns the extension of the URL's path (no dot), or undefined when the path has none. */
function extensionFromUrl(url: string): string | undefined {
  let pathname: string;
  try {
    // Only the path is inspected so dots in the host or query string are ignored.
    pathname = new URL(url).pathname;
  } catch {
    return undefined;
  }
  return /\.([A-Za-z0-9]+)$/.exec(pathname)?.[1];
}

/** Builds the upload blob and filename from inline base64 data or from a downloaded URL. */
async function loadAudio(args: AudioTranscribeArgs): Promise<{ blob: Blob; filename: string }> {
  if (typeof args.data === 'string' && args.data !== '') {
    // Copy the decoded bytes: a small Buffer is a view into a shared pool, so
    // its `.buffer` would drag unrelated pool contents into the upload.
    const bytes = new Uint8Array(Buffer.from(args.data, 'base64'));
    const mimeType = args.mime_type ?? 'audio/wav';
    const ext = MIME_EXTENSIONS[mimeType] ?? 'bin';
    return { blob: new Blob([bytes], { type: mimeType }), filename: `audio.${ext}` };
  }

  if (typeof args.url === 'string' && args.url !== '') {
    const download = await fetch(args.url);
    if (!download.ok) {
      throw new Error(`Failed to download audio from ${args.url}: ${download.status} ${download.statusText}`);
    }
    const bytes = await download.arrayBuffer();

    // Prefer the caller's MIME type, then what the server reported, then the previous fixed default.
    const mimeType = args.mime_type ?? download.headers.get('content-type') ?? 'audio/wav';
    const baseMime = (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();
    const ext = extensionFromUrl(args.url) ?? MIME_EXTENSIONS[baseMime] ?? 'bin';
    return { blob: new Blob([bytes], { type: mimeType }), filename: `audio.${ext}` };
  }

  throw new Error('No usable audio source: data or url must be a non-empty string');
}

/**
 * Creates the `audio.transcribe` tool.
 *
 * The tool validates its arguments, obtains the audio (decoding base64 `data`
 * or downloading `url`), and POSTs it as multipart form data (`file`, `model`,
 * optional `language` and `prompt`) to the configured endpoint, returning the
 * `text` field of the JSON reply. When the endpoint equals the built-in Ollama
 * URL, a JSON request is sent instead and the reply's `response` field is returned.
 *
 * @param audioConfig - Endpoint, API key and model for the transcription service.
 * @returns A tool whose `execute` never rejects; failures are reported as
 *          `{ success: false, output: '', error }`.
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },

    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // A non-object payload is handled as "no arguments" so validation reports it instead of throwing.
      const args: AudioTranscribeArgs =
        typeof rawArgs === 'object' && rawArgs !== null ? (rawArgs as AudioTranscribeArgs) : {};

      const validation = validateInput(args);
      if (!validation.valid) {
        return { success: false, output: '', error: validation.error };
      }

      const endpoint = audioConfig?.endpoint;
      if (!endpoint) {
        return {
          success: false,
          output: '',
          error: 'Audio transcription endpoint not configured. Set audio.provider.endpoint in config.yaml',
        };
      }

      try {
        const { blob, filename } = await loadAudio(args);
        const model = audioConfig?.model ?? 'whisper-1';

        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          // NOTE(review): for url input this sends the URL string itself, not the
          // downloaded bytes — confirm what the Ollama endpoint expects in `audio`.
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model,
              audio: args.data ?? args.url,
              stream: false,
            }),
          });

          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }

          const ollamaJson = (await ollamaResponse.json()) as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        const formData = new FormData();
        formData.append('file', blob, filename);
        formData.append('model', model);
        if (args.language) {
          formData.append('language', args.language);
        }
        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // No Content-Type header is set here: fetch derives the multipart boundary from the FormData body.
        const fetchOptions: RequestInit = { method: 'POST', body: formData };
        if (audioConfig?.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetch(endpoint, fetchOptions);
        if (!response.ok) {
          const errorText = await response.text();
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        const json = (await response.json()) as { text?: unknown };
        if (typeof json.text !== 'string') {
          throw new Error('Transcription response did not contain a text field');
        }
        return { success: true, output: json.text };
      } catch (error) {
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : 'Unknown error occurred',
        };
      }
    },
  };
}
|
||||||
@@ -8,6 +8,7 @@ export { systemInfoTool } from './system-info.js';
|
|||||||
export { webFetchTool } from './web-fetch.js';
|
export { webFetchTool } from './web-fetch.js';
|
||||||
export { createMediaSendTool } from './media-send.js';
|
export { createMediaSendTool } from './media-send.js';
|
||||||
export { createImageAnalyzeTool } from './image-analyze.js';
|
export { createImageAnalyzeTool } from './image-analyze.js';
|
||||||
|
export { createAudioTranscribeTool } from './audio-transcribe.js';
|
||||||
export { createMemoryReadTool } from './memory-read.js';
|
export { createMemoryReadTool } from './memory-read.js';
|
||||||
export { createMemoryWriteTool } from './memory-write.js';
|
export { createMemoryWriteTool } from './memory-write.js';
|
||||||
export { createMemorySearchTool } from './memory-search.js';
|
export { createMemorySearchTool } from './memory-search.js';
|
||||||
|
|||||||
+1
-1
@@ -5,7 +5,7 @@ export { ToolExecutor } from './executor.js';
|
|||||||
export type { ToolExecutorConfig } from './executor.js';
|
export type { ToolExecutorConfig } from './executor.js';
|
||||||
export { ToolPolicy } from './policy.js';
|
export { ToolPolicy } from './policy.js';
|
||||||
export type { ToolPolicyContext } from './policy.js';
|
export type { ToolPolicyContext } from './policy.js';
|
||||||
export { allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool, createSessionTools, createAgentsListTool, createMessageSendTool, createCronTools, createGmailTools, createGcalTools, createGdocsTools, createGdriveTools, createGtasksTools } from './builtin/index.js';
|
export { allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool, createAudioTranscribeTool, createSessionTools, createAgentsListTool, createMessageSendTool, createCronTools, createGmailTools, createGcalTools, createGdocsTools, createGdriveTools, createGtasksTools } from './builtin/index.js';
|
||||||
export type { WebSearchConfig } from './builtin/web-search.js';
|
export type { WebSearchConfig } from './builtin/web-search.js';
|
||||||
export type { ProcessManagerConfig } from './builtin/process/index.js';
|
export type { ProcessManagerConfig } from './builtin/process/index.js';
|
||||||
export type { BrowserManagerConfig } from './builtin/browser/index.js';
|
export type { BrowserManagerConfig } from './builtin/browser/index.js';
|
||||||
|
|||||||
Reference in New Issue
Block a user