feat(audio): add audio.transcribe tool with Whisper-compatible API support

- Add createAudioTranscribeTool with OpenAI/Groq/Ollama/llama.cpp provider support
- Refactor audio config schema to nested audio.enabled + audio.provider structure
- Move audio tool registration to initTools() for conditional enablement
- Fix duplication bug in audio-transcribe.ts URL download handler
- Support base64 data and URL-based audio input with format detection
This commit is contained in:
William Valentin
2026-02-11 18:13:19 -08:00
parent 5491d5a82a
commit a875bcc4ae
6 changed files with 240 additions and 13 deletions
+214
View File
@@ -0,0 +1,214 @@
import type { Tool, ToolResult } from '../types.js';
/**
 * Arguments accepted by the `audio.transcribe` tool.
 * Exactly one of `data` or `url` must be supplied (enforced by `validateInput`).
 */
interface AudioTranscribeArgs {
/** Base64-encoded audio content; mutually exclusive with `url`. */
data?: string;
/** Location the audio is downloaded from; mutually exclusive with `data`. */
url?: string;
/** MIME type of the audio; required alongside `data` and must be one of `SUPPORTED_MIME_TYPES`. */
mime_type?: string;
/** Optional language code, forwarded as the `language` form field. */
language?: string;
/** Optional guidance text, forwarded as the `prompt` form field. */
prompt?: string;
}
/**
 * MIME types accepted for the `mime_type` argument.
 * `validateInput` rejects any other value and lists this set in its error message.
 */
const SUPPORTED_MIME_TYPES = new Set([
'audio/ogg',
'audio/mpeg',
'audio/mp3',
'audio/wav',
'audio/webm',
'audio/mp4',
'audio/x-m4a',
]);
/**
 * Transcription endpoint URLs keyed by provider name.
 *
 * Within this file only the `ollama` entry is read: the tool compares the
 * configured endpoint against it by exact string equality to decide whether to
 * send a JSON request instead of a multipart upload.
 * NOTE(review): the other entries are presumably consumed by configuration
 * code elsewhere — confirm before removing any of them.
 */
const PROVIDER_ENDPOINTS: Record<string, string> = {
openai: 'https://api.openai.com/v1/audio/transcriptions',
groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
ollama: 'http://localhost:11434/api/generate',
llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};
/**
 * Checks that the tool arguments name exactly one audio source and that any
 * MIME type given is supported.
 *
 * The arguments reach this function through an unchecked cast of external
 * input, so nothing about their runtime shape is assumed: a missing or
 * non-object value is reported as "nothing provided", `null` fields are
 * treated as absent, and fields of the wrong type are rejected explicitly
 * instead of failing later with an opaque TypeError.
 *
 * @param args - Raw tool arguments.
 * @returns `{ valid: true }` when the arguments are usable, otherwise
 *   `{ valid: false, error }` with a human-readable reason.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  if (typeof args !== 'object' || args === null) {
    return { valid: false, error: 'Either data or url must be provided' };
  }

  // Reject values that are present but not strings (e.g. numbers coming from
  // loosely-typed JSON) before any string-specific logic runs.
  const stringFields = [
    ['data', args.data],
    ['url', args.url],
    ['mime_type', args.mime_type],
  ] as const;
  for (const [field, value] of stringFields) {
    if (value != null && typeof value !== 'string') {
      return { valid: false, error: `${field} must be a string` };
    }
  }

  const hasData = typeof args.data === 'string' && args.data !== '';
  const hasUrl = typeof args.url === 'string' && args.url !== '';

  if (!hasData && !hasUrl) {
    return { valid: false, error: 'Either data or url must be provided' };
  }
  if (hasData && hasUrl) {
    return { valid: false, error: 'Only one of data or url can be provided' };
  }
  // Raw bytes carry no filename, so the MIME type is the only format hint.
  if (hasData && !args.mime_type) {
    return { valid: false, error: 'mime_type is required when using data' };
  }
  if (args.mime_type && !SUPPORTED_MIME_TYPES.has(args.mime_type)) {
    return { valid: false, error: `Unsupported MIME type: ${args.mime_type}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}` };
  }
  return { valid: true };
}
/** Connection settings for the transcription backend used by `createAudioTranscribeTool`. */
interface AudioTranscriptionConfig {
/** URL the transcription request is POSTed to; the tool returns an error when it is unset. */
endpoint?: string;
/** When set, sent as an `Authorization: Bearer …` header on the multipart request. */
apiKey?: string;
/** Model name sent with the request; the tool falls back to `whisper-1` when omitted. */
model?: string;
}
/** File extension used for the upload filename, keyed by audio MIME type. */
const AUDIO_EXTENSION_BY_MIME: Readonly<Record<string, string>> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

/** Audio payload ready to be attached to the multipart transcription request. */
interface PreparedAudio {
  blob: Blob;
  filename: string;
}

/**
 * Returns the lower-cased file extension found at the end of the URL's path,
 * or `undefined` when the URL cannot be parsed or its path has no extension.
 * Only the path is inspected, so dots in the host name or query string are
 * never mistaken for an extension.
 */
function audioExtensionFromUrl(rawUrl: string): string | undefined {
  try {
    const match = /\.([A-Za-z0-9]+)$/.exec(new URL(rawUrl).pathname);
    return match?.[1]?.toLowerCase();
  } catch {
    return undefined;
  }
}

/** Decodes base64 audio into a Blob holding exactly the decoded bytes. */
function audioFromBase64(data: string, mimeType: string | undefined): PreparedAudio {
  // Buffer.from() may return a view into Node's shared allocation pool, so
  // its `.buffer` can be far larger than the decoded audio and contain
  // unrelated memory. Copy the view into a dedicated, exact-length array.
  const bytes = new Uint8Array(Buffer.from(data, 'base64'));
  if (bytes.byteLength === 0) {
    throw new Error('Audio data is empty or not valid base64');
  }
  const extension = (mimeType !== undefined ? AUDIO_EXTENSION_BY_MIME[mimeType] : undefined) ?? 'bin';
  return {
    blob: new Blob([bytes], { type: mimeType ?? 'audio/wav' }),
    filename: `audio.${extension}`,
  };
}

/** Downloads audio from a URL and infers its MIME type and filename extension. */
async function audioFromUrl(url: string, mimeType: string | undefined): Promise<PreparedAudio> {
  // NOTE(review): the URL is caller-supplied and fetched from this process;
  // if tool input can be untrusted, consider restricting hosts/schemes (SSRF).
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download audio from ${url}: ${response.status} ${response.statusText}`);
  }
  const bytes = await response.arrayBuffer();

  // Prefer the caller's explicit type, then an audio Content-Type from the
  // server; generic types such as application/octet-stream are ignored.
  const headerType = response.headers.get('content-type')?.split(';')[0]?.trim().toLowerCase();
  const knownType = mimeType ?? (headerType?.startsWith('audio/') ? headerType : undefined);
  const extension =
    audioExtensionFromUrl(url) ??
    (knownType !== undefined ? AUDIO_EXTENSION_BY_MIME[knownType] : undefined) ??
    'bin';

  return {
    blob: new Blob([bytes], { type: knownType ?? 'audio/wav' }),
    filename: `audio.${extension}`,
  };
}

/**
 * Builds the `audio.transcribe` tool, which sends audio to a Whisper-compatible
 * transcription endpoint and returns the recognised text.
 *
 * Audio is supplied either as base64 `data` (with `mime_type`) or as a `url`
 * that is downloaded first. When the configured endpoint equals the Ollama
 * entry of `PROVIDER_ENDPOINTS`, a JSON request is sent; otherwise the audio is
 * uploaded as multipart form data together with the model name and the
 * optional `language` / `prompt` fields.
 *
 * @param audioConfig - Endpoint, optional API key and optional model name. The
 *   tool can be created without it, but every call then fails with a
 *   configuration error.
 * @returns A tool whose `execute` never rejects: all failures are reported as
 *   `{ success: false, output: '', error }`.
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },
    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // A call without an argument object must produce a validation error
      // rather than a TypeError thrown outside the try block below.
      const args = (typeof rawArgs === 'object' && rawArgs !== null ? rawArgs : {}) as AudioTranscribeArgs;

      const validation = validateInput(args);
      if (!validation.valid) {
        return {
          success: false,
          output: '',
          error: validation.error,
        };
      }

      const endpoint = audioConfig?.endpoint;
      if (!endpoint) {
        return {
          success: false,
          output: '',
          error: 'Audio transcription endpoint not configured. Set audio.endpoint in config.yaml',
        };
      }

      try {
        let audio: PreparedAudio;
        if (args.data) {
          audio = audioFromBase64(args.data, args.mime_type);
        } else if (args.url) {
          audio = await audioFromUrl(args.url, args.mime_type);
        } else {
          throw new Error('Either data or url must be provided');
        }

        const model = audioConfig?.model ?? 'whisper-1';

        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          // NOTE(review): for URL input this sends the URL string itself in
          // `audio`, not the downloaded bytes — confirm what the Ollama-side
          // endpoint expects before changing the payload.
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: model,
              audio: args.data ?? args.url,
              stream: false,
            }),
          });
          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }
          const ollamaJson = await ollamaResponse.json() as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        const formData = new FormData();
        formData.append('file', audio.blob, audio.filename);
        formData.append('model', model);
        if (args.language) {
          formData.append('language', args.language);
        }
        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // Content-Type is deliberately left unset so fetch can generate the
        // multipart boundary itself.
        const fetchOptions: RequestInit = {
          method: 'POST',
          body: formData,
        };
        if (audioConfig?.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetch(endpoint, fetchOptions);
        if (!response.ok) {
          const errorText = await response.text();
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        // The response body is external data: verify it has the expected
        // shape instead of reporting success with an undefined output.
        const json: unknown = await response.json();
        const text = typeof json === 'object' && json !== null ? (json as { text?: unknown }).text : undefined;
        if (typeof text !== 'string') {
          throw new Error('Transcription response did not contain a "text" field');
        }
        return {
          success: true,
          output: text,
        };
      } catch (error) {
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : 'Unknown error occurred',
        };
      }
    },
  };
}
+1
View File
@@ -8,6 +8,7 @@ export { systemInfoTool } from './system-info.js';
export { webFetchTool } from './web-fetch.js';
export { createMediaSendTool } from './media-send.js';
export { createImageAnalyzeTool } from './image-analyze.js';
export { createAudioTranscribeTool } from './audio-transcribe.js';
export { createMemoryReadTool } from './memory-read.js';
export { createMemoryWriteTool } from './memory-write.js';
export { createMemorySearchTool } from './memory-search.js';
+1 -1
View File
@@ -5,7 +5,7 @@ export { ToolExecutor } from './executor.js';
export type { ToolExecutorConfig } from './executor.js';
export { ToolPolicy } from './policy.js';
export type { ToolPolicyContext } from './policy.js';
export { allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool, createSessionTools, createAgentsListTool, createMessageSendTool, createCronTools, createGmailTools, createGcalTools, createGdocsTools, createGdriveTools, createGtasksTools } from './builtin/index.js';
export { allBuiltinTools, createWebSearchTools, createProcessTools, ProcessManager, BrowserManager, createBrowserTools, createMediaSendTool, createAudioTranscribeTool, createSessionTools, createAgentsListTool, createMessageSendTool, createCronTools, createGmailTools, createGcalTools, createGdocsTools, createGdriveTools, createGtasksTools } from './builtin/index.js';
export type { WebSearchConfig } from './builtin/web-search.js';
export type { ProcessManagerConfig } from './builtin/process/index.js';
export type { BrowserManagerConfig } from './builtin/browser/index.js';