import type { Tool, ToolResult } from '../types.js';

/** Arguments accepted by the `audio.transcribe` tool. Exactly one of `data` / `url` must be set. */
interface AudioTranscribeArgs {
  /** Base64-encoded audio bytes (requires `mime_type`). */
  data?: string;
  /** http(s) URL the audio is downloaded from. */
  url?: string;
  mime_type?: string;
  language?: string;
  prompt?: string;
}

/** MIME types the tool accepts for the `mime_type` argument. */
const SUPPORTED_MIME_TYPES = new Set([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);

/** File extension used for the uploaded file name, keyed by MIME type. */
const MIME_TYPE_EXTENSIONS: Record<string, string> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

/** Well-known transcription endpoints; only the `ollama` entry is consulted by this module. */
const PROVIDER_ENDPOINTS: Record<string, string> = {
  openai: 'https://api.openai.com/v1/audio/transcriptions',
  groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
  ollama: 'http://localhost:11434/api/generate',
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};

const TRANSCRIPTION_FETCH_MAX_ATTEMPTS = 3;
const TRANSCRIPTION_FETCH_TIMEOUT_MS = 45_000;
const TRANSCRIPTION_FETCH_BASE_DELAY_MS = 250;

/** Substrings (lower-case) that mark an error message as a transient network failure. */
const TRANSIENT_ERROR_MARKERS = [
  'fetch failed',
  'network',
  'timeout',
  'timed out',
  'econnrefused',
  'econnreset',
  'enotfound',
  'ehostunreach',
];

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Returns true when the error's message contains one of the known transient-network markers. */
function isTransientNetworkError(error: unknown): boolean {
  const message = (error instanceof Error ? error.message : String(error)).toLowerCase();
  return TRANSIENT_ERROR_MARKERS.some((marker) => message.includes(marker));
}

/**
 * Returns the endpoints to try for a request. For a `localhost` endpoint the
 * list also contains the same URL with host `127.0.0.1`; any other endpoint
 * (or one that does not parse as a URL) is returned unchanged as the only candidate.
 */
function buildEndpointCandidates(endpoint: string): string[] {
  try {
    const parsed = new URL(endpoint);
    if (parsed.hostname !== 'localhost') {
      return [endpoint];
    }
    const ipv4Endpoint = new URL(endpoint);
    ipv4Endpoint.hostname = '127.0.0.1';
    return [endpoint, ipv4Endpoint.toString()];
  } catch {
    return [endpoint];
  }
}

/**
 * Calls `fetch` with a per-attempt timeout, retrying timeouts and transient
 * network errors up to TRANSCRIPTION_FETCH_MAX_ATTEMPTS times with exponential
 * backoff. Attempts rotate through the candidates from `buildEndpointCandidates`.
 *
 * @returns The first Response that `fetch` resolves with (its status is not inspected here).
 * @throws Error describing the connectivity failure once the error is not
 *   retriable or the attempts are exhausted.
 */
async function fetchWithRetry(endpoint: string, init: RequestInit): Promise<Response> {
  const endpointCandidates = buildEndpointCandidates(endpoint);
  let lastErrorMessage = 'Unknown network error';
  let lastEndpoint = endpoint;

  for (let attempt = 1; attempt <= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; attempt += 1) {
    const endpointForAttempt =
      endpointCandidates[(attempt - 1) % endpointCandidates.length] ?? endpoint;
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), TRANSCRIPTION_FETCH_TIMEOUT_MS);

    try {
      return await fetch(endpointForAttempt, { ...init, signal: controller.signal });
    } catch (error) {
      // An abort triggered by our own timer surfaces as an error named "AbortError".
      const timedOut = error instanceof Error && error.name === 'AbortError';
      const normalizedMessage = timedOut
        ? `request timed out after ${TRANSCRIPTION_FETCH_TIMEOUT_MS}ms`
        : (error instanceof Error ? error.message : String(error));
      lastErrorMessage = normalizedMessage;
      lastEndpoint = endpointForAttempt;

      const retriable = timedOut || isTransientNetworkError(error);
      const exhausted = attempt >= TRANSCRIPTION_FETCH_MAX_ATTEMPTS;
      if (!retriable || exhausted) {
        throw new Error(
          `Transcription service connectivity failure at ${lastEndpoint} after ${attempt} attempt(s): ${normalizedMessage}. This indicates endpoint/network availability, not missing audio bytes.`,
        );
      }

      // Backoff doubles each attempt: base, 2x base, ...
      await sleep(TRANSCRIPTION_FETCH_BASE_DELAY_MS * (2 ** (attempt - 1)));
    } finally {
      clearTimeout(timeout);
    }
  }

  // Defensive: the loop always returns or throws, but keep a typed failure path.
  throw new Error(
    `Transcription service connectivity failure at ${lastEndpoint} after retries: ${lastErrorMessage}. This indicates endpoint/network availability, not missing audio bytes.`,
  );
}

/**
 * Checks that a user-supplied download URL is http(s) and does not literally
 * point at loopback or private/internal addresses.
 *
 * NOTE(review): this only inspects the URL text. It does not resolve DNS and
 * does not cover HTTP redirects followed by the later download, so it is a
 * first-line filter rather than a complete SSRF defense.
 */
function validateUrl(url: string): { valid: boolean; error?: string } {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return { valid: false, error: `Invalid URL: ${url}` };
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return { valid: false, error: `Only http/https URLs are allowed, got ${parsed.protocol}` };
  }

  // URL.hostname keeps the square brackets around IPv6 literals ("[::1]"), so
  // strip them (and a trailing root dot) before comparing.
  const hostname = parsed.hostname
    .replace(/^\[|\]$/g, '')
    .replace(/\.$/, '')
    .toLowerCase();

  const isLoopback =
    hostname === 'localhost' ||
    hostname === '0.0.0.0' ||
    hostname === '::1' ||
    hostname === '::' ||
    /^127\.\d+\.\d+\.\d+$/.test(hostname); // whole 127.0.0.0/8 block, not just 127.0.0.1
  if (isLoopback) {
    return { valid: false, error: 'URLs pointing to localhost are not allowed' };
  }

  // Block private/internal IP ranges
  const isPrivateIpv4 = /^(10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|169\.254\.)/.test(hostname);
  const isInternalIpv6 =
    /^f[cd][0-9a-f]{2}:/.test(hostname) || // unique-local fc00::/7
    /^fe[89ab][0-9a-f]:/.test(hostname) || // link-local fe80::/10
    hostname.startsWith('::ffff:'); // IPv4-mapped addresses can alias loopback/private IPv4
  if (isPrivateIpv4 || isInternalIpv6) {
    return { valid: false, error: 'URLs pointing to private/internal networks are not allowed' };
  }

  return { valid: true };
}

/**
 * Validates tool arguments: field types, exactly one of data/url, decodable
 * base64 for `data`, a supported `mime_type`, and an allowed `url`.
 *
 * @returns `{ valid: true }` or `{ valid: false, error }` with the first problem found.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  if (args.data !== undefined && typeof args.data !== 'string') {
    return { valid: false, error: 'data must be a base64 string when provided' };
  }
  if (args.url !== undefined && typeof args.url !== 'string') {
    return { valid: false, error: 'url must be a string when provided' };
  }
  if (args.mime_type !== undefined && typeof args.mime_type !== 'string') {
    return { valid: false, error: 'mime_type must be a string when provided' };
  }
  if (args.language !== undefined && typeof args.language !== 'string') {
    return { valid: false, error: 'language must be a string when provided' };
  }
  if (args.prompt !== undefined && typeof args.prompt !== 'string') {
    return { valid: false, error: 'prompt must be a string when provided' };
  }

  const hasData = args.data !== undefined && args.data !== '';
  const hasUrl = args.url !== undefined && args.url !== '';

  if (!hasData && !hasUrl) {
    return { valid: false, error: 'Either data or url must be provided' };
  }
  if (hasData && hasUrl) {
    return { valid: false, error: 'Only one of data or url can be provided' };
  }

  if (hasData) {
    // Whitespace is tolerated inside the payload; everything else must be base64 alphabet.
    const compact = (args.data ?? '').replace(/\s+/g, '');
    const isBase64 = /^[A-Za-z0-9+/=]+$/.test(compact);
    let hasDecodedBytes = false;
    if (isBase64) {
      try {
        hasDecodedBytes = Buffer.from(compact, 'base64').length > 0;
      } catch {
        hasDecodedBytes = false;
      }
    }
    if (!isBase64 || !hasDecodedBytes) {
      return { valid: false, error: 'data must be valid base64-encoded audio bytes' };
    }
  }

  if (hasData && !args.mime_type) {
    return { valid: false, error: 'mime_type is required when using data' };
  }

  if (args.mime_type && !SUPPORTED_MIME_TYPES.has(args.mime_type)) {
    return {
      valid: false,
      error: `Unsupported MIME type: ${args.mime_type}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}`,
    };
  }

  if (hasUrl && args.url) {
    const urlValidation = validateUrl(args.url);
    if (!urlValidation.valid) {
      return urlValidation;
    }
  }

  return { valid: true };
}

/** Views any non-null object (arrays included) as a string-keyed record; otherwise undefined. */
function asRecord(value: unknown): Record<string, unknown> | undefined {
  return value !== null && typeof value === 'object'
    ? (value as Record<string, unknown>)
    : undefined;
}

/**
 * Pulls the transcript out of a response payload. A string payload is returned
 * as-is. For objects the lookup order is: top-level `text` / `transcript` /
 * `transcription` / `output`, then `result.*`, `data.*`, `results[]` (including
 * `alternatives[]`), and finally the non-blank `segments[].text` joined by spaces.
 *
 * @returns The transcript, or undefined when no known field holds a string.
 */
function extractTranscriptionText(payload: unknown): string | undefined {
  if (typeof payload === 'string') {
    return payload;
  }
  const obj = asRecord(payload);
  if (!obj) {
    return undefined;
  }

  for (const key of ['text', 'transcript', 'transcription', 'output']) {
    const value = obj[key];
    if (typeof value === 'string') {
      return value;
    }
  }

  for (const containerKey of ['result', 'data']) {
    const container = asRecord(obj[containerKey]);
    if (container) {
      const nested = container.text ?? container.transcript;
      if (typeof nested === 'string') {
        return nested;
      }
    }
  }

  if (Array.isArray(obj.results)) {
    for (const result of obj.results) {
      const resultObj = asRecord(result);
      if (!resultObj) {
        continue;
      }
      if (typeof resultObj.text === 'string') {
        return resultObj.text;
      }
      if (Array.isArray(resultObj.alternatives)) {
        for (const alternative of resultObj.alternatives) {
          const altObj = asRecord(alternative);
          if (!altObj) {
            continue;
          }
          const altTranscript = altObj.transcript ?? altObj.text;
          if (typeof altTranscript === 'string') {
            return altTranscript;
          }
        }
      }
    }
  }

  if (Array.isArray(obj.segments)) {
    const joined = obj.segments
      .map((segment: unknown) => asRecord(segment)?.text)
      .filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
      .join(' ');
    if (joined.trim().length > 0) {
      return joined;
    }
  }

  return undefined;
}

/**
 * Pulls a human-readable error out of a response payload, checking `error`
 * (string, or object with `message` / `error`), then `detail`, then `message`.
 *
 * @returns The first non-blank message found, or undefined.
 */
function extractTranscriptionError(payload: unknown): string | undefined {
  const obj = asRecord(payload);
  if (!obj) {
    return undefined;
  }

  if (typeof obj.error === 'string' && obj.error.trim().length > 0) {
    return obj.error;
  }

  const errorObj = asRecord(obj.error);
  if (errorObj) {
    const message = errorObj.message ?? errorObj.error;
    if (typeof message === 'string' && message.trim().length > 0) {
      return message;
    }
  }

  if (typeof obj.detail === 'string' && obj.detail.trim().length > 0) {
    return obj.detail;
  }
  if (typeof obj.message === 'string' && obj.message.trim().length > 0) {
    return obj.message;
  }

  return undefined;
}

/** Collapses whitespace runs and caps the text at `max` characters, appending "..." when cut. */
function truncateForError(text: string, max = 180): string {
  const normalized = text.replace(/\s+/g, ' ').trim();
  if (normalized.length <= max) {
    return normalized;
  }
  return `${normalized.slice(0, max)}...`;
}

/**
 * Reads a response body as text. Falls back to `json()` (re-serialized) when the
 * response object has no `text` method, and to an empty string when it has neither
 * — presumably to tolerate partial Response stand-ins; confirm against callers/tests.
 */
async function readResponseBody(response: Response): Promise<string> {
  const textReader = response.text as unknown;
  if (typeof textReader === 'function') {
    return response.text();
  }

  const maybeJsonResponse = response as unknown as { json?: () => Promise<unknown> };
  if (typeof maybeJsonResponse.json === 'function') {
    const jsonPayload = await maybeJsonResponse.json();
    return JSON.stringify(jsonPayload);
  }

  return '';
}

/**
 * Builds the upload file name (`audio.<ext>`) from the last path segment of a
 * download URL. Query string and fragment are ignored; when the segment has no
 * extension the name is `audio.bin`.
 */
function deriveFilenameFromUrl(url: string): string {
  let pathname = url;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not a parseable URL: fall back to treating the raw string as a path.
  }
  const lastSegment = pathname.split('/').pop() ?? '';
  const extension = /\.([A-Za-z0-9]+)$/.exec(lastSegment)?.[1];
  return `audio.${extension ?? 'bin'}`;
}

/** Connection settings for the transcription backend. */
interface AudioTranscriptionConfig {
  endpoint?: string;
  apiKey?: string;
  model?: string;
}

/**
 * Creates the `audio.transcribe` tool.
 *
 * `execute` never rejects for expected failures: validation errors, missing
 * configuration, download/transcription failures are all reported as
 * `{ success: false, output: '', error }`.
 *
 * @param audioConfig Endpoint, optional API key and model (defaults to `whisper-1`).
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description:
      'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description:
            'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },
    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // Treat a missing / non-object argument as "no fields" so validation
      // reports it instead of a TypeError escaping as a rejected promise.
      const args: AudioTranscribeArgs =
        rawArgs !== null && typeof rawArgs === 'object' ? (rawArgs as AudioTranscribeArgs) : {};

      const validation = validateInput(args);
      if (!validation.valid) {
        return { success: false, output: '', error: validation.error };
      }

      if (!audioConfig?.endpoint) {
        return {
          success: false,
          output: '',
          error: 'Audio transcription endpoint not configured. Set audio.endpoint in config.yaml',
        };
      }

      try {
        let filename = 'audio.bin';
        let audioBlob: Blob | undefined;

        if (args.data) {
          const rawBuffer = Buffer.from(args.data, 'base64');
          if (rawBuffer.length === 0) {
            throw new Error('Decoded audio data is empty');
          }
          const ext = MIME_TYPE_EXTENSIONS[args.mime_type ?? ''] || 'bin';
          filename = `audio.${ext}`;
          audioBlob = new Blob([rawBuffer], { type: args.mime_type ?? 'audio/wav' });
        } else if (args.url) {
          // NOTE(review): redirects are followed by default, so the final host is
          // not re-checked by validateUrl — consider tightening if this matters.
          const download = await fetch(args.url);
          if (!download.ok) {
            throw new Error(
              `Failed to download audio from ${args.url}: ${download.status} ${download.statusText}`,
            );
          }
          const arrayBuffer = await download.arrayBuffer();
          filename = deriveFilenameFromUrl(args.url);
          const contentType = download.headers.get('content-type') ?? 'audio/wav';
          audioBlob = new Blob([arrayBuffer], { type: contentType });
        }

        const endpoint = audioConfig.endpoint;
        const model = audioConfig.model ?? 'whisper-1';

        // The Ollama generate endpoint takes JSON, not a multipart upload.
        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: model,
              audio: args.data ?? args.url,
              stream: false,
            }),
          });
          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }
          const ollamaJson = (await ollamaResponse.json()) as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        if (!audioBlob) {
          // Unreachable after validation, but keeps the upload below type-safe.
          throw new Error('No audio payload available for transcription');
        }

        const formData = new FormData();
        formData.append('file', audioBlob, filename);
        formData.append('model', model);
        formData.append('response_format', 'json');
        if (args.language) {
          formData.append('language', args.language);
        }
        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // No explicit Content-Type: it is left to fetch to derive from the FormData body.
        const fetchOptions: RequestInit = { method: 'POST', body: formData };
        if (audioConfig.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetchWithRetry(endpoint, fetchOptions);
        if (!response.ok) {
          const errorText = await readResponseBody(response);
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        const rawBody = await readResponseBody(response);
        const trimmedBody = rawBody.trim();
        let payload: unknown = rawBody;
        // Only attempt JSON parsing when the body looks like JSON; plain-text
        // bodies are treated as the transcript itself.
        if (trimmedBody.startsWith('{') || trimmedBody.startsWith('[')) {
          try {
            payload = JSON.parse(rawBody) as unknown;
          } catch {
            payload = rawBody;
          }
        }

        const transcript = extractTranscriptionText(payload);
        if (transcript === undefined) {
          const endpointError = extractTranscriptionError(payload);
          if (endpointError) {
            throw new Error(`Transcription endpoint error: ${endpointError}`);
          }
          throw new Error(
            `Transcription response missing text field (body: ${truncateForError(rawBody)})`,
          );
        }

        const normalizedTranscript =
          transcript.trim().length > 0 ? transcript : '[No speech detected]';
        return { success: true, output: normalizedTranscript };
      } catch (error) {
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : 'Unknown error occurred',
        };
      }
    },
  };
}