// 501 lines · 16 KiB · TypeScript
import type { Tool, ToolResult } from '../types.js';
/**
 * Arguments accepted by the `audio.transcribe` tool.
 *
 * Every field is optional at the type level; `validateInput` enforces that
 * exactly one of `data` / `url` is supplied.
 */
interface AudioTranscribeArgs {
  /** Base64-encoded audio bytes (alternative to `url`). */
  data?: string;

  /** http(s) URL the audio is downloaded from (alternative to `data`). */
  url?: string;

  /** MIME type of the audio; required when `data` is used. */
  mime_type?: string;

  /** Language code forwarded to the transcription endpoint when set. */
  language?: string;

  /** Prompt text forwarded to the transcription endpoint when set. */
  prompt?: string;
}
/** MIME types accepted for the `mime_type` argument. */
const SUPPORTED_MIME_TYPES: Set<string> = new Set<string>([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);
/**
 * Well-known transcription endpoints keyed by provider name.
 *
 * The `ollama` entry is compared against the configured endpoint to select
 * the JSON request path instead of the multipart upload path.
 */
const PROVIDER_ENDPOINTS: Record<string, string> = {
  // Hosted providers.
  openai: 'https://api.openai.com/v1/audio/transcriptions',
  groq: 'https://api.groq.com/openai/v1/audio/transcriptions',

  // Local providers.
  ollama: 'http://localhost:11434/api/generate',
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};
/** Total number of tries (first attempt included) for a transcription request. */
const TRANSCRIPTION_FETCH_MAX_ATTEMPTS = 3;

/** Per-attempt abort deadline for a transcription request, in milliseconds. */
const TRANSCRIPTION_FETCH_TIMEOUT_MS = 45 * 1_000;

/** Delay before the first retry, in milliseconds; doubled on each later retry. */
const TRANSCRIPTION_FETCH_BASE_DELAY_MS = 250;
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Heuristically decides whether a thrown value looks like a transient
 * network failure, based on substrings of its (lower-cased) message.
 *
 * @param error Any thrown value; non-`Error` values are stringified.
 * @returns `true` when the message contains one of the known markers.
 */
function isTransientNetworkError(error: unknown): boolean {
  const transientMarkers = [
    'fetch failed',
    'network',
    'timeout',
    'timed out',
    'econnrefused',
    'econnreset',
    'enotfound',
    'ehostunreach',
  ];

  const rawMessage = error instanceof Error ? error.message : String(error);
  const haystack = rawMessage.toLowerCase();
  return transientMarkers.some((marker) => haystack.includes(marker));
}
/**
 * Lists the URLs to try for an endpoint.
 *
 * When the endpoint's host is exactly `localhost`, a second candidate with the
 * host rewritten to `127.0.0.1` is appended. Unparseable endpoints and any
 * other host yield just the original string.
 *
 * @param endpoint Configured endpoint URL.
 * @returns One or two candidate URLs, original first.
 */
function buildEndpointCandidates(endpoint: string): string[] {
  let loopbackVariant: URL;
  try {
    loopbackVariant = new URL(endpoint);
  } catch {
    // Not a parseable URL: let the caller try it verbatim.
    return [endpoint];
  }

  if (loopbackVariant.hostname !== 'localhost') {
    return [endpoint];
  }

  loopbackVariant.hostname = '127.0.0.1';
  return [endpoint, loopbackVariant.toString()];
}
/**
 * Issues `fetch` against the endpoint with a per-attempt abort deadline and
 * exponential backoff between attempts.
 *
 * Attempts rotate through the candidates from `buildEndpointCandidates`. Only
 * thrown errors are retried (timeouts and errors matched by
 * `isTransientNetworkError`); any `Response`, whatever its status, is returned
 * to the caller as-is.
 *
 * @param endpoint Endpoint URL to call.
 * @param init Request options; `signal` is replaced by the per-attempt abort signal.
 * @returns The first `Response` obtained.
 * @throws Error describing the connectivity failure once an error is not
 *   retriable or the attempts are exhausted.
 */
async function fetchWithRetry(endpoint: string, init: RequestInit): Promise<Response> {
  const candidates = buildEndpointCandidates(endpoint);
  let lastErrorMessage = 'Unknown network error';
  let lastEndpoint = endpoint;

  let attempt = 0;
  while (attempt < TRANSCRIPTION_FETCH_MAX_ATTEMPTS) {
    attempt += 1;
    const target = candidates[(attempt - 1) % candidates.length];
    const abortController = new AbortController();
    const abortTimer = setTimeout(() => abortController.abort(), TRANSCRIPTION_FETCH_TIMEOUT_MS);

    try {
      return await fetch(target, { ...init, signal: abortController.signal });
    } catch (error) {
      const timedOut = error instanceof Error && error.name === 'AbortError';

      let reason: string;
      if (timedOut) {
        reason = `request timed out after ${TRANSCRIPTION_FETCH_TIMEOUT_MS}ms`;
      } else if (error instanceof Error) {
        reason = error.message;
      } else {
        reason = String(error);
      }
      lastErrorMessage = reason;
      lastEndpoint = target;

      const mayRetry = (timedOut || isTransientNetworkError(error))
        && attempt < TRANSCRIPTION_FETCH_MAX_ATTEMPTS;
      if (!mayRetry) {
        throw new Error(
          `Transcription service connectivity failure at ${lastEndpoint} after ${attempt} attempt(s): ${reason}. This indicates endpoint/network availability, not missing audio bytes.`,
        );
      }

      // Backoff doubles each time: base, 2x base, 4x base, ...
      await sleep(TRANSCRIPTION_FETCH_BASE_DELAY_MS * (2 ** (attempt - 1)));
    } finally {
      clearTimeout(abortTimer);
    }
  }

  // Not reachable through the loop above; kept so every code path returns or throws.
  throw new Error(
    `Transcription service connectivity failure at ${lastEndpoint} after retries: ${lastErrorMessage}. This indicates endpoint/network availability, not missing audio bytes.`,
  );
}
/**
 * Checks that a caller-supplied URL is http(s) and does not point at the
 * local machine or a private/internal network literal.
 *
 * The check is purely lexical: the host is inspected as written (after URL
 * normalisation) and is never resolved through DNS, so a public name that
 * resolves to an internal address is not detected here.
 *
 * @param url URL string to vet.
 * @returns `{ valid: true }` or `{ valid: false, error }` with a human-readable reason.
 */
function validateUrl(url: string): { valid: boolean; error?: string } {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return { valid: false, error: `Invalid URL: ${url}` };
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return { valid: false, error: `Only http/https URLs are allowed, got ${parsed.protocol}` };
  }

  // `URL.hostname` keeps the square brackets around IPv6 literals ("[::1]")
  // and a trailing root dot ("localhost."); strip both before comparing.
  let host = parsed.hostname.toLowerCase();
  if (host.startsWith('[') && host.endsWith(']')) {
    host = host.slice(1, -1);
  }
  if (host.endsWith('.')) {
    host = host.slice(0, -1);
  }

  // IPv4-mapped IPv6 is serialised in hex ("::ffff:7f00:1"); convert it back to
  // dotted-quad form so the IPv4 rules below apply to it as well.
  const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
  if (mapped) {
    const high = parseInt(mapped[1] ?? '0', 16);
    const low = parseInt(mapped[2] ?? '0', 16);
    host = `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
  }

  const isIpv4Literal = /^\d{1,3}(?:\.\d{1,3}){3}$/.test(host);
  const isLoopback = host === 'localhost'
    || host.endsWith('.localhost')
    || host === '::1'
    || host === '::'
    || host === '0.0.0.0'
    || (isIpv4Literal && host.startsWith('127.')); // whole 127.0.0.0/8 block
  if (isLoopback) {
    return { valid: false, error: 'URLs pointing to localhost are not allowed' };
  }

  // Block private/internal IP ranges: 10/8, 172.16/12, 192.168/16, 169.254/16,
  // plus IPv6 unique-local (fc00::/7) and link-local (fe80::/10).
  const isPrivateIpv4 = /^(10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|169\.254\.)/.test(host);
  const isPrivateIpv6 = /^(f[cd][0-9a-f]{2}|fe[89ab][0-9a-f]):/.test(host);
  if (isPrivateIpv4 || isPrivateIpv6) {
    return { valid: false, error: 'URLs pointing to private/internal networks are not allowed' };
  }

  return { valid: true };
}
/**
 * Validates tool arguments before any network activity happens.
 *
 * Checks, in order: field types, that exactly one of `data` / `url` is given,
 * that `data` is non-empty base64, that `mime_type` accompanies `data` and is
 * supported, and that `url` passes `validateUrl`.
 *
 * @param args Arguments as received by the tool.
 * @returns `{ valid: true }` or `{ valid: false, error }` for the first failed check.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  const reject = (error: string): { valid: boolean; error?: string } => ({ valid: false, error });

  // Runtime type checks: the arguments come from an untyped caller.
  if (args.data !== undefined && typeof args.data !== 'string') {
    return reject('data must be a base64 string when provided');
  }
  const plainStringFields = ['url', 'mime_type', 'language', 'prompt'] as const;
  for (const field of plainStringFields) {
    const value: unknown = args[field];
    if (value !== undefined && typeof value !== 'string') {
      return reject(`${field} must be a string when provided`);
    }
  }

  // Empty strings count as "not provided".
  const hasData = args.data !== undefined && args.data !== '';
  const hasUrl = args.url !== undefined && args.url !== '';

  if (!hasData && !hasUrl) {
    return reject('Either data or url must be provided');
  }
  if (hasData && hasUrl) {
    return reject('Only one of data or url can be provided');
  }

  if (hasData) {
    // Whitespace is tolerated inside the payload; everything else must be base64 alphabet.
    const compact = (args.data ?? '').replace(/\s+/g, '');
    let decodable = /^[A-Za-z0-9+/=]+$/.test(compact);
    if (decodable) {
      try {
        decodable = Buffer.from(compact, 'base64').length > 0;
      } catch {
        decodable = false;
      }
    }
    if (!decodable) {
      return reject('data must be valid base64-encoded audio bytes');
    }

    if (!args.mime_type) {
      return reject('mime_type is required when using data');
    }
  }

  if (args.mime_type && !SUPPORTED_MIME_TYPES.has(args.mime_type)) {
    return reject(`Unsupported MIME type: ${args.mime_type}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}`);
  }

  if (hasUrl) {
    const url = args.url;
    if (!url) {
      return reject('URL is required when using url mode');
    }
    const urlValidation = validateUrl(url);
    if (!urlValidation.valid) {
      return urlValidation;
    }
  }

  return { valid: true };
}
/**
 * Pulls the transcript out of a transcription response of unknown shape.
 *
 * Lookup order:
 *  1. the payload itself when it is a string;
 *  2. top-level `text`, `transcript`, `transcription`, `output`;
 *  3. `result.text` / `result.transcript`, then `data.text` / `data.transcript`;
 *  4. `results[]`: each entry's `text`, then its `alternatives[].transcript` / `.text`;
 *  5. `segments[].text`, non-blank pieces joined with a space.
 *
 * @param payload Parsed response body (or raw string).
 * @returns The transcript (possibly empty) or `undefined` when none was found.
 */
function extractTranscriptionText(payload: unknown): string | undefined {
  if (typeof payload === 'string') {
    return payload;
  }
  if (typeof payload !== 'object' || payload === null) {
    return undefined;
  }

  const root = payload as Record<string, unknown>;

  // 2. Flat keys, first string wins (an empty string still counts).
  for (const key of ['text', 'transcript', 'transcription', 'output']) {
    const candidate = root[key];
    if (typeof candidate === 'string') {
      return candidate;
    }
  }

  // 3. One level of nesting under `result`, then `data`.
  for (const containerKey of ['result', 'data']) {
    const container = root[containerKey];
    if (container && typeof container === 'object') {
      const inner = container as Record<string, unknown>;
      const candidate = inner.text ?? inner.transcript;
      if (typeof candidate === 'string') {
        return candidate;
      }
    }
  }

  // 4. `results` entries, each optionally carrying `alternatives`.
  if (Array.isArray(root.results)) {
    for (const entry of root.results) {
      if (!entry || typeof entry !== 'object') {
        continue;
      }
      const entryObj = entry as Record<string, unknown>;
      if (typeof entryObj.text === 'string') {
        return entryObj.text;
      }
      if (!Array.isArray(entryObj.alternatives)) {
        continue;
      }
      for (const alternative of entryObj.alternatives) {
        if (!alternative || typeof alternative !== 'object') {
          continue;
        }
        const alternativeObj = alternative as Record<string, unknown>;
        const candidate = alternativeObj.transcript ?? alternativeObj.text;
        if (typeof candidate === 'string') {
          return candidate;
        }
      }
    }
  }

  // 5. Segment list: keep only non-blank text pieces.
  if (Array.isArray(root.segments)) {
    const pieces: string[] = [];
    for (const segment of root.segments) {
      const piece: unknown = segment && typeof segment === 'object'
        ? (segment as Record<string, unknown>).text
        : undefined;
      if (typeof piece === 'string' && piece.trim().length > 0) {
        pieces.push(piece);
      }
    }
    if (pieces.length > 0) {
      return pieces.join(' ');
    }
  }

  return undefined;
}
/**
 * Looks for an error description inside a response payload.
 *
 * Checked in order: string `error`, `error.message` / `error.error`,
 * `detail`, `message`. Blank strings are ignored.
 *
 * @param payload Parsed response body.
 * @returns The first non-blank error text, or `undefined`.
 */
function extractTranscriptionError(payload: unknown): string | undefined {
  if (typeof payload !== 'object' || payload === null) {
    return undefined;
  }

  const isNonBlank = (value: unknown): value is string =>
    typeof value === 'string' && value.trim().length > 0;

  const body = payload as Record<string, unknown>;
  const topLevelError = body.error;

  if (isNonBlank(topLevelError)) {
    return topLevelError;
  }

  if (topLevelError && typeof topLevelError === 'object') {
    const nested = topLevelError as Record<string, unknown>;
    const nestedMessage = nested.message ?? nested.error;
    if (isNonBlank(nestedMessage)) {
      return nestedMessage;
    }
  }

  for (const fallback of [body.detail, body.message]) {
    if (isNonBlank(fallback)) {
      return fallback;
    }
  }

  return undefined;
}
/**
 * Prepares arbitrary text for inclusion in an error message: whitespace runs
 * collapse to single spaces, the ends are trimmed, and anything longer than
 * `max` characters is cut and suffixed with `...`.
 *
 * @param text Text to shorten.
 * @param max Maximum number of characters kept before the ellipsis (default 180).
 */
function truncateForError(text: string, max = 180): string {
  const singleLine = text.replace(/\s+/g, ' ').trim();
  return singleLine.length > max
    ? `${singleLine.slice(0, max)}...`
    : singleLine;
}
/**
 * Reads a response body as text, tolerating response-like objects that do not
 * implement the full `Response` interface.
 *
 * Uses `text()` when present; otherwise falls back to `json()` and re-serialises
 * the result; otherwise yields an empty string.
 *
 * @param response Response (or response-like object) to read.
 */
async function readResponseBody(response: Response): Promise<string> {
  if (typeof (response.text as unknown) === 'function') {
    return response.text();
  }

  const jsonCapable = response as unknown as { json?: () => Promise<unknown> };
  if (typeof jsonCapable.json !== 'function') {
    return '';
  }

  const parsed = await jsonCapable.json();
  return JSON.stringify(parsed);
}
/** Settings consumed by `createAudioTranscribeTool`. */
interface AudioTranscriptionConfig {
  /** Transcription endpoint URL; the tool returns a configuration error when it is missing. */
  endpoint?: string;

  /** When set, sent as `Authorization: Bearer <apiKey>` on the transcription request. */
  apiKey?: string;

  /** Model name sent with the request; the tool falls back to `whisper-1`. */
  model?: string;
}
/** File extension used in the upload filename, keyed by the MIME type of inline audio. */
const AUDIO_EXTENSION_BY_MIME_TYPE: Record<string, string> = {
  'audio/ogg': 'ogg',
  'audio/mpeg': 'mp3',
  'audio/mp3': 'mp3',
  'audio/wav': 'wav',
  'audio/webm': 'webm',
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
};

/** Maximum number of redirect hops followed while downloading audio from a URL. */
const MAX_AUDIO_DOWNLOAD_REDIRECTS = 5;

/**
 * Derives a filename extension from the last path segment of a URL.
 *
 * Query string and fragment are ignored because only the parsed pathname is
 * inspected. Returns `bin` when the URL cannot be parsed, the last segment has
 * no dot, or the suffix is not purely alphanumeric.
 */
function audioExtensionFromUrl(url: string): string {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return 'bin';
  }
  const lastSegment = pathname.slice(pathname.lastIndexOf('/') + 1);
  const dotIndex = lastSegment.lastIndexOf('.');
  const extension = dotIndex >= 0 ? lastSegment.slice(dotIndex + 1) : '';
  return /^[A-Za-z0-9]+$/.test(extension) ? extension : 'bin';
}

/**
 * Downloads audio from a caller-supplied URL, following redirects by hand so
 * that every hop is re-validated with `validateInput` (and therefore the same
 * localhost/private-network rules as the original URL).
 *
 * @param url Already-validated starting URL.
 * @returns The first non-redirect response (which may still be non-OK).
 * @throws Error when a redirect target fails validation or the hop limit is exceeded.
 */
async function downloadAudio(url: string): Promise<Response> {
  let currentUrl = url;

  for (let hop = 0; hop <= MAX_AUDIO_DOWNLOAD_REDIRECTS; hop += 1) {
    const response = await fetch(currentUrl, { redirect: 'manual' });

    const isRedirect = response.status >= 300 && response.status < 400;
    const location = isRedirect ? response.headers.get('location') : null;
    if (!location) {
      return response;
    }

    // Best-effort release of the unread redirect body; a failure here is irrelevant.
    await response.body?.cancel().catch(() => undefined);

    const nextUrl = new URL(location, currentUrl).toString();
    const hopCheck = validateInput({ url: nextUrl });
    if (!hopCheck.valid) {
      throw new Error(
        `Refusing to follow redirect from ${currentUrl}: ${hopCheck.error ?? 'redirect target is not allowed'}`,
      );
    }
    currentUrl = nextUrl;
  }

  throw new Error(`Too many redirects while downloading audio from ${url}`);
}

/**
 * Builds the `audio.transcribe` tool.
 *
 * The tool validates its arguments, obtains the audio bytes (inline base64 or
 * URL download), and sends them to the configured endpoint: as a JSON request
 * when the endpoint equals the built-in Ollama endpoint, otherwise as a
 * multipart upload via `fetchWithRetry`. All failures are reported through the
 * returned `ToolResult` (`success: false`, `error` set) rather than thrown.
 *
 * @param audioConfig Endpoint, optional API key and optional model name.
 * @returns The tool definition with its input schema and `execute` handler.
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },

    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // A missing argument object is treated as "no arguments" so that it is
      // reported as a validation error instead of a thrown TypeError.
      const args = (rawArgs ?? {}) as AudioTranscribeArgs;

      const validation = validateInput(args);
      if (!validation.valid) {
        return {
          success: false,
          output: '',
          error: validation.error,
        };
      }

      if (!audioConfig?.endpoint) {
        return {
          success: false,
          output: '',
          error: 'Audio transcription endpoint not configured. Set audio.endpoint in config.yaml',
        };
      }

      try {
        let filename = 'audio.bin';
        let audioBlob: Blob | undefined;

        if (args.data) {
          const rawBuffer = Buffer.from(args.data, 'base64');
          if (rawBuffer.length === 0) {
            throw new Error('Decoded audio data is empty');
          }

          const ext = AUDIO_EXTENSION_BY_MIME_TYPE[args.mime_type ?? ''] ?? 'bin';
          filename = `audio.${ext}`;

          const mimeType = args.mime_type ?? 'audio/wav';
          audioBlob = new Blob([rawBuffer], { type: mimeType });
        } else if (args.url) {
          const response = await downloadAudio(args.url);
          if (!response.ok) {
            throw new Error(`Failed to download audio from ${args.url}: ${response.status} ${response.statusText}`);
          }
          const arrayBuffer = await response.arrayBuffer();

          filename = `audio.${audioExtensionFromUrl(args.url)}`;

          const contentType = response.headers.get('content-type') ?? 'audio/wav';
          audioBlob = new Blob([arrayBuffer], { type: contentType });
        }

        const endpoint = audioConfig.endpoint;
        const model = audioConfig.model ?? 'whisper-1';

        // The built-in Ollama endpoint takes a JSON body instead of a multipart upload.
        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: model,
              audio: args.data ?? args.url,
              stream: false,
            }),
          });

          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }

          const ollamaJson = await ollamaResponse.json() as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        // Validation guarantees one of data/url, so this only trips if that
        // invariant is broken; it also narrows the type for FormData.append.
        if (!audioBlob) {
          throw new Error('No audio payload available for transcription');
        }

        const formData = new FormData();
        formData.append('file', audioBlob, filename);
        formData.append('model', model);
        formData.append('response_format', 'json');

        if (args.language) {
          formData.append('language', args.language);
        }

        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // No explicit Content-Type: fetch derives the multipart boundary from the FormData body.
        const fetchOptions: RequestInit = {
          method: 'POST',
          body: formData,
        };

        if (audioConfig.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetchWithRetry(endpoint, fetchOptions);

        if (!response.ok) {
          const errorText = await readResponseBody(response);
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        const rawBody = await readResponseBody(response);
        const trimmedBody = rawBody.trim();

        // Parse as JSON only when the body looks like JSON; otherwise the raw
        // text itself is treated as the transcript candidate.
        let payload: unknown = rawBody;
        if (trimmedBody.startsWith('{') || trimmedBody.startsWith('[')) {
          try {
            payload = JSON.parse(rawBody) as unknown;
          } catch {
            payload = rawBody;
          }
        }

        const transcript = extractTranscriptionText(payload);
        if (transcript === undefined) {
          const endpointError = extractTranscriptionError(payload);
          if (endpointError) {
            throw new Error(`Transcription endpoint error: ${endpointError}`);
          }
          throw new Error(`Transcription response missing text field (body: ${truncateForError(rawBody)})`);
        }

        const normalizedTranscript = transcript.trim().length > 0 ? transcript : '[No speech detected]';
        return {
          success: true,
          output: normalizedTranscript,
        };
      } catch (error) {
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : 'Unknown error occurred',
        };
      }
    },
  };
}