Harden audio transcription arg hydration and add rewrite audit event

This commit is contained in:
William Valentin
2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
+184 -4
View File
@@ -51,6 +51,22 @@ function validateUrl(url: string): { valid: boolean; error?: string } {
}
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
if (args.data !== undefined && typeof args.data !== 'string') {
return { valid: false, error: 'data must be a base64 string when provided' };
}
if (args.url !== undefined && typeof args.url !== 'string') {
return { valid: false, error: 'url must be a string when provided' };
}
if (args.mime_type !== undefined && typeof args.mime_type !== 'string') {
return { valid: false, error: 'mime_type must be a string when provided' };
}
if (args.language !== undefined && typeof args.language !== 'string') {
return { valid: false, error: 'language must be a string when provided' };
}
if (args.prompt !== undefined && typeof args.prompt !== 'string') {
return { valid: false, error: 'prompt must be a string when provided' };
}
const hasData = args.data !== undefined && args.data !== '';
const hasUrl = args.url !== undefined && args.url !== '';
@@ -62,6 +78,22 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str
return { valid: false, error: 'Only one of data or url can be provided' };
}
if (hasData) {
const compact = (args.data ?? '').replace(/\s+/g, '');
const isBase64 = /^[A-Za-z0-9+/=]+$/.test(compact);
let hasDecodedBytes = false;
if (isBase64) {
try {
hasDecodedBytes = Buffer.from(compact, 'base64').length > 0;
} catch {
hasDecodedBytes = false;
}
}
if (!isBase64 || !hasDecodedBytes) {
return { valid: false, error: 'data must be valid base64-encoded audio bytes' };
}
}
if (hasData && !args.mime_type) {
return { valid: false, error: 'mime_type is required when using data' };
}
@@ -84,6 +116,131 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str
return { valid: true };
}
/**
 * Pulls the transcript text out of a transcription endpoint response.
 *
 * Known response shapes are probed in this order; the first string found wins:
 *  1. the payload itself, when it is already a string;
 *  2. top-level `text`, `transcript`, `transcription`, `output`;
 *  3. `result.text`, then `result.transcript`;
 *  4. `data.text`, then `data.transcript`;
 *  5. each `results[]` entry: its `text`, then every
 *     `alternatives[]` entry's `transcript`, then `text`;
 *  6. `segments[].text`, with blank entries dropped and the rest joined by a
 *     single space.
 *
 * For shapes 1-5 an empty string counts as a match and is returned as-is.
 * A key that is present but does not hold a string is skipped, so a
 * mistyped `text` no longer hides a valid `transcript` beside it.
 *
 * @param payload - Parsed response body, or the raw body string.
 * @returns The transcript text, or `undefined` when no known shape matches.
 */
function extractTranscriptionText(payload: unknown): string | undefined {
  if (typeof payload === 'string') {
    return payload;
  }
  if (!payload || typeof payload !== 'object') {
    return undefined;
  }

  // Narrow an unknown value to an indexable record (arrays included, as before).
  const asRecord = (value: unknown): Record<string, unknown> | undefined =>
    value && typeof value === 'object' ? (value as Record<string, unknown>) : undefined;

  // Return the value of the first listed key that actually holds a string.
  const firstString = (
    record: Record<string, unknown>,
    keys: readonly string[],
  ): string | undefined => {
    for (const key of keys) {
      const candidate = record[key];
      if (typeof candidate === 'string') {
        return candidate;
      }
    }
    return undefined;
  };

  const obj = payload as Record<string, unknown>;

  const direct = firstString(obj, ['text', 'transcript', 'transcription', 'output']);
  if (direct !== undefined) {
    return direct;
  }

  // Single-object wrappers: `result` is consulted before `data`.
  for (const wrapperKey of ['result', 'data']) {
    const wrapper = asRecord(obj[wrapperKey]);
    if (wrapper) {
      const nested = firstString(wrapper, ['text', 'transcript']);
      if (nested !== undefined) {
        return nested;
      }
    }
  }

  if (Array.isArray(obj.results)) {
    for (const entry of obj.results as unknown[]) {
      const resultObj = asRecord(entry);
      if (!resultObj) {
        continue;
      }
      if (typeof resultObj.text === 'string') {
        return resultObj.text;
      }
      if (Array.isArray(resultObj.alternatives)) {
        for (const alternative of resultObj.alternatives as unknown[]) {
          const altObj = asRecord(alternative);
          if (!altObj) {
            continue;
          }
          // Alternatives prefer `transcript` over `text`.
          const altTranscript = firstString(altObj, ['transcript', 'text']);
          if (altTranscript !== undefined) {
            return altTranscript;
          }
        }
      }
    }
  }

  if (Array.isArray(obj.segments)) {
    const joined = (obj.segments as unknown[])
      .map((segment) => asRecord(segment)?.text)
      .filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
      .join(' ');
    if (joined.trim().length > 0) {
      return joined;
    }
  }

  return undefined;
}
/**
 * Looks for a human-readable error message inside a transcription endpoint
 * response body.
 *
 * Candidates are checked in this order; the first non-blank string wins:
 *  1. `error`, when it is a string;
 *  2. `error.message`, then `error.error`, when `error` is an object;
 *  3. top-level `detail`;
 *  4. top-level `message`.
 *
 * A candidate that is empty, whitespace-only, or not a string is skipped, so
 * a blank `error.message` no longer hides a usable `error.error`. The
 * matched string is returned untrimmed.
 *
 * @param payload - Parsed response body.
 * @returns The error message, or `undefined` when none is found or the
 *          payload is not an object.
 */
function extractTranscriptionError(payload: unknown): string | undefined {
  if (!payload || typeof payload !== 'object') {
    return undefined;
  }

  const isNonBlankString = (value: unknown): value is string =>
    typeof value === 'string' && value.trim().length > 0;

  const obj = payload as Record<string, unknown>;
  const topLevelError = obj.error;

  if (isNonBlankString(topLevelError)) {
    return topLevelError;
  }

  if (topLevelError && typeof topLevelError === 'object') {
    const errorObj = topLevelError as Record<string, unknown>;
    // Probe each key independently instead of `message ?? error`, which only
    // falls back when `message` is null/undefined.
    for (const key of ['message', 'error']) {
      const candidate = errorObj[key];
      if (isNonBlankString(candidate)) {
        return candidate;
      }
    }
  }

  for (const key of ['detail', 'message']) {
    const candidate = obj[key];
    if (isNonBlankString(candidate)) {
      return candidate;
    }
  }

  return undefined;
}
/**
 * Produces a compact, single-line excerpt of `text` for embedding in an
 * error message.
 *
 * Every run of whitespace is collapsed to one space and the ends are trimmed.
 * When the result is longer than `max` characters, it is cut to `max`
 * characters and `...` is appended.
 *
 * @param text - Arbitrary text, e.g. a raw response body.
 * @param max - Maximum length kept before the ellipsis (default 180).
 * @returns The normalized, possibly truncated text.
 */
function truncateForError(text: string, max = 180): string {
  const singleLine = text.replace(/\s+/g, ' ').trim();
  return singleLine.length > max ? `${singleLine.slice(0, max)}...` : singleLine;
}
/**
 * Reads a response body as a string without assuming the object implements
 * the full `Response` interface.
 *
 * - If `response.text` is a function, its result is returned.
 * - Otherwise, if `response.json` is a function, the parsed payload is
 *   re-serialized with `JSON.stringify`.
 * - Otherwise an empty string is returned.
 *
 * NOTE(review): the runtime `typeof` checks presumably exist so partial
 * stand-ins (e.g. test mocks exposing only `json()`) keep working — confirm
 * against callers.
 *
 * @param response - The fetch response, or a compatible partial object.
 * @returns The body text; always a string, never `undefined`.
 */
async function readResponseBody(response: Response): Promise<string> {
  const textReader = response.text as unknown;
  if (typeof textReader === 'function') {
    return await response.text();
  }

  const maybeJsonResponse = response as unknown as { json?: () => Promise<unknown> };
  if (typeof maybeJsonResponse.json === 'function') {
    const jsonPayload = await maybeJsonResponse.json();
    // JSON.stringify yields `undefined` (not a string) for an `undefined`
    // payload; coalesce so the declared `Promise<string>` contract holds.
    const serialized = JSON.stringify(jsonPayload) as string | undefined;
    return serialized ?? '';
  }

  return '';
}
interface AudioTranscriptionConfig {
endpoint?: string;
apiKey?: string;
@@ -146,7 +303,9 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
if (args.data) {
const rawBuffer = Buffer.from(args.data, 'base64');
const audioBuffer = rawBuffer.buffer;
if (rawBuffer.length === 0) {
throw new Error('Decoded audio data is empty');
}
const extMap: Record<string, string> = {
'audio/ogg': 'ogg',
@@ -161,7 +320,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
filename = `audio.${ext}`;
const mimeType = args.mime_type ?? 'audio/wav';
audioBlob = new Blob([audioBuffer], { type: mimeType });
audioBlob = new Blob([rawBuffer], { type: mimeType });
} else if (args.url) {
const response = await fetch(args.url);
if (!response.ok) {
@@ -204,6 +363,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
const formData = new FormData();
formData.append('file', audioBlob, filename);
formData.append('model', model);
formData.append('response_format', 'json');
if (args.language) {
formData.append('language', args.language);
@@ -234,10 +394,30 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
}
const json = await response.json() as { text: string };
const rawBody = await readResponseBody(response);
const trimmedBody = rawBody.trim();
let payload: unknown = rawBody;
if (trimmedBody.startsWith('{') || trimmedBody.startsWith('[')) {
try {
payload = JSON.parse(rawBody) as unknown;
} catch {
payload = rawBody;
}
}
const transcript = extractTranscriptionText(payload);
if (transcript === undefined) {
const endpointError = extractTranscriptionError(payload);
if (endpointError) {
throw new Error(`Transcription endpoint error: ${endpointError}`);
}
throw new Error(`Transcription response missing text field (body: ${truncateForError(rawBody)})`);
}
const normalizedTranscript = transcript.trim().length > 0 ? transcript : '[No speech detected]';
return {
success: true,
output: json.text,
output: normalizedTranscript,
};
} catch (error) {
return {