Harden audio transcription arg hydration and add rewrite audit event

2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
@@ -51,6 +51,12 @@ describe('createAudioTranscribeTool', () => {
      expect(result.success).toBe(false);
      expect(result.error).toMatch(/Unsupported MIME type/);
    });
+
+    it('rejects invalid non-base64 data payloads', async () => {
+      const result = await tool.execute({ data: '[voice message data not provided]', mime_type: 'audio/ogg' });
+      expect(result.success).toBe(false);
+      expect(result.error).toMatch(/valid base64/i);
+    });
  });

  describe('URL validation (SSRF protection)', () => {
@@ -106,7 +112,7 @@ describe('createAudioTranscribeTool', () => {
      });
      mockFetch.mockResolvedValueOnce({
        ok: true,
-        json: async () => ({ text: 'hello' }),
+        text: async () => JSON.stringify({ text: 'hello' }),
      });

      const result = await tool.execute({ url: 'https://example.com/audio.wav' });
@@ -136,7 +142,7 @@ describe('createAudioTranscribeTool', () => {
    it('transcribes base64 audio data', async () => {
      mockFetch.mockResolvedValueOnce({
        ok: true,
-        json: async () => ({ text: 'Hello, world!' }),
+        text: async () => JSON.stringify({ text: 'Hello, world!' }),
      });

      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
@@ -152,7 +158,7 @@ describe('createAudioTranscribeTool', () => {
    it('sends Authorization header when apiKey is set', async () => {
      mockFetch.mockResolvedValueOnce({
        ok: true,
-        json: async () => ({ text: 'test' }),
+        text: async () => JSON.stringify({ text: 'test' }),
      });

      await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/ogg' });
@@ -164,7 +170,7 @@ describe('createAudioTranscribeTool', () => {
    it('passes language and prompt parameters', async () => {
      mockFetch.mockResolvedValueOnce({
        ok: true,
-        json: async () => ({ text: 'Hola mundo' }),
+        text: async () => JSON.stringify({ text: 'Hola mundo' }),
      });

      const result = await tool.execute({
@@ -176,6 +182,28 @@ describe('createAudioTranscribeTool', () => {
      expect(result.success).toBe(true);
      expect(result.output).toBe('Hola mundo');
    });
+
+    it('accepts plain-text transcription responses', async () => {
+      mockFetch.mockResolvedValueOnce({
+        ok: true,
+        text: async () => 'Plain transcript',
+      });
+
+      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
+      expect(result.success).toBe(true);
+      expect(result.output).toBe('Plain transcript');
+    });
+
+    it('returns a no-speech placeholder for empty transcript text', async () => {
+      mockFetch.mockResolvedValueOnce({
+        ok: true,
+        text: async () => JSON.stringify({ text: '' }),
+      });
+
+      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
+      expect(result.success).toBe(true);
+      expect(result.output).toBe('[No speech detected]');
+    });
  });

  describe('URL-based transcription', () => {
@@ -191,7 +219,7 @@ describe('createAudioTranscribeTool', () => {
      // Second fetch: transcription API
      mockFetch.mockResolvedValueOnce({
        ok: true,
-        json: async () => ({ text: 'URL transcription result' }),
+        text: async () => JSON.stringify({ text: 'URL transcription result' }),
      });

      const result = await tool.execute({ url: 'https://cdn.example.com/audio.mp3' });
@@ -207,7 +235,7 @@ describe('createAudioTranscribeTool', () => {
      });
      mockFetch.mockResolvedValueOnce({
        ok: true,
-        json: async () => ({ text: 'ogg result' }),
+        text: async () => JSON.stringify({ text: 'ogg result' }),
      });

      const result = await tool.execute({ url: 'https://cdn.example.com/voice' });
@@ -287,5 +315,27 @@ describe('createAudioTranscribeTool', () => {
      expect(result.success).toBe(false);
      expect(result.error).toMatch(/ECONNREFUSED/);
    });
+
+    it('returns clear error when transcription payload has no text field', async () => {
+      mockFetch.mockResolvedValueOnce({
+        ok: true,
+        text: async () => JSON.stringify({ id: 'abc123', status: 'ok' }),
+      });
+
+      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
+      expect(result.success).toBe(false);
+      expect(result.error).toMatch(/missing text field/i);
+    });
+
+    it('surfaces endpoint error payloads', async () => {
+      mockFetch.mockResolvedValueOnce({
+        ok: true,
+        text: async () => JSON.stringify({ error: { message: 'model not loaded' } }),
+      });
+
+      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
+      expect(result.success).toBe(false);
+      expect(result.error).toMatch(/endpoint error: model not loaded/i);
+    });
  });
 });
@@ -51,6 +51,22 @@ function validateUrl(url: string): { valid: boolean; error?: string } {
 }

 function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
+  if (args.data !== undefined && typeof args.data !== 'string') {
+    return { valid: false, error: 'data must be a base64 string when provided' };
+  }
+  if (args.url !== undefined && typeof args.url !== 'string') {
+    return { valid: false, error: 'url must be a string when provided' };
+  }
+  if (args.mime_type !== undefined && typeof args.mime_type !== 'string') {
+    return { valid: false, error: 'mime_type must be a string when provided' };
+  }
+  if (args.language !== undefined && typeof args.language !== 'string') {
+    return { valid: false, error: 'language must be a string when provided' };
+  }
+  if (args.prompt !== undefined && typeof args.prompt !== 'string') {
+    return { valid: false, error: 'prompt must be a string when provided' };
+  }
+
  const hasData = args.data !== undefined && args.data !== '';
  const hasUrl = args.url !== undefined && args.url !== '';

@@ -62,6 +78,22 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str
    return { valid: false, error: 'Only one of data or url can be provided' };
  }

+  if (hasData) {
+    const compact = (args.data ?? '').replace(/\s+/g, '');
+    const isBase64 = /^[A-Za-z0-9+/=]+$/.test(compact);
+    let hasDecodedBytes = false;
+    if (isBase64) {
+      try {
+        hasDecodedBytes = Buffer.from(compact, 'base64').length > 0;
+      } catch {
+        hasDecodedBytes = false;
+      }
+    }
+    if (!isBase64 || !hasDecodedBytes) {
+      return { valid: false, error: 'data must be valid base64-encoded audio bytes' };
+    }
+  }
+
  if (hasData && !args.mime_type) {
    return { valid: false, error: 'mime_type is required when using data' };
  }
@@ -84,6 +116,131 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str
  return { valid: true };
 }

+function extractTranscriptionText(payload: unknown): string | undefined { // Pulls the transcript string out of a response payload; returns undefined when no known shape matches.
+  if (typeof payload === 'string') { // a string payload is returned unchanged as the transcript
+    return payload;
+  }
+  if (!payload || typeof payload !== 'object') { // null, undefined and other non-object values carry no transcript
+    return undefined;
+  }
+
+  const obj = payload as Record<string, unknown>;
+  const directKeys = ['text', 'transcript', 'transcription', 'output']; // top-level keys are probed in this order; the first string value wins, and an empty string counts
+  for (const key of directKeys) {
+    const value = obj[key];
+    if (typeof value === 'string') {
+      return value;
+    }
+  }
+
+  if (obj.result && typeof obj.result === 'object') { // nested shape: result.text, else result.transcript
+    const resultObj = obj.result as Record<string, unknown>;
+    const nested = resultObj.text ?? resultObj.transcript; // transcript is consulted only when text is null or undefined
+    if (typeof nested === 'string') {
+      return nested;
+    }
+  }
+
+  if (obj.data && typeof obj.data === 'object') { // nested shape: data.text, else data.transcript
+    const dataObj = obj.data as Record<string, unknown>;
+    const nested = dataObj.text ?? dataObj.transcript; // transcript is consulted only when text is null or undefined
+    if (typeof nested === 'string') {
+      return nested;
+    }
+  }
+
+  if (Array.isArray(obj.results)) { // list shape: the first results entry yielding a string wins
+    for (const result of obj.results) {
+      if (!result || typeof result !== 'object') { // skip entries that are not objects
+        continue;
+      }
+      const resultObj = result as Record<string, unknown>;
+      if (typeof resultObj.text === 'string') { // an entry-level text string takes priority over its alternatives
+        return resultObj.text;
+      }
+
+      if (Array.isArray(resultObj.alternatives)) { // otherwise scan alternatives in order
+        for (const alternative of resultObj.alternatives) {
+          if (!alternative || typeof alternative !== 'object') { // skip alternatives that are not objects
+            continue;
+          }
+          const altObj = alternative as Record<string, unknown>;
+          const altTranscript = altObj.transcript ?? altObj.text; // here transcript is preferred and text is the fallback
+          if (typeof altTranscript === 'string') {
+            return altTranscript;
+          }
+        }
+      }
+    }
+  }
+
+  if (Array.isArray(obj.segments)) { // segment shape: non-blank segments[].text values joined with single spaces
+    const joined = obj.segments
+      .map((segment) => (segment && typeof segment === 'object'
+        ? (segment as Record<string, unknown>).text
+        : undefined))
+      .filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
+      .join(' ');
+    if (joined.trim().length > 0) { // a segment list with no usable text falls through to undefined
+      return joined;
+    }
+  }
+
+  return undefined; // no recognised transcript shape
+}
+
+function extractTranscriptionError(payload: unknown): string | undefined { // Finds an error message string in a response payload; returns undefined when none is present.
+  if (!payload || typeof payload !== 'object') { // only object payloads are inspected
+    return undefined;
+  }
+
+  const obj = payload as Record<string, unknown>;
+  if (typeof obj.error === 'string' && obj.error.trim().length > 0) { // error given directly as a non-blank string; returned untrimmed
+    return obj.error;
+  }
+
+  if (obj.error && typeof obj.error === 'object') { // error given as an object
+    const errorObj = obj.error as Record<string, unknown>;
+    const message = errorObj.message ?? errorObj.error; // error.error is consulted only when error.message is null or undefined
+    if (typeof message === 'string' && message.trim().length > 0) {
+      return message;
+    }
+  }
+
+  if (typeof obj.detail === 'string' && obj.detail.trim().length > 0) { // next fallback: a non-blank top-level detail string
+    return obj.detail;
+  }
+
+  if (typeof obj.message === 'string' && obj.message.trim().length > 0) { // last fallback: a non-blank top-level message string
+    return obj.message;
+  }
+
+  return undefined; // nothing that looks like an error message
+}
+
+function truncateForError(text: string, max = 180): string { // Collapses whitespace runs to single spaces, trims, and shortens the text to at most max UTF-16 code units plus a suffix.
+  const normalized = text.replace(/\s+/g, ' ').trim();
+  if (normalized.length <= max) { // short enough: returned without a suffix
+    return normalized;
+  }
+  return `${normalized.slice(0, max)}...`;
+} // longer text is cut at max and gets a three-dot suffix, so the result can be max + 3 long
+
+async function readResponseBody(response: Response): Promise<string> { // Returns the response body as a string, tolerating response objects that lack a text() method.
+  const textReader = response.text as unknown; // NOTE(review): presumably widened to unknown so the runtime typeof check also covers partial mock responses - confirm
+  if (typeof textReader === 'function') { // preferred path: read the raw body text
+    return await response.text();
+  }
+
+  const maybeJsonResponse = response as unknown as { json?: () => Promise<unknown> };
+  if (typeof maybeJsonResponse.json === 'function') { // fallback for objects exposing only json(): re-serialise the parsed value
+    const jsonPayload = await maybeJsonResponse.json();
+    return JSON.stringify(jsonPayload); // NOTE(review): JSON.stringify yields undefined, not a string, if json() resolves to undefined
+  }
+
+  return ''; // neither reader is available: treat the body as empty
+}
+
 interface AudioTranscriptionConfig {
  endpoint?: string;
  apiKey?: string;
@@ -146,7 +303,9 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig

        if (args.data) {
          const rawBuffer = Buffer.from(args.data, 'base64');
-          const audioBuffer = rawBuffer.buffer;
+          if (rawBuffer.length === 0) {
+            throw new Error('Decoded audio data is empty');
+          }

          const extMap: Record<string, string> = {
            'audio/ogg': 'ogg',
@@ -161,7 +320,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
          filename = `audio.${ext}`;

          const mimeType = args.mime_type ?? 'audio/wav';
-          audioBlob = new Blob([audioBuffer], { type: mimeType });
+          audioBlob = new Blob([rawBuffer], { type: mimeType });
        } else if (args.url) {
          const response = await fetch(args.url);
          if (!response.ok) {
@@ -204,6 +363,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
        const formData = new FormData();
        formData.append('file', audioBlob, filename);
        formData.append('model', model);
+        formData.append('response_format', 'json');

        if (args.language) {
          formData.append('language', args.language);
@@ -234,10 +394,30 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

-        const json = await response.json() as { text: string };
+        const rawBody = await readResponseBody(response);
+        const trimmedBody = rawBody.trim();
+        let payload: unknown = rawBody;
+        if (trimmedBody.startsWith('{') || trimmedBody.startsWith('[')) {
+          try {
+            payload = JSON.parse(rawBody) as unknown;
+          } catch {
+            payload = rawBody;
+          }
+        }
+
+        const transcript = extractTranscriptionText(payload);
+        if (transcript === undefined) {
+          const endpointError = extractTranscriptionError(payload);
+          if (endpointError) {
+            throw new Error(`Transcription endpoint error: ${endpointError}`);
+          }
+          throw new Error(`Transcription response missing text field (body: ${truncateForError(rawBody)})`);
+        }
+
+        const normalizedTranscript = transcript.trim().length > 0 ? transcript : '[No speech detected]';
        return {
          success: true,
-          output: json.text,
+          output: normalizedTranscript,
        };
      } catch (error) {
        return {