Harden audio transcription fetch path with retries and timeout

2026-02-22 19:54:58 -08:00
parent abaa9be3f1
commit 487f26e36d
6 changed files with 175 additions and 4 deletions
@@ -6114,6 +6114,21 @@
        "README.md"
      ],
      "test_status": "docs-only update; no runtime code changes"
+    },
+    "audio-transcription-fetch-retry-hardening": {
+      "status": "completed",
+      "date": "2026-02-23",
+      "updated": "2026-02-23",
+      "summary": "Added retry+timeout hardening for transient `fetch failed` errors in both pre-transcription (`models/media`) and `audio.transcribe`, with focused regression tests and runbook updates for timestamp correlation and endpoint diagnostics.",
+      "files_modified": [
+        "src/models/media.ts",
+        "src/models/media.test.ts",
+        "src/tools/builtin/audio-transcribe.ts",
+        "src/tools/builtin/audio-transcribe.test.ts",
+        "docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md",
+        "docs/plans/state.json"
+      ],
+      "test_status": "pnpm test:run src/tools/builtin/audio-transcribe.test.ts src/models/media.test.ts"
    }
  },
  "overall_progress": {
@@ -6135,7 +6150,7 @@
    "feature_gap_scorecard": "128/128 match (100%), 0 partial (0%), 0 missing (0%)",
    "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 2/2 plans complete — milestone done",
    "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
-    "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback, plus 2026-02-23 arg hydration hardening and tool.args_rewritten audit metric",
+    "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback, plus 2026-02-23 arg hydration hardening, tool.args_rewritten audit metric, and transient fetch retry/timeout hardening",
    "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
    "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas"
  },
@@ -39,9 +39,28 @@ curl -sS -i -X POST http://localhost:18801/v1/audio/transcriptions \
 - `Transcription endpoint error: FFmpeg conversion failed.`
  - Endpoint could not decode payload as audio. Often caused by model-provided fake or mismatched `data`/`mime_type`.

+- `fetch failed`
+  - Flynn could not reach the transcription endpoint on that attempt (a transport-level failure such as a refused connection, a connection reset, or a timeout).
+  - Confirm endpoint is reachable from Flynn host and check `whisper-server` logs around the same timestamp.
+  - If this is intermittent, Flynn now retries transient failures (up to 3 attempts, each with a 45-second timeout) before returning an error that names the endpoint and the attempt count.
+
 - `[No speech detected]`
  - Request succeeded and endpoint returned empty transcript text.

+## Correlate Events By Timestamp
+
+To inspect one Telegram session with human-readable UTC timestamps (replace the example `session_id` with the session you are investigating):
+
+```bash
+tail -n 2000 ~/.local/share/flynn/audit.log | jq -c '
+  select(
+    .event.session_id=="telegram:8367012007" and
+    (.event_type=="user.action" or .event_type=="backend.route" or .event_type=="tool.start" or .event_type=="tool.success" or .event_type=="tool.error")
+  )
+  | . + {ts_iso: ((.timestamp/1000)|strftime("%Y-%m-%d %H:%M:%S %Z"))}
+'
+```
+
 ## Rewrite Metric

 Flynn emits `tool.args_rewritten` whenever it replaces model-provided `audio.transcribe` args with trusted session audio bytes.
@@ -440,6 +440,26 @@ describe('transcribeAudio', () => {
    expect(result).toBe('[Audio message transcription failed]');
  });

+  it('retries transient fetch failure and succeeds on a later attempt', async () => { // regression test for the retry path in transcribeAudio
+    vi.mocked(global.fetch)
+      .mockRejectedValueOnce(new TypeError('fetch failed')) // first call: transport-level rejection
+      .mockResolvedValueOnce({ // second call: successful response carrying the transcript
+        ok: true,
+        json: async () => ({ text: mockTranscript }),
+      } as Response);
+
+    const config: AudioTranscriptionConfig = {
+      endpoint: 'https://api.example.com/v1/audio/transcriptions',
+      apiKey: 'test-key',
+      model: 'test-model',
+    };
+
+    const result = await transcribeAudio(oggAudioAttachment, config); // NOTE(review): no fake timers are installed in this test body; unless set up elsewhere, this waits the real first backoff delay
+
+    expect(result).toBe(mockTranscript);
+    expect(global.fetch).toHaveBeenCalledTimes(2); // one failed call plus one successful retry
+  });
+
  // Positive: uses Whisper-1 model by default.
  it('uses whisper-1 model by default', async () => {
    const config: AudioTranscriptionConfig = {
@@ -24,6 +24,53 @@ const SUPPORTED_AUDIO_TYPES = new Set([
  'audio/x-m4a',
 ]);

+const TRANSCRIPTION_FETCH_MAX_ATTEMPTS = 3; // total tries per request, counting the first one
+const TRANSCRIPTION_FETCH_TIMEOUT_MS = 45_000; // per-attempt abort deadline, in milliseconds
+const TRANSCRIPTION_FETCH_BASE_DELAY_MS = 250; // delay before the first retry, in milliseconds; doubled for each later retry
+
+function sleep(ms: number): Promise<void> { // returns a promise that resolves once a `ms`-millisecond timer fires; it never rejects
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+function isTransientNetworkError(error: unknown): boolean { // true when the error text contains one of the connectivity markers listed below
+  const message = (error instanceof Error ? error.message : String(error)).toLowerCase(); // lower-cased so the substring checks are case-insensitive; only the message is inspected, not `cause` or `code`
+  return message.includes('fetch failed')
+    || message.includes('network')
+    || message.includes('timeout')
+    || message.includes('timed out')
+    || message.includes('econnrefused')
+    || message.includes('econnreset')
+    || message.includes('enotfound')
+    || message.includes('ehostunreach');
+}
+
+async function fetchTranscriptionWithRetry(endpoint: string, init: RequestInit): Promise<Response> { // fetches `endpoint` with a per-attempt timeout, retrying transient failures with exponential backoff
+  for (let attempt = 1; attempt <= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; attempt += 1) {
+    const controller = new AbortController(); // fresh controller per attempt so an earlier abort cannot cancel a later try
+    const timeout = setTimeout(() => controller.abort(), TRANSCRIPTION_FETCH_TIMEOUT_MS); // aborts this attempt once the deadline passes
+    try {
+      return await fetch(endpoint, { ...init, signal: controller.signal }); // spread order means this signal replaces any `signal` in `init`; any response is returned as-is, HTTP status is not checked here
+    } catch (error) {
+      const timedOut = error instanceof Error && error.name === 'AbortError'; // the timer-driven abort is expected to surface as an error named 'AbortError'
+      const retriable = timedOut || isTransientNetworkError(error);
+      const normalizedMessage = timedOut
+        ? `request timed out after ${TRANSCRIPTION_FETCH_TIMEOUT_MS}ms`
+        : (error instanceof Error ? error.message : String(error));
+      const exhausted = attempt >= TRANSCRIPTION_FETCH_MAX_ATTEMPTS;
+      if (!retriable || exhausted) { // give up at once on non-transient errors, or after the final attempt
+        throw new Error( // a new plain Error is thrown; the original error object is not attached as `cause`
+          `Transcription request to ${endpoint} failed after ${attempt} attempt(s): ${normalizedMessage}`,
+        );
+      }
+      await sleep(TRANSCRIPTION_FETCH_BASE_DELAY_MS * (2 ** (attempt - 1))); // backoff: base delay after attempt 1, twice that after attempt 2
+    } finally {
+      clearTimeout(timeout); // runs on every exit path; on the retry path it runs only after the backoff sleep completes
+    }
+  }
+
+  throw new Error(`Transcription request to ${endpoint} failed after retries`); // not reached by the loop above (the last attempt always returns or throws); presumably kept so every path visibly ends in return/throw
+}
+
 /** Check whether an attachment is a supported image type. */
 export function isSupportedImage(attachment: Attachment): boolean {
  return SUPPORTED_IMAGE_TYPES.has(attachment.mimeType);
@@ -257,7 +304,7 @@ export async function transcribeAudio(
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

-    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
+    const res = await fetchTranscriptionWithRetry(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }
@@ -309,13 +309,36 @@ describe('createAudioTranscribeTool', () => {
    });

    it('handles network errors gracefully', async () => {
-      mockFetch.mockRejectedValueOnce(new Error('ECONNREFUSED'));
+      mockFetch.mockRejectedValue(new Error('ECONNREFUSED')); // persistent rejection (not ...Once): ECONNREFUSED is treated as transient, so every retry attempt must fail too

      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
      expect(result.success).toBe(false);
      expect(result.error).toMatch(/ECONNREFUSED/);
    });

+    it('retries transient fetch failures before succeeding', async () => { // regression test for the retry path behind tool.execute
+      mockFetch
+        .mockRejectedValueOnce(new TypeError('fetch failed')) // first call: transport-level rejection
+        .mockResolvedValueOnce({ // second call: successful response; the body is exposed through text() as a JSON string
+          ok: true,
+          text: async () => JSON.stringify({ text: 'Recovered transcript' }),
+        });
+
+      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); // NOTE(review): no fake timers are installed in this test body; unless set up elsewhere, this waits the real first backoff delay
+      expect(result.success).toBe(true);
+      expect(result.output).toBe('Recovered transcript');
+      expect(mockFetch).toHaveBeenCalledTimes(2); // one failed call plus one successful retry
+    });
+
+    it('returns endpoint context when transient failures are exhausted', async () => { // checks the final error names both the attempt count and the endpoint
+      mockFetch.mockRejectedValue(new TypeError('fetch failed')); // persistent rejection so every attempt fails
+
+      const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); // NOTE(review): no fake timers are installed in this test body; unless set up elsewhere, this waits both real backoff delays
+      expect(result.success).toBe(false);
+      expect(result.error).toMatch(/failed after 3 attempt/); // 3 matches TRANSCRIPTION_FETCH_MAX_ATTEMPTS in audio-transcribe.ts
+      expect(result.error).toMatch(/audio\/transcriptions/); // a fragment of the endpoint URL must appear in the message
+    });
+
    it('returns clear error when transcription payload has no text field', async () => {
      mockFetch.mockResolvedValueOnce({
        ok: true,
@@ -25,6 +25,53 @@ const PROVIDER_ENDPOINTS: Record<string, string> = {
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
 };

+const TRANSCRIPTION_FETCH_MAX_ATTEMPTS = 3; // total tries per request, counting the first one
+const TRANSCRIPTION_FETCH_TIMEOUT_MS = 45_000; // per-attempt abort deadline, in milliseconds
+const TRANSCRIPTION_FETCH_BASE_DELAY_MS = 250; // delay before the first retry, in milliseconds; doubled for each later retry
+
+function sleep(ms: number): Promise<void> { // returns a promise that resolves once a `ms`-millisecond timer fires; it never rejects
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+function isTransientNetworkError(error: unknown): boolean { // true when the error text contains one of the connectivity markers listed below
+  const message = (error instanceof Error ? error.message : String(error)).toLowerCase(); // lower-cased so the substring checks are case-insensitive; only the message is inspected, not `cause` or `code`
+  return message.includes('fetch failed')
+    || message.includes('network')
+    || message.includes('timeout')
+    || message.includes('timed out')
+    || message.includes('econnrefused')
+    || message.includes('econnreset')
+    || message.includes('enotfound')
+    || message.includes('ehostunreach');
+}
+
+async function fetchWithRetry(endpoint: string, init: RequestInit): Promise<Response> { // fetches `endpoint` with a per-attempt timeout, retrying transient failures with exponential backoff; NOTE(review): same logic as fetchTranscriptionWithRetry in src/models/media.ts — consider sharing one helper
+  for (let attempt = 1; attempt <= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; attempt += 1) {
+    const controller = new AbortController(); // fresh controller per attempt so an earlier abort cannot cancel a later try
+    const timeout = setTimeout(() => controller.abort(), TRANSCRIPTION_FETCH_TIMEOUT_MS); // aborts this attempt once the deadline passes
+    try {
+      return await fetch(endpoint, { ...init, signal: controller.signal }); // spread order means this signal replaces any `signal` in `init`; any response is returned as-is, HTTP status is not checked here
+    } catch (error) {
+      const timedOut = error instanceof Error && error.name === 'AbortError'; // the timer-driven abort is expected to surface as an error named 'AbortError'
+      const normalizedMessage = timedOut
+        ? `request timed out after ${TRANSCRIPTION_FETCH_TIMEOUT_MS}ms`
+        : (error instanceof Error ? error.message : String(error));
+      const retriable = timedOut || isTransientNetworkError(error);
+      const exhausted = attempt >= TRANSCRIPTION_FETCH_MAX_ATTEMPTS;
+      if (!retriable || exhausted) { // give up at once on non-transient errors, or after the final attempt
+        throw new Error( // a new plain Error is thrown; the original error object is not attached as `cause`
+          `Transcription request to ${endpoint} failed after ${attempt} attempt(s): ${normalizedMessage}`,
+        );
+      }
+      await sleep(TRANSCRIPTION_FETCH_BASE_DELAY_MS * (2 ** (attempt - 1))); // backoff: base delay after attempt 1, twice that after attempt 2
+    } finally {
+      clearTimeout(timeout); // runs on every exit path; on the retry path it runs only after the backoff sleep completes
+    }
+  }
+
+  throw new Error(`Transcription request to ${endpoint} failed after retries`); // not reached by the loop above (the last attempt always returns or throws); presumably kept so every path visibly ends in return/throw
+}
+
 function validateUrl(url: string): { valid: boolean; error?: string } {
  let parsed: URL;
  try {
@@ -387,7 +434,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
          fetchOptions.headers = headers;
        }

-        const response = await fetch(endpoint, fetchOptions);
+        const response = await fetchWithRetry(endpoint, fetchOptions);

        if (!response.ok) {
          const errorText = await response.text();