From 487f26e36da88d25d37acfbdb9ac13eaf91736a6 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Sun, 22 Feb 2026 19:54:58 -0800 Subject: [PATCH] Harden audio transcription fetch path with retries and timeout --- docs/plans/state.json | 17 +++++++- docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md | 19 +++++++++ src/models/media.test.ts | 20 +++++++++ src/models/media.ts | 49 +++++++++++++++++++++- src/tools/builtin/audio-transcribe.test.ts | 25 ++++++++++- src/tools/builtin/audio-transcribe.ts | 49 +++++++++++++++++++++- 6 files changed, 175 insertions(+), 4 deletions(-) diff --git a/docs/plans/state.json b/docs/plans/state.json index 63b4212..33f5217 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -6114,6 +6114,21 @@ "README.md" ], "test_status": "docs-only update; no runtime code changes" + }, + "audio-transcription-fetch-retry-hardening": { + "status": "completed", + "date": "2026-02-23", + "updated": "2026-02-23", + "summary": "Added retry+timeout hardening for transient `fetch failed` errors in both pre-transcription (`models/media`) and `audio.transcribe`, with focused regression tests and runbook updates for timestamp correlation and endpoint diagnostics.", + "files_modified": [ + "src/models/media.ts", + "src/models/media.test.ts", + "src/tools/builtin/audio-transcribe.ts", + "src/tools/builtin/audio-transcribe.test.ts", + "docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/tools/builtin/audio-transcribe.test.ts src/models/media.test.ts" } }, "overall_progress": { @@ -6135,7 +6150,7 @@ "feature_gap_scorecard": "128/128 match (100%), 0 partial (0%), 0 missing (0%)", "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 2/2 plans complete — milestone done", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", - "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper 
transcription fallback, plus 2026-02-23 arg hydration hardening and tool.args_rewritten audit metric", + "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback, plus 2026-02-23 arg hydration hardening, tool.args_rewritten audit metric, and transient fetch retry/timeout hardening", "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening", "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas" }, diff --git a/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md b/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md index 9c74725..6490fbe 100644 --- a/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md +++ b/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md @@ -39,9 +39,28 @@ curl -sS -i -X POST http://localhost:18801/v1/audio/transcriptions \ - `Transcription endpoint error: FFmpeg conversion failed.` - Endpoint could not decode payload as audio. Often caused by model-provided fake or mismatched `data`/`mime_type`. +- `fetch failed` + - Flynn could not connect to the transcription endpoint for that attempt (transport/connectivity timeout/reset). + - Confirm endpoint is reachable from Flynn host and check `whisper-server` logs around the same timestamp. + - If this is intermittent, Flynn now retries transient failures before returning an error. + - `[No speech detected]` - Request succeeded and endpoint returned empty transcript text. 
+## Correlate Events By Timestamp + +To inspect one Telegram session with wall-clock timestamps: + +```bash +tail -n 2000 ~/.local/share/flynn/audit.log | jq -c ' + select( + .event.session_id=="telegram:8367012007" and + (.event_type=="user.action" or .event_type=="backend.route" or .event_type=="tool.start" or .event_type=="tool.success" or .event_type=="tool.error") + ) + | . + {ts_iso: ((.timestamp/1000)|strftime("%Y-%m-%d %H:%M:%S %Z"))} +' +``` + ## Rewrite Metric Flynn emits `tool.args_rewritten` whenever it replaces model-provided `audio.transcribe` args with trusted session audio bytes. diff --git a/src/models/media.test.ts b/src/models/media.test.ts index 3acf419..f9a598d 100644 --- a/src/models/media.test.ts +++ b/src/models/media.test.ts @@ -440,6 +440,26 @@ describe('transcribeAudio', () => { expect(result).toBe('[Audio message transcription failed]'); }); + it('retries transient fetch failure and succeeds on a later attempt', async () => { + vi.mocked(global.fetch) + .mockRejectedValueOnce(new TypeError('fetch failed')) + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ text: mockTranscript }), + } as Response); + + const config: AudioTranscriptionConfig = { + endpoint: 'https://api.example.com/v1/audio/transcriptions', + apiKey: 'test-key', + model: 'test-model', + }; + + const result = await transcribeAudio(oggAudioAttachment, config); + + expect(result).toBe(mockTranscript); + expect(global.fetch).toHaveBeenCalledTimes(2); + }); + // Positive: uses Whisper-1 model by default. 
it('uses whisper-1 model by default', async () => { const config: AudioTranscriptionConfig = { diff --git a/src/models/media.ts b/src/models/media.ts index 96c9ec7..8d376f5 100644 --- a/src/models/media.ts +++ b/src/models/media.ts @@ -24,6 +24,53 @@ const SUPPORTED_AUDIO_TYPES = new Set([ 'audio/x-m4a', ]); +/** Total fetch attempts per call of the retry helper, first try included. */ const TRANSCRIPTION_FETCH_MAX_ATTEMPTS = 3; +/** Per-attempt abort timeout in milliseconds. */ const TRANSCRIPTION_FETCH_TIMEOUT_MS = 45_000; +/** Backoff before the second attempt in milliseconds; doubled for each later attempt. */ const TRANSCRIPTION_FETCH_BASE_DELAY_MS = 250; + +/** Resolves after ms milliseconds using setTimeout. */ function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** Returns true when the lower-cased error message (or String(error) for non-Error values) contains one of the connectivity markers listed below. This is a substring match only; error codes and error.cause are not inspected. */ function isTransientNetworkError(error: unknown): boolean { + const message = (error instanceof Error ? error.message : String(error)).toLowerCase(); + return message.includes('fetch failed') + || message.includes('network') + || message.includes('timeout') + || message.includes('timed out') + || message.includes('econnrefused') + || message.includes('econnreset') + || message.includes('enotfound') + || message.includes('ehostunreach'); +} + +/** Calls fetch(endpoint, init) up to TRANSCRIPTION_FETCH_MAX_ATTEMPTS times. Each attempt gets its own AbortController that is aborted after TRANSCRIPTION_FETCH_TIMEOUT_MS; that signal replaces any signal already present in init. Whatever Response fetch resolves with is returned as-is, so HTTP error statuses are not retried here. A rejection is retried only when it is an AbortError or matches isTransientNetworkError, after sleeping TRANSCRIPTION_FETCH_BASE_DELAY_MS * 2^(attempt - 1) ms. Otherwise, or once attempts are used up, a new Error naming the endpoint, the attempt count and the last message is thrown; the original error object is not attached. */ async function fetchTranscriptionWithRetry(endpoint: string, init: RequestInit): Promise { + for (let attempt = 1; attempt <= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; attempt += 1) { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), TRANSCRIPTION_FETCH_TIMEOUT_MS); + try { + return await fetch(endpoint, { ...init, signal: controller.signal }); + } catch (error) { + /* The only signal passed to fetch is the one from this controller, so an AbortError is treated as the per-attempt timeout firing. */ const timedOut = error instanceof Error && error.name === 'AbortError'; + const retriable = timedOut || isTransientNetworkError(error); + const normalizedMessage = timedOut + ? `request timed out after ${TRANSCRIPTION_FETCH_TIMEOUT_MS}ms` + : (error instanceof Error ? 
error.message : String(error)); + const exhausted = attempt >= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; + /* Give up on the last attempt or on a non-retriable error; otherwise back off and loop again. */ if (!retriable || exhausted) { + throw new Error( + `Transcription request to ${endpoint} failed after ${attempt} attempt(s): ${normalizedMessage}`, + ); + } + await sleep(TRANSCRIPTION_FETCH_BASE_DELAY_MS * (2 ** (attempt - 1))); + } finally { + /* Runs for every attempt (success, throw, or after the backoff sleep) so the timer never outlives its attempt. */ clearTimeout(timeout); + } + } + + /* The final loop iteration always returns or throws, so this line is a fallback for the declared return type. */ throw new Error(`Transcription request to ${endpoint} failed after retries`); +} + /** Check whether an attachment is a supported image type. */ export function isSupportedImage(attachment: Attachment): boolean { return SUPPORTED_IMAGE_TYPES.has(attachment.mimeType); @@ -257,7 +304,7 @@ export async function transcribeAudio( headers['Authorization'] = `Bearer ${config.apiKey}`; } - const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers }); + const res = await fetchTranscriptionWithRetry(config.endpoint, { method: 'POST', body: formData, headers }); if (!res.ok) { throw new Error(`Transcription failed: ${res.status} ${res.statusText}`); } diff --git a/src/tools/builtin/audio-transcribe.test.ts b/src/tools/builtin/audio-transcribe.test.ts index 3e614ee..cdc4857 100644 --- a/src/tools/builtin/audio-transcribe.test.ts +++ b/src/tools/builtin/audio-transcribe.test.ts @@ -309,13 +309,36 @@ describe('createAudioTranscribeTool', () => { }); it('handles network errors gracefully', async () => { - mockFetch.mockRejectedValueOnce(new Error('ECONNREFUSED')); + /* Reject every call, not only the first: ECONNREFUSED matches the transient-error check, so the tool now retries before reporting failure. */ mockFetch.mockRejectedValue(new Error('ECONNREFUSED')); const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); expect(result.success).toBe(false); expect(result.error).toMatch(/ECONNREFUSED/); }); + it('retries transient fetch failures before succeeding', async () => { + mockFetch + .mockRejectedValueOnce(new TypeError('fetch failed')) + .mockResolvedValueOnce({ + ok: true, + text: async () => JSON.stringify({ text: 'Recovered transcript' }), + }); + + const result = await tool.execute({ data: 
'AAAAAAA=', mime_type: 'audio/wav' }); + expect(result.success).toBe(true); + expect(result.output).toBe('Recovered transcript'); + /* One rejected call plus one successful retry. */ expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it('returns endpoint context when transient failures are exhausted', async () => { + mockFetch.mockRejectedValue(new TypeError('fetch failed')); + + const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); + expect(result.success).toBe(false); + /* 3 is the value of TRANSCRIPTION_FETCH_MAX_ATTEMPTS in audio-transcribe.ts. */ expect(result.error).toMatch(/failed after 3 attempt/); + expect(result.error).toMatch(/audio\/transcriptions/); + }); + it('returns clear error when transcription payload has no text field', async () => { mockFetch.mockResolvedValueOnce({ ok: true, diff --git a/src/tools/builtin/audio-transcribe.ts b/src/tools/builtin/audio-transcribe.ts index fed70a0..8f81bde 100644 --- a/src/tools/builtin/audio-transcribe.ts +++ b/src/tools/builtin/audio-transcribe.ts @@ -25,6 +25,53 @@ const PROVIDER_ENDPOINTS: Record = { llamacpp: 'http://localhost:8080/v1/audio/transcriptions', }; +/** Total fetch attempts per call of the retry helper, first try included. */ const TRANSCRIPTION_FETCH_MAX_ATTEMPTS = 3; +/** Per-attempt abort timeout in milliseconds. */ const TRANSCRIPTION_FETCH_TIMEOUT_MS = 45_000; +/** Backoff before the second attempt in milliseconds; doubled for each later attempt. */ const TRANSCRIPTION_FETCH_BASE_DELAY_MS = 250; + +/** Resolves after ms milliseconds using setTimeout. */ function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** Returns true when the lower-cased error message (or String(error) for non-Error values) contains one of the connectivity markers listed below. This is a substring match only; error codes and error.cause are not inspected. */ function isTransientNetworkError(error: unknown): boolean { + const message = (error instanceof Error ? 
error.message : String(error)).toLowerCase(); + return message.includes('fetch failed') + || message.includes('network') + || message.includes('timeout') + || message.includes('timed out') + || message.includes('econnrefused') + || message.includes('econnreset') + || message.includes('enotfound') + || message.includes('ehostunreach'); +} + +/** Calls fetch(endpoint, init) up to TRANSCRIPTION_FETCH_MAX_ATTEMPTS times. Each attempt gets its own AbortController that is aborted after TRANSCRIPTION_FETCH_TIMEOUT_MS; that signal replaces any signal already present in init. Whatever Response fetch resolves with is returned as-is, so HTTP error statuses are not retried here. A rejection is retried only when it is an AbortError or matches isTransientNetworkError, after sleeping TRANSCRIPTION_FETCH_BASE_DELAY_MS * 2^(attempt - 1) ms. Otherwise, or once attempts are used up, a new Error naming the endpoint, the attempt count and the last message is thrown; the original error object is not attached. NOTE(review): same logic as fetchTranscriptionWithRetry added to src/models/media.ts by this patch; keep the two copies in sync. */ async function fetchWithRetry(endpoint: string, init: RequestInit): Promise { + for (let attempt = 1; attempt <= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; attempt += 1) { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), TRANSCRIPTION_FETCH_TIMEOUT_MS); + try { + return await fetch(endpoint, { ...init, signal: controller.signal }); + } catch (error) { + /* The only signal passed to fetch is the one from this controller, so an AbortError is treated as the per-attempt timeout firing. */ const timedOut = error instanceof Error && error.name === 'AbortError'; + const normalizedMessage = timedOut + ? `request timed out after ${TRANSCRIPTION_FETCH_TIMEOUT_MS}ms` + : (error instanceof Error ? error.message : String(error)); + const retriable = timedOut || isTransientNetworkError(error); + const exhausted = attempt >= TRANSCRIPTION_FETCH_MAX_ATTEMPTS; + /* Give up on the last attempt or on a non-retriable error; otherwise back off and loop again. */ if (!retriable || exhausted) { + throw new Error( + `Transcription request to ${endpoint} failed after ${attempt} attempt(s): ${normalizedMessage}`, + ); + } + await sleep(TRANSCRIPTION_FETCH_BASE_DELAY_MS * (2 ** (attempt - 1))); + } finally { + /* Runs for every attempt (success, throw, or after the backoff sleep) so the timer never outlives its attempt. */ clearTimeout(timeout); + } + } + + /* The final loop iteration always returns or throws, so this line is a fallback for the declared return type. */ throw new Error(`Transcription request to ${endpoint} failed after retries`); +} + function validateUrl(url: string): { valid: boolean; error?: string } { let parsed: URL; try { @@ -387,7 +434,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig fetchOptions.headers = headers; } - const response = await fetch(endpoint, fetchOptions); + const response = await fetchWithRetry(endpoint, fetchOptions); if (!response.ok) { const errorText = await response.text();