From 12802b9b24c3847879304a28bb403e2f55428637 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Sun, 22 Feb 2026 18:56:26 -0800 Subject: [PATCH] Document voice transcription hardening and storage behavior --- README.md | 7 ++ docs/plans/state.json | 23 +++++- docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md | 89 ++++++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md diff --git a/README.md b/README.md index 0dee735..bbb0a04 100644 --- a/README.md +++ b/README.md @@ -428,6 +428,13 @@ docker run -d \ # docker compose up -d ``` +Audio persistence and diagnostics: + +- Latest inbound voice bytes are stored per-session in `~/.local/share/flynn/sessions.db` under `session_config.key = "lastAudioAttachment"` (used to safely hydrate `audio.transcribe` calls). +- `/reset` clears session history and session config for that chat, including `lastAudioAttachment`. +- When Flynn rewrites bad model-provided audio tool args, it emits audit event `tool.args_rewritten`. +- Runbook: `docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md`. + ### Text-to-Speech (TTS) Reply Audio Flynn can attach synthesized voice replies (OpenAI-compatible `/v1/audio/speech`) alongside text responses. 
diff --git a/docs/plans/state.json b/docs/plans/state.json index e127417..37f42c8 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -6019,6 +6019,27 @@ "docs/plans/state.json" ], "test_status": "pnpm test:run src/councils/orchestrator.test.ts src/tools/builtin/council-run.test.ts src/commands/builtin/index.test.ts src/config/schema.test.ts src/daemon/routing.test.ts src/tools/policy.test.ts + pnpm typecheck passing" + }, + "audit-followup-audio-transcribe-arg-hardening": { + "status": "completed", + "date": "2026-02-23", + "updated": "2026-02-23", + "summary": "Hardened audio.transcribe argument hydration for Telegram voice flows by prioritizing trusted latest-turn/persisted audio bytes, validating model-supplied audio signatures, broadening transcription response parsing, and adding audit observability via tool.args_rewritten.", + "files_modified": [ + "src/backends/native/agent.ts", + "src/backends/native/agent.test.ts", + "src/daemon/routing.ts", + "src/daemon/routing.test.ts", + "src/tools/builtin/audio-transcribe.ts", + "src/tools/builtin/audio-transcribe.test.ts", + "src/tools/executor.ts", + "src/tools/executor.test.ts", + "src/audit/types.ts", + "src/audit/logger.ts", + "README.md", + "docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md" + ], + "test_status": "pnpm test:run src/backends/native/agent.test.ts src/daemon/routing.test.ts src/tools/builtin/audio-transcribe.test.ts src/tools/executor.test.ts; pnpm typecheck; pnpm build" } }, "overall_progress": { @@ -6040,7 +6061,7 @@ "feature_gap_scorecard": "128/128 match (100%), 0 partial (0%), 0 missing (0%)", "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 2/2 plans complete — milestone done", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", - "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback", + "native_audio_support": "completed — smart routing for 
native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback, plus 2026-02-23 arg hydration hardening and tool.args_rewritten audit metric", "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening", "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas" }, diff --git a/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md b/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md new file mode 100644 index 0000000..84917a3 --- /dev/null +++ b/docs/runbooks/VOICE_TRANSCRIPTION_DEBUG.md @@ -0,0 +1,89 @@ +# Voice Transcription Debug Runbook + +This runbook covers Telegram voice-message troubleshooting for `audio.transcribe`. + +## Fast Checks + +1. Confirm tool is enabled in config: + +```yaml +audio: + enabled: true + provider: + endpoint: http://localhost:18801/v1/audio/transcriptions +``` + +2. Confirm recent tool events: + +```bash +tail -n 400 ~/.local/share/flynn/audit.log | jq -c 'select(.event_type=="tool.start" or .event_type=="tool.success" or .event_type=="tool.error" or .event_type=="tool.args_rewritten")' +``` + +3. If needed, confirm local endpoint behavior directly: + +```bash +curl -sS -i -X POST http://localhost:18801/v1/audio/transcriptions \ + -F file=@/tmp/sample.ogg \ + -F model=whisper-1 \ + -F response_format=json +``` + +## Interpreting Common Errors + +- `Either data or url must be provided` + - Model/tool call had empty args and no hydrated attachment data. + +- `Only http/https URLs are allowed, got file:` + - Model emitted `file://...` URL; Flynn should rewrite from latest session audio. + +- `Transcription endpoint error: FFmpeg conversion failed.` + - Endpoint could not decode payload as audio. Often caused by model-provided fake or mismatched `data`/`mime_type`. 
+ +- `[No speech detected]` + - Request succeeded and endpoint returned empty transcript text. + +## Rewrite Metric + +Flynn emits `tool.args_rewritten` whenever it replaces model-provided `audio.transcribe` args with trusted session audio bytes. + +Fields: + +- `source`: `latest_turn`, `persisted`, or `history` +- `reason`: `latest_audio_preferred`, `voice_turn_fallback`, `invalid_model_args`, or `missing_model_args` +- `original_*` and `final_mime_type` for quick diagnosis + +Example: + +```json +{ + "event_type": "tool.args_rewritten", + "event": { + "tool_name": "audio.transcribe", + "source": "persisted", + "reason": "voice_turn_fallback", + "original_mime_type": "audio/ogg", + "final_mime_type": "audio/ogg" + } +} +``` + +## Where Audio Is Stored Locally + +Audio bytes are not written as standalone files by default. They are persisted in SQLite: + +- DB path: `~/.local/share/flynn/sessions.db` +- Table: `session_config` +- Key: `lastAudioAttachment` +- Value: JSON with `data` (base64) or `url`, plus `mimeType` + +List sessions that have a stored attachment, with the size of each stored value (the query returns `length(value)`, not the audio payload itself): + +```bash +sqlite3 ~/.local/share/flynn/sessions.db \ + "SELECT session_id,key,length(value) FROM session_config WHERE key='lastAudioAttachment';" +``` + +## Data Lifecycle + +- `session.clear()` (e.g. `/reset`) removes messages, tool executions, and session config for that session. +- Session TTL pruning removes stale sessions and associated config from SQLite.