diff --git a/docs/plans/state.json b/docs/plans/state.json index 1cf8a2c..d1cdb9b 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -1089,7 +1089,7 @@ }, "overall_progress": { - "total_test_count": 1331, + "total_test_count": 1369, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -1107,6 +1107,7 @@ "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)", "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", + "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback", "next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items" }, "soul_md_and_cron_create": { @@ -1137,6 +1138,56 @@ "src/backends/native/agent.test.ts" ] }, + "native-audio-support": { + "status": "completed", + "date": "2026-02-11", + "summary": "Native audio input support — voice messages passed directly to audio-capable models (Gemini, OpenAI, GitHub) instead of always transcribing via Whisper. Smart routing decides per-model whether to pass raw audio or transcribe first.", + "phases": { + "audio_transcribe_tool": { + "status": "completed", + "description": "audio.transcribe tool with Whisper-compatible API support", + "files_created": [ + "src/tools/builtin/audio-transcribe.ts" + ] + }, + "type_system_and_clients": { + "status": "completed", + "description": "AudioSource type, audio content part handling in all model clients (Gemini inlineData, OpenAI input_audio, GitHub input_audio = native; Anthropic, Bedrock = text fallback)", + "files_modified": [ + "src/models/types.ts", + "src/models/gemini.ts", + "src/models/openai.ts", + "src/models/github.ts", + "src/models/anthropic.ts", + "src/models/bedrock.ts", + "src/models/media.ts" + ] + }, + "capabilities_and_routing": { + "status": "completed", + "description": "supportsAudioInput() capability check, smart routing in daemon that transcribes for non-audio models and passes raw audio for capable ones, supports_audio config override", + "files_created": [ + "src/models/capabilities.ts", + "src/models/capabilities.test.ts" + ], + "files_modified": [ + "src/daemon/routing.ts", + "src/config/schema.ts" + ], + "test_status": "18/18 passing" + }, + "tests_and_token_estimation": { + "status": "completed", + "description": "Audio tests for media helpers, audio token estimation (base64→bytes→duration→tokens at 32 tokens/sec), supports_audio config override wiring", + "files_modified": [ + "src/models/media.test.ts", + "src/context/tokens.ts", + "src/context/tokens.test.ts" + ], + "test_status": "20/20 tokens tests, 87/87 media tests" + } + } + }, "stopreason-normalization": { "date": "2026-02-11", "summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",