docs: update state.json with native audio support feature and test count (1369)

2026-02-11 18:27:50 -08:00
parent 148219153e
commit c62dad2e2e
1 changed file with 52 additions and 1 deletion
@@ -1089,7 +1089,7 @@
  },

  "overall_progress": {
-    "total_test_count": 1331,
+    "total_test_count": 1369,
    "all_tests_passing": true,
    "p0_completion": "3/3 (100%)",
    "p1_completion": "4/4 (100%)",
@@ -1107,6 +1107,7 @@
    "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
    "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
    "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
+    "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback",
    "next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
  },
  "soul_md_and_cron_create": {
@@ -1137,6 +1138,56 @@
      "src/backends/native/agent.test.ts"
    ]
  },
+  "native-audio-support": {
+    "status": "completed",
+    "date": "2026-02-11",
+    "summary": "Native audio input support — voice messages passed directly to audio-capable models (Gemini, OpenAI, GitHub) instead of always transcribing via Whisper. Smart routing decides per-model whether to pass raw audio or transcribe first.",
+    "phases": {
+      "audio_transcribe_tool": {
+        "status": "completed",
+        "description": "audio.transcribe tool with Whisper-compatible API support",
+        "files_created": [
+          "src/tools/builtin/audio-transcribe.ts"
+        ]
+      },
+      "type_system_and_clients": {
+        "status": "completed",
+        "description": "AudioSource type, audio content part handling in all model clients (Gemini inlineData, OpenAI input_audio, GitHub input_audio = native; Anthropic, Bedrock = text fallback)",
+        "files_modified": [
+          "src/models/types.ts",
+          "src/models/gemini.ts",
+          "src/models/openai.ts",
+          "src/models/github.ts",
+          "src/models/anthropic.ts",
+          "src/models/bedrock.ts",
+          "src/models/media.ts"
+        ]
+      },
+      "capabilities_and_routing": {
+        "status": "completed",
+        "description": "supportsAudioInput() capability check, smart routing in daemon that transcribes for non-audio models and passes raw audio for capable ones, supports_audio config override",
+        "files_created": [
+          "src/models/capabilities.ts",
+          "src/models/capabilities.test.ts"
+        ],
+        "files_modified": [
+          "src/daemon/routing.ts",
+          "src/config/schema.ts"
+        ],
+        "test_status": "18/18 passing"
+      },
+      "tests_and_token_estimation": {
+        "status": "completed",
+        "description": "Audio tests for media helpers, audio token estimation (base64→bytes→duration→tokens at 32 tokens/sec), supports_audio config override wiring",
+        "files_modified": [
+          "src/models/media.test.ts",
+          "src/context/tokens.ts",
+          "src/context/tokens.test.ts"
+        ],
+        "test_status": "20/20 tokens tests, 87/87 media tests"
+      }
+    }
+  },
  "stopreason-normalization": {
    "date": "2026-02-11",
    "summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",