From 62126038216cc8a6ebb0ca319f4b9cd5f18e83fc Mon Sep 17 00:00:00 2001 From: William Valentin Date: Sat, 14 Feb 2026 00:43:48 -0800 Subject: [PATCH] fix(models): tighten audio capability + correct openai oauth content --- docs/plans/state.json | 10 +++++++++ src/models/capabilities.ts | 46 +++++++++++++++++++++----------------- src/models/openai.ts | 3 ++- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/docs/plans/state.json b/docs/plans/state.json index bea1ab7..36447b3 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -89,6 +89,16 @@ ], "test_status": "pnpm test:run src/auth/anthropic.test.ts + pnpm typecheck passing" }, + "openai-oauth-and-audio-capability-tweaks": { + "status": "completed", + "date": "2026-02-14", + "summary": "Fixed OpenAI OAuth (Codex) message shaping by using output_text for assistant turns, and made native-audio detection conservative via a model allowlist so Flynn does not attempt to send raw audio to models that cannot consume it.", + "files_modified": [ + "src/models/openai.ts", + "src/models/capabilities.ts" + ], + "test_status": "pnpm typecheck passing (no new tests added in this change)" + }, "p0-p1-implementation-plan": { "file": "2026-02-06-p0-p1-implementation-plan.md", "status": "completed", diff --git a/src/models/capabilities.ts b/src/models/capabilities.ts index adc35fe..379cd5e 100644 --- a/src/models/capabilities.ts +++ b/src/models/capabilities.ts @@ -7,22 +7,26 @@ export type ModelProvider = 'anthropic' | 'openai' | 'gemini' | 'bedrock' | 'github' | 'ollama' | 'llamacpp' | 'openrouter' | 'zhipuai' | 'xai'; -/** Providers that support native audio input in their API. */ -const AUDIO_CAPABLE_PROVIDERS = new Set([ - 'gemini', - 'openai', - 'github', // GitHub Models uses OpenAI-compatible API +/** + * Models known to support native audio input via their API. 
+ * Except for all-audio providers (AUDIO_CAPABLE_PROVIDERS), we allowlist models to + * avoid silently dropping audio for models that don't actually handle audio content parts. + */ +const AUDIO_CAPABLE_MODELS = new Set([ + // Gemini — all current models support audio + 'gemini-2.0-flash', + 'gemini-2.0-pro', + 'gemini-1.5-flash', + 'gemini-1.5-pro', + // OpenAI — only multimodal audio models + 'gpt-4o', + 'gpt-4o-mini', + 'gpt-4o-audio-preview', ]); -/** - * Models known NOT to support audio despite their provider supporting it. - * For example, older OpenAI models or specialized models. - */ -const AUDIO_INCAPABLE_MODELS = new Set([ - // Older OpenAI models that predate audio input support - 'gpt-3.5-turbo', - 'gpt-4', - 'gpt-4-turbo', +/** Providers where all models support audio (e.g. Gemini). */ +const AUDIO_CAPABLE_PROVIDERS = new Set([ + 'gemini', ]); /** @@ -34,15 +38,15 @@ const AUDIO_INCAPABLE_MODELS = new Set([ export function supportsAudioInput(provider: string, model: string, override?: boolean): boolean { if (override !== undefined) {return override;} - // Provider must be in the capable set - if (!AUDIO_CAPABLE_PROVIDERS.has(provider)) { - return false; + // Provider-level blanket (all models support audio) + if (AUDIO_CAPABLE_PROVIDERS.has(provider)) { + return true; } - // Check model-specific exclusions - if (AUDIO_INCAPABLE_MODELS.has(model)) { - return false; + // Model-level allowlist + if (AUDIO_CAPABLE_MODELS.has(model)) { + return true; } - return true; + return false; } diff --git a/src/models/openai.ts b/src/models/openai.ts index a5b74c1..f779490 100644 --- a/src/models/openai.ts +++ b/src/models/openai.ts @@ -96,9 +96,10 @@ export class OpenAIClient implements ModelClient { .map((m) => { const text = getMessageTextWithTools(m); if (!text) {return null;} + const contentType = m.role === 'assistant' ? 
'output_text' : 'input_text'; return { role: m.role, - content: [{ type: 'input_text', text }], + content: [{ type: contentType, text }], }; }) .filter((x): x is NonNullable<typeof x> => Boolean(x));