From 85d7a6bfec8e963eb684c01e1302f1119a1ce5e0 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Wed, 11 Feb 2026 09:51:19 -0800 Subject: [PATCH] test: add stopReason edge case tests; update state.json with recent fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added tests for finish_reason 'tool_calls' with empty array → 'end_turn' - Added test for finish_reason 'length' → 'max_tokens' - Updated state.json with 4 new entries for today's fixes (SOUL.md, message normalization, agent loop resilience, stopReason normalization) - Test count: 1329 → 1331 --- docs/plans/state.json | 35 ++++++++++++++++++++++++++--- src/models/openai.test.ts | 46 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/docs/plans/state.json b/docs/plans/state.json index 2703278..a10a792 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -1,6 +1,6 @@ { "version": "1.0", - "updated_at": "2026-02-10", + "updated_at": "2026-02-11", "description": "Tracks the status of all Flynn plans and implementation phases", "plans": { @@ -1063,7 +1063,7 @@ }, "overall_progress": { - "total_test_count": 1329, + "total_test_count": 1331, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -1081,7 +1081,7 @@ "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)", "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next", "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", - "next_up": "GSD Milestone: Operator DX — Phase 3 Plan 02 (Dashboard UI consuming metrics RPC). All phases P0-P8 and Tiers 1-4 complete. Setup wizard added. TUI fullscreen mode now has full tool access and proper display. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items" + "next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items" }, "soul_md_and_cron_create": { "date": "2026-02-11", @@ -1092,5 +1092,34 @@ "src/automation/cron.ts", "src/tools/policy.ts" ] + }, + "local-model-message-normalization": { + "date": "2026-02-11", + "summary": "Ollama & llama.cpp tool calling message normalization — normalizeMessagesForOllama() converts tool_use/tool_result content blocks to Ollama's native role:tool format, normalizeMessagesForLlamaCpp() converts to OpenAI-style tool_calls arrays with hybrid fallback for GGUF templates that drop role:tool messages.", + "files_modified": [ + "src/models/local/ollama.ts", + "src/models/local/ollama.test.ts", + "src/models/local/llamacpp.ts", + "src/models/local/llamacpp.test.ts" + ] + }, + "agent-loop-resilience": { + "date": "2026-02-11", + "summary": "Agent loop same-tool nudging (after 4 consecutive identical tool calls, inject a nudge message) and try-catch error handling around individual tool executions to prevent daemon crashes.", + "files_modified": [ + "src/backends/native/agent.ts", + "src/backends/native/agent.test.ts" + ] + }, + "stopreason-normalization": { + "date": "2026-02-11", + "summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.", + "files_modified": [ + "src/models/openai.ts", + "src/models/openai.test.ts", + "src/models/github.ts", + "src/backends/native/agent.ts" + ], + "test_status": "4/4 passing" } } \ No newline at end of file diff --git a/src/models/openai.test.ts b/src/models/openai.test.ts index bd33647..2a9c14f 100644 --- a/src/models/openai.test.ts +++ b/src/models/openai.test.ts @@ -74,4 +74,50 @@ describe('OpenAIClient tool use', () => { args: { command: 'ls' }, }); }); + + it('maps finish_reason "tool_calls" with empty tool_calls to end_turn', async () => { + mockCreate.mockResolvedValueOnce({ + choices: [{ + message: { + content: 'I tried to call a tool but none matched.', + tool_calls: [], + }, + finish_reason: 'tool_calls', + }], + usage: { prompt_tokens: 15, completion_tokens: 10 }, + }); + + const client = new OpenAIClient({ + apiKey: 'test-key', + model: 'gpt-4o', + }); + + const response = await client.chat({ + messages: [{ role: 'user', content: 'do something' }], + }); + + expect(response.stopReason).toBe('end_turn'); + expect(response.toolCalls).toBeUndefined(); + }); + + it('maps finish_reason "length" to max_tokens', async () => { + mockCreate.mockResolvedValueOnce({ + choices: [{ + message: { content: 'Truncated output...' }, + finish_reason: 'length', + }], + usage: { prompt_tokens: 100, completion_tokens: 4096 }, + }); + + const client = new OpenAIClient({ + apiKey: 'test-key', + model: 'gpt-4o', + }); + + const response = await client.chat({ + messages: [{ role: 'user', content: 'write a long essay' }], + }); + + expect(response.stopReason).toBe('max_tokens'); + }); });