From 85d7a6bfec8e963eb684c01e1302f1119a1ce5e0 Mon Sep 17 00:00:00 2001
From: William Valentin <william.valentin.info@gmail.com>
Date: Wed, 11 Feb 2026 09:51:19 -0800
Subject: [PATCH] test: add stopReason edge case tests; update state.json with
 recent fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added test for finish_reason 'tool_calls' with empty array → 'end_turn'
- Added test for finish_reason 'length' → 'max_tokens'
- Updated state.json with 3 new entries for today's fixes (message
  normalization, agent loop resilience, stopReason normalization),
  alongside the existing SOUL.md entry
- Test count: 1329 → 1331
---
 docs/plans/state.json     | 35 ++++++++++++++++++++++++++---
 src/models/openai.test.ts | 46 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/docs/plans/state.json b/docs/plans/state.json
index 2703278..a10a792 100644
--- a/docs/plans/state.json
+++ b/docs/plans/state.json
@@ -1,6 +1,6 @@
 {
   "version": "1.0",
-  "updated_at": "2026-02-10",
+  "updated_at": "2026-02-11",
   "description": "Tracks the status of all Flynn plans and implementation phases",
 
   "plans": {
@@ -1063,7 +1063,7 @@
   },
 
   "overall_progress": {
-    "total_test_count": 1329,
+    "total_test_count": 1331,
     "all_tests_passing": true,
     "p0_completion": "3/3 (100%)",
     "p1_completion": "4/4 (100%)",
@@ -1081,7 +1081,7 @@
     "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
     "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
     "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
-    "next_up": "GSD Milestone: Operator DX — Phase 3 Plan 02 (Dashboard UI consuming metrics RPC). All phases P0-P8 and Tiers 1-4 complete. Setup wizard added. TUI fullscreen mode now has full tool access and proper display. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
+    "next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
   },
   "soul_md_and_cron_create": {
     "date": "2026-02-11",
@@ -1092,5 +1092,34 @@
       "src/automation/cron.ts",
       "src/tools/policy.ts"
     ]
+  },
+  "local-model-message-normalization": {
+    "date": "2026-02-11",
+    "summary": "Ollama & llama.cpp tool calling message normalization — normalizeMessagesForOllama() converts tool_use/tool_result content blocks to Ollama's native role:tool format, normalizeMessagesForLlamaCpp() converts to OpenAI-style tool_calls arrays with hybrid fallback for GGUF templates that drop role:tool messages.",
+    "files_modified": [
+      "src/models/local/ollama.ts",
+      "src/models/local/ollama.test.ts",
+      "src/models/local/llamacpp.ts",
+      "src/models/local/llamacpp.test.ts"
+    ]
+  },
+  "agent-loop-resilience": {
+    "date": "2026-02-11",
+    "summary": "Agent loop same-tool nudging (after 4 consecutive identical tool calls, inject a nudge message) and try-catch error handling around individual tool executions to prevent daemon crashes.",
+    "files_modified": [
+      "src/backends/native/agent.ts",
+      "src/backends/native/agent.test.ts"
+    ]
+  },
+  "stopreason-normalization": {
+    "date": "2026-02-11",
+    "summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",
+    "files_modified": [
+      "src/models/openai.ts",
+      "src/models/openai.test.ts",
+      "src/models/github.ts",
+      "src/backends/native/agent.ts"
+    ],
+    "test_status": "4/4 passing"
   }
 }
\ No newline at end of file
diff --git a/src/models/openai.test.ts b/src/models/openai.test.ts
index bd33647..2a9c14f 100644
--- a/src/models/openai.test.ts
+++ b/src/models/openai.test.ts
@@ -74,4 +74,50 @@ describe('OpenAIClient tool use', () => {
       args: { command: 'ls' },
     });
   });
+
+  it('maps finish_reason "tool_calls" with empty tool_calls to end_turn', async () => {
+    mockCreate.mockResolvedValueOnce({
+      choices: [{
+        message: {
+          content: 'I tried to call a tool but none matched.',
+          tool_calls: [],
+        },
+        finish_reason: 'tool_calls',
+      }],
+      usage: { prompt_tokens: 15, completion_tokens: 10 },
+    });
+
+    const client = new OpenAIClient({
+      apiKey: 'test-key',
+      model: 'gpt-4o',
+    });
+
+    const response = await client.chat({
+      messages: [{ role: 'user', content: 'do something' }],
+    });
+
+    expect(response.stopReason).toBe('end_turn');
+    expect(response.toolCalls).toBeUndefined();
+  });
+
+  it('maps finish_reason "length" to max_tokens', async () => {
+    mockCreate.mockResolvedValueOnce({
+      choices: [{
+        message: { content: 'Truncated output...' },
+        finish_reason: 'length',
+      }],
+      usage: { prompt_tokens: 100, completion_tokens: 4096 },
+    });
+
+    const client = new OpenAIClient({
+      apiKey: 'test-key',
+      model: 'gpt-4o',
+    });
+
+    const response = await client.chat({
+      messages: [{ role: 'user', content: 'write a long essay' }],
+    });
+
+    expect(response.stopReason).toBe('max_tokens');
+  });
 });