test: add stopReason edge case tests; update state.json with recent fixes

- Added test for finish_reason 'tool_calls' with an empty tool_calls array → 'end_turn'
- Added test for finish_reason 'length' → 'max_tokens'
- Updated state.json with new entries for today's fixes (SOUL.md, message normalization, agent loop resilience, stopReason normalization)
- Test count: 1329 → 1331
2026-02-11 09:51:19 -08:00
parent 01c3175fdb
commit 85d7a6bfec
2 changed files with 78 additions and 3 deletions
@@ -1,6 +1,6 @@
 {
  "version": "1.0",
-  "updated_at": "2026-02-10",
+  "updated_at": "2026-02-11",
  "description": "Tracks the status of all Flynn plans and implementation phases",

  "plans": {
@@ -1063,7 +1063,7 @@
  },

  "overall_progress": {
-    "total_test_count": 1329,
+    "total_test_count": 1331,
    "all_tests_passing": true,
    "p0_completion": "3/3 (100%)",
    "p1_completion": "4/4 (100%)",
@@ -1081,7 +1081,7 @@
    "feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
    "operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
    "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
-    "next_up": "GSD Milestone: Operator DX — Phase 3 Plan 02 (Dashboard UI consuming metrics RPC). All phases P0-P8 and Tiers 1-4 complete. Setup wizard added. TUI fullscreen mode now has full tool access and proper display. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
+    "next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
  },
  "soul_md_and_cron_create": {
    "date": "2026-02-11",
@@ -1092,5 +1092,34 @@
      "src/automation/cron.ts",
      "src/tools/policy.ts"
    ]
+  },
+  "local-model-message-normalization": {
+    "date": "2026-02-11",
+    "summary": "Ollama & llama.cpp tool calling message normalization — normalizeMessagesForOllama() converts tool_use/tool_result content blocks to Ollama's native role:tool format, normalizeMessagesForLlamaCpp() converts to OpenAI-style tool_calls arrays with hybrid fallback for GGUF templates that drop role:tool messages.",
+    "files_modified": [
+      "src/models/local/ollama.ts",
+      "src/models/local/ollama.test.ts",
+      "src/models/local/llamacpp.ts",
+      "src/models/local/llamacpp.test.ts"
+    ]
+  },
+  "agent-loop-resilience": {
+    "date": "2026-02-11",
+    "summary": "Agent loop same-tool nudging (after 4 consecutive identical tool calls, inject a nudge message) and try-catch error handling around individual tool executions to prevent daemon crashes.",
+    "files_modified": [
+      "src/backends/native/agent.ts",
+      "src/backends/native/agent.test.ts"
+    ]
+  },
+  "stopreason-normalization": {
+    "date": "2026-02-11",
+    "summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",
+    "files_modified": [
+      "src/models/openai.ts",
+      "src/models/openai.test.ts",
+      "src/models/github.ts",
+      "src/backends/native/agent.ts"
+    ],
+    "test_status": "4/4 passing"
  }
 }
@@ -74,4 +74,50 @@ describe('OpenAIClient tool use', () => {
      args: { command: 'ls' },
    });
  });
+
+  it('maps finish_reason "tool_calls" with empty tool_calls to end_turn', async () => {
+    mockCreate.mockResolvedValueOnce({
+      choices: [{
+        message: {
+          content: 'I tried to call a tool but none matched.',
+          tool_calls: [],
+        },
+        finish_reason: 'tool_calls',
+      }],
+      usage: { prompt_tokens: 15, completion_tokens: 10 },
+    });
+
+    const client = new OpenAIClient({
+      apiKey: 'test-key',
+      model: 'gpt-4o',
+    });
+
+    const response = await client.chat({
+      messages: [{ role: 'user', content: 'do something' }],
+    });
+
+    expect(response.stopReason).toBe('end_turn');
+    expect(response.toolCalls).toBeUndefined();
+  });
+
+  it('maps finish_reason "length" to max_tokens', async () => {
+    mockCreate.mockResolvedValueOnce({
+      choices: [{
+        message: { content: 'Truncated output...' },
+        finish_reason: 'length',
+      }],
+      usage: { prompt_tokens: 100, completion_tokens: 4096 },
+    });
+
+    const client = new OpenAIClient({
+      apiKey: 'test-key',
+      model: 'gpt-4o',
+    });
+
+    const response = await client.chat({
+      messages: [{ role: 'user', content: 'write a long essay' }],
+    });
+
+    expect(response.stopReason).toBe('max_tokens');
+  });
 });