test: add stopReason edge case tests; update state.json with recent fixes

- Added test for finish_reason 'tool_calls' with empty array → 'end_turn'
- Added test for finish_reason 'length' → 'max_tokens'
- Updated state.json with 4 new entries for today's fixes (SOUL.md, message
  normalization, agent loop resilience, stopReason normalization)
- Test count: 1329 → 1331
This commit is contained in:
William Valentin
2026-02-11 09:51:19 -08:00
parent 01c3175fdb
commit 85d7a6bfec
2 changed files with 78 additions and 3 deletions
+32 -3
View File
@@ -1,6 +1,6 @@
{
"version": "1.0",
"updated_at": "2026-02-10",
"updated_at": "2026-02-11",
"description": "Tracks the status of all Flynn plans and implementation phases",
"plans": {
@@ -1063,7 +1063,7 @@
},
"overall_progress": {
"total_test_count": 1329,
"total_test_count": 1331,
"all_tests_passing": true,
"p0_completion": "3/3 (100%)",
"p1_completion": "4/4 (100%)",
@@ -1081,7 +1081,7 @@
"feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
"operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
"gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
"next_up": "GSD Milestone: Operator DX — Phase 3 Plan 02 (Dashboard UI consuming metrics RPC). All phases P0-P8 and Tiers 1-4 complete. Setup wizard added. TUI fullscreen mode now has full tool access and proper display. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
"next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
},
"soul_md_and_cron_create": {
"date": "2026-02-11",
@@ -1092,5 +1092,34 @@
"src/automation/cron.ts",
"src/tools/policy.ts"
]
},
"local-model-message-normalization": {
"date": "2026-02-11",
"summary": "Ollama & llama.cpp tool calling message normalization — normalizeMessagesForOllama() converts tool_use/tool_result content blocks to Ollama's native role:tool format, normalizeMessagesForLlamaCpp() converts to OpenAI-style tool_calls arrays with hybrid fallback for GGUF templates that drop role:tool messages.",
"files_modified": [
"src/models/local/ollama.ts",
"src/models/local/ollama.test.ts",
"src/models/local/llamacpp.ts",
"src/models/local/llamacpp.test.ts"
]
},
"agent-loop-resilience": {
"date": "2026-02-11",
"summary": "Agent loop same-tool nudging (after 4 consecutive identical tool calls, inject a nudge message) and try-catch error handling around individual tool executions to prevent daemon crashes.",
"files_modified": [
"src/backends/native/agent.ts",
"src/backends/native/agent.test.ts"
]
},
"stopreason-normalization": {
"date": "2026-02-11",
"summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",
"files_modified": [
"src/models/openai.ts",
"src/models/openai.test.ts",
"src/models/github.ts",
"src/backends/native/agent.ts"
],
"test_status": "4/4 passing"
}
}
+46
View File
@@ -74,4 +74,50 @@ describe('OpenAIClient tool use', () => {
args: { command: 'ls' },
});
});
it('maps finish_reason "tool_calls" with empty tool_calls to end_turn', async () => {
  // The mocked completion reports finish_reason 'tool_calls' yet carries an
  // empty tool_calls array alongside plain text content.
  mockCreate.mockResolvedValueOnce({
    choices: [
      {
        message: {
          content: 'I tried to call a tool but none matched.',
          tool_calls: [],
        },
        finish_reason: 'tool_calls',
      },
    ],
    usage: { prompt_tokens: 15, completion_tokens: 10 },
  });

  const { stopReason, toolCalls } = await new OpenAIClient({
    apiKey: 'test-key',
    model: 'gpt-4o',
  }).chat({
    messages: [{ role: 'user', content: 'do something' }],
  });

  // With nothing to execute, the turn must be reported as finished and no
  // tool calls may be surfaced on the response.
  expect(stopReason).toBe('end_turn');
  expect(toolCalls).toBeUndefined();
});
it('maps finish_reason "length" to max_tokens', async () => {
  // The mocked completion was cut off: finish_reason is 'length'.
  mockCreate.mockResolvedValueOnce({
    choices: [
      {
        message: { content: 'Truncated output...' },
        finish_reason: 'length',
      },
    ],
    usage: { prompt_tokens: 100, completion_tokens: 4096 },
  });

  const { stopReason } = await new OpenAIClient({
    apiKey: 'test-key',
    model: 'gpt-4o',
  }).chat({
    messages: [{ role: 'user', content: 'write a long essay' }],
  });

  // 'length' is expected to surface as the 'max_tokens' stop reason.
  expect(stopReason).toBe('max_tokens');
});
});