test: add stopReason edge case tests; update state.json with recent fixes
- Added test for finish_reason 'tool_calls' with empty array → 'end_turn'
- Added test for finish_reason 'length' → 'max_tokens'
- Updated state.json with 4 new entries for today's fixes (SOUL.md, message normalization, agent loop resilience, stopReason normalization)
- Test count: 1329 → 1331
This commit is contained in:
+32
-3
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"version": "1.0",
|
||||
"updated_at": "2026-02-10",
|
||||
"updated_at": "2026-02-11",
|
||||
"description": "Tracks the status of all Flynn plans and implementation phases",
|
||||
|
||||
"plans": {
|
||||
@@ -1063,7 +1063,7 @@
|
||||
},
|
||||
|
||||
"overall_progress": {
|
||||
"total_test_count": 1329,
|
||||
"total_test_count": 1331,
|
||||
"all_tests_passing": true,
|
||||
"p0_completion": "3/3 (100%)",
|
||||
"p1_completion": "4/4 (100%)",
|
||||
@@ -1081,7 +1081,7 @@
|
||||
"feature_gap_scorecard": "100/128 match (78%), 0 partial (0%), 28 missing (22%)",
|
||||
"operator_dx_milestone": "Phase 3 (Live Ops Dashboard): 1/2 plans complete — metrics backend done, dashboard UI next",
|
||||
"gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
|
||||
"next_up": "GSD Milestone: Operator DX — Phase 3 Plan 02 (Dashboard UI consuming metrics RPC). All phases P0-P8 and Tiers 1-4 complete. Setup wizard added. TUI fullscreen mode now has full tool access and proper display. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
|
||||
"next_up": "End-to-end test that Flynn follows through on tool calls via GitHub Copilot fallback. Remaining gaps: Tier 4 channels (Signal, Matrix, Teams, Google Chat), Tier 5 deferred/niche items"
|
||||
},
|
||||
"soul_md_and_cron_create": {
|
||||
"date": "2026-02-11",
|
||||
@@ -1092,5 +1092,34 @@
|
||||
"src/automation/cron.ts",
|
||||
"src/tools/policy.ts"
|
||||
]
|
||||
},
|
||||
"local-model-message-normalization": {
|
||||
"date": "2026-02-11",
|
||||
"summary": "Ollama & llama.cpp tool calling message normalization — normalizeMessagesForOllama() converts tool_use/tool_result content blocks to Ollama's native role:tool format, normalizeMessagesForLlamaCpp() converts to OpenAI-style tool_calls arrays with hybrid fallback for GGUF templates that drop role:tool messages.",
|
||||
"files_modified": [
|
||||
"src/models/local/ollama.ts",
|
||||
"src/models/local/ollama.test.ts",
|
||||
"src/models/local/llamacpp.ts",
|
||||
"src/models/local/llamacpp.test.ts"
|
||||
]
|
||||
},
|
||||
"agent-loop-resilience": {
|
||||
"date": "2026-02-11",
|
||||
"summary": "Agent loop same-tool nudging (after 4 consecutive identical tool calls, inject a nudge message) and try-catch error handling around individual tool executions to prevent daemon crashes.",
|
||||
"files_modified": [
|
||||
"src/backends/native/agent.ts",
|
||||
"src/backends/native/agent.test.ts"
|
||||
]
|
||||
},
|
||||
"stopreason-normalization": {
|
||||
"date": "2026-02-11",
|
||||
"summary": "Normalize OpenAI/GitHub finish_reason to Flynn stopReason conventions. OpenAI 'stop' → 'end_turn', 'length' → 'max_tokens', 'tool_calls' with tools → 'tool_use', 'tool_calls' without tools → 'end_turn'. Fixes premature agent loop exit when falling back to GitHub Copilot (Anthropic API quota exhausted). Agent loop now accepts both 'tool_use' and 'tool_calls' as belt-and-suspenders.",
|
||||
"files_modified": [
|
||||
"src/models/openai.ts",
|
||||
"src/models/openai.test.ts",
|
||||
"src/models/github.ts",
|
||||
"src/backends/native/agent.ts"
|
||||
],
|
||||
"test_status": "4/4 passing"
|
||||
}
|
||||
}
|
||||
@@ -74,4 +74,50 @@ describe('OpenAIClient tool use', () => {
|
||||
args: { command: 'ls' },
|
||||
});
|
||||
});
|
||||
|
||||
it('maps finish_reason "tool_calls" with empty tool_calls to end_turn', async () => {
|
||||
mockCreate.mockResolvedValueOnce({
|
||||
choices: [{
|
||||
message: {
|
||||
content: 'I tried to call a tool but none matched.',
|
||||
tool_calls: [],
|
||||
},
|
||||
finish_reason: 'tool_calls',
|
||||
}],
|
||||
usage: { prompt_tokens: 15, completion_tokens: 10 },
|
||||
});
|
||||
|
||||
const client = new OpenAIClient({
|
||||
apiKey: 'test-key',
|
||||
model: 'gpt-4o',
|
||||
});
|
||||
|
||||
const response = await client.chat({
|
||||
messages: [{ role: 'user', content: 'do something' }],
|
||||
});
|
||||
|
||||
expect(response.stopReason).toBe('end_turn');
|
||||
expect(response.toolCalls).toBeUndefined();
|
||||
});
|
||||
|
||||
it('maps finish_reason "length" to max_tokens', async () => {
|
||||
mockCreate.mockResolvedValueOnce({
|
||||
choices: [{
|
||||
message: { content: 'Truncated output...' },
|
||||
finish_reason: 'length',
|
||||
}],
|
||||
usage: { prompt_tokens: 100, completion_tokens: 4096 },
|
||||
});
|
||||
|
||||
const client = new OpenAIClient({
|
||||
apiKey: 'test-key',
|
||||
model: 'gpt-4o',
|
||||
});
|
||||
|
||||
const response = await client.chat({
|
||||
messages: [{ role: 'user', content: 'write a long essay' }],
|
||||
});
|
||||
|
||||
expect(response.stopReason).toBe('max_tokens');
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user