diff --git a/docs/plans/state.json b/docs/plans/state.json index bc3fdbf..d5c70f1 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -3,6 +3,18 @@ "updated_at": "2026-02-23", "description": "Tracks the status of all Flynn plans and implementation phases", "plans": { + "toolloop-execution-claim-recovery": { + "status": "completed", + "date": "2026-02-23", + "updated": "2026-02-23", + "summary": "Extended NativeAgent no-tool-call recovery to also catch execution-claim text that references known tools (for example `gmail.filter.create returns ...`) without an emitted tool call. Flynn now issues a one-time in-turn nudge and retries the same turn so tool execution or a concrete blocker is produced.", + "files_modified": [ + "src/backends/native/agent.ts", + "src/backends/native/agent.test.ts", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/backends/native/agent.test.ts + pnpm typecheck passing" + }, "toolloop-action-intent-recovery": { "status": "completed", "date": "2026-02-23", @@ -6299,7 +6311,7 @@ } }, "overall_progress": { - "total_test_count": 1960, + "total_test_count": 1962, "all_tests_passing": true, "p0_completion": "3/3 (100%)", "p1_completion": "4/4 (100%)", @@ -6320,6 +6332,7 @@ "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", "gmail_filter_creation": "completed — gmail.filter.create tool added with criteria/action validation; gmail-auth now requests full Gmail scope (https://mail.google.com/) for complete filter permissions", "toolloop_action_intent_recovery": "completed — when a model claims it will execute a tool but emits no tool call, NativeAgent now issues one internal nudge and continues the same turn to execute tools or produce a concrete blocker", + "toolloop_execution_claim_recovery": "completed — when a model claims a known tool already succeeded/failed without emitting a tool call, NativeAgent now nudges once and retries the same turn before 
returning text",
     "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback, plus 2026-02-23 arg hydration hardening, tool.args_rewritten audit metric, transient fetch retry/timeout hardening, localhost->127.0.0.1 fallback for transcription endpoint connectivity, and whisper docker-compose entrypoint arg fix for port 18801",
     "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
     "next_up": "Track OpenClaw evolution regularly for inspiration and feature ideas"
diff --git a/src/backends/native/agent.test.ts b/src/backends/native/agent.test.ts
index e53e3df..44bb4f8 100644
--- a/src/backends/native/agent.test.ts
+++ b/src/backends/native/agent.test.ts
@@ -1041,6 +1041,64 @@ describe('NativeAgent tool loop', () => {
     expect(mockClient.chat).toHaveBeenCalledTimes(2);
   });
 
+  it('nudges when response claims a known tool failed without emitting any tool call', async () => { // a prose-only claim about a tool must lead to a nudge, not be returned as the answer
+    let callCount = 0;
+    const mockClient: ModelClient = {
+      chat: vi.fn().mockImplementation((request: ChatRequest) => {
+        callCount++; // the mock's replies are scripted by call order
+        if (callCount === 1) { // reply 1: failure claim naming the tool, with no toolCalls field
+          return {
+            content: 'Still failing: gmail.filter.create returns Insufficient Permission.',
+            stopReason: 'end_turn',
+            usage: { inputTokens: 10, outputTokens: 5 },
+          };
+        }
+        if (callCount === 2) { // reply 2: the request must now contain the nudge wording
+          expect(JSON.stringify(request.messages)).toContain('no tool call was emitted');
+          return {
+            content: '',
+            stopReason: 'tool_use',
+            usage: { inputTokens: 10, outputTokens: 5 },
+            toolCalls: [{ id: 'call_2', name: 'gmail_filter_create', args: { query: 'from:no-reply@gandi.net' } }], // underscore spelling; the tool is registered below as 'gmail.filter.create'
+          };
+        }
+        return { // reply 3: plain text that ends the turn
+          content: 'Created after retry.',
+          stopReason: 'end_turn',
+          usage: { inputTokens: 12, outputTokens: 6 },
+        };
+      }),
+    };
+
+    const gmailFilterTool: Tool = {
+      name: 'gmail.filter.create',
+      description: 'Create gmail filter',
+      inputSchema: {
+        type: 'object',
+        properties: {
+          query: { type: 'string' },
+        },
+      },
+      execute: async () => ({ success: true, output: 'ok' }), // stub: always reports success
+    };
+
+    const registry = new ToolRegistry();
+    registry.register(gmailFilterTool);
+    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
+    const executor = new ToolExecutor(registry, hooks);
+
+    const agent = new NativeAgent({
+      modelClient: mockClient,
+      systemPrompt: 'You are helpful.',
+      toolRegistry: registry,
+      toolExecutor: executor,
+    });
+
+    const response = await agent.process('Create the Gmail filter');
+    expect(response).toBe('Created after retry.');
+    expect(mockClient.chat).toHaveBeenCalledTimes(3); // claim, tool call, final answer
+  });
+
   it('works without tools (backward compatible)', async () => {
     const mockClient: ModelClient = {
       chat: vi.fn().mockResolvedValue({
diff --git a/src/backends/native/agent.ts b/src/backends/native/agent.ts
index e1e3128..f0d75d0 100644
--- a/src/backends/native/agent.ts
+++ b/src/backends/native/agent.ts
@@ -221,6 +221,7 @@ export class NativeAgent {
       throw new Error('Tool loop requires tool registry and executor');
     }
     const tools = toolRegistry.filteredToAnthropicFormat(this._toolPolicyContext);
+    const availableToolNames = toolRegistry.filteredList(this._toolPolicyContext).map((tool) => tool.name);
 
     // Track whether untrusted content (web/fetched/tool output) has been introduced
     // during this run. Used to harden against prompt injection.
@@ -306,7 +307,7 @@ export class NativeAgent {
       const wantsToolUse = toolCalls.length > 0;
       if (!wantsToolUse) {
         const pseudoToolUse = this.extractPseudoToolUse(response.content);
-        if (this.shouldNudgeForMissingToolCall(response.content, pseudoToolUse) && !actionIntentNudged) {
+        if (this.shouldNudgeForMissingToolCall(response.content, pseudoToolUse, availableToolNames) && !actionIntentNudged) {
           actionIntentNudged = true;
           const normalized = this.normalizeAssistantContent(response.content);
           loopMessages.push({ role: 'assistant', content: normalized });
@@ -483,19 +484,70 @@ export class NativeAgent {
     return warningMsg;
   }
 
-  private shouldNudgeForMissingToolCall(content: string, pseudoToolUse: PseudoToolUse | null): boolean {
+  /**
+   * Decides whether a text-only model reply should trigger the one-time nudge for a missing tool call.
+   *
+   * Returns true when the reply either
+   *  - pairs an intent phrase ("I'll", "let me", ...) with an action verb, or
+   *  - contains an execution-claim word ("failed", "returns", "done", ...) and names one of
+   *    `availableToolNames`, spelled with dots or with underscores.
+   *
+   * Tool names must appear as whole tokens, so a short name such as `read` is not found inside
+   * an unrelated word ("already"), and blank names are skipped.
+   *
+   * @param content - Assistant text of a response that carried no tool calls.
+   * @param pseudoToolUse - Extracted pseudo tool use, if any; a non-null value suppresses the nudge.
+   * @param availableToolNames - Names of the tools available to the current run.
+   * @returns True when the caller should nudge the model and retry the turn.
+   */
+  private shouldNudgeForMissingToolCall(
+    content: string,
+    pseudoToolUse: PseudoToolUse | null,
+    availableToolNames: string[],
+  ): boolean {
     if (!content || pseudoToolUse) {
       return false;
     }
 
     const normalized = content.toLowerCase();
     const intentRegex = /\b(i(?:'m| am)? going to|i(?:'ll| will)|let me|proceeding(?: now)?|i can(?: now)?)\b/;
-    if (!intentRegex.test(normalized)) {
+    const actionRegex = /\b(create|run|execute|call|use|check|fetch|search|read|write|send|retry|proceed|attempt|apply|delete|update|list)\b/;
+    if (intentRegex.test(normalized) && actionRegex.test(normalized)) {
+      return true;
+    }
+
+    // Also catch "execution claims" (success/failure statements) that mention a known tool
+    // without actually emitting any tool calls in the response payload.
+    const claimRegex = /\b(still failing|failed|failing|error|blocked|insufficient permission|returns?|returned|result|succeeded|success|completed|done|tried|attempted|executed|ran|called|used)\b/;
+    if (!claimRegex.test(normalized)) {
       return false;
     }
 
-    const actionRegex = /\b(create|run|execute|call|use|check|fetch|search|read|write|send|retry|proceed|attempt|apply|delete|update|list)\b/;
-    return actionRegex.test(normalized);
+    // A tool name only counts when it is not glued to other identifier characters.
+    const isIdentifierChar = (ch: string): boolean => /[a-z0-9_]/.test(ch);
+    const mentionsAsToken = (needle: string): boolean => {
+      let at = normalized.indexOf(needle);
+      while (at !== -1) {
+        // charAt() yields '' outside the string, which is not an identifier character.
+        const before = normalized.charAt(at - 1);
+        const after = normalized.charAt(at + needle.length);
+        if (!isIdentifierChar(before) && !isIdentifierChar(after)) {
+          return true;
+        }
+        at = normalized.indexOf(needle, at + 1);
+      }
+      return false;
+    };
+
+    return availableToolNames.some((name) => {
+      const dotName = name.trim().toLowerCase();
+      if (!dotName) {
+        // An empty name is a substring of every reply and would always match.
+        return false;
+      }
+      const underscoreName = dotName.replace(/\./g, '_');
+      return mentionsAsToken(dotName) || mentionsAsToken(underscoreName);
+    });
   }
 
   private async chatWithRouter(request: ChatRequest): Promise {