// flynn/src/backends/native/agent.test.ts

import { describe, it, expect, vi, beforeEach } from 'vitest';
import { NativeAgent } from './agent.js';
import type { ModelClient, ChatResponse } from '../../models/types.js';
import { ToolRegistry, ToolExecutor } from '../../tools/index.js';
import { HookEngine } from '../../hooks/index.js';
import type { Tool, ToolResult } from '../../tools/index.js';

describe('NativeAgent', () => {
  /** Builds a ModelClient stub whose `chat` mock always resolves to the same plain-text reply. */
  const createMockClient = (): ModelClient => {
    const cannedReply = {
      content: 'Hello!',
      stopReason: 'end_turn',
      usage: { inputTokens: 10, outputTokens: 5 },
    } satisfies ChatResponse;

    return { chat: vi.fn().mockResolvedValue(cannedReply) };
  };

  // One exchange: the user text and system prompt go to the model, and the
  // user/assistant pair is afterwards visible through getHistory().
  it('processes messages and maintains history', async () => {
    const client = createMockClient();
    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: client,
    });

    const reply = await agent.process('Hi');
    expect(reply).toBe('Hello!');

    const expectedRequest = {
      messages: [{ role: 'user', content: 'Hi' }],
      system: 'You are helpful.',
    };
    expect(client.chat).toHaveBeenCalledWith(expectedRequest);

    // Exactly two entries, in order: what the user said, then what the model answered.
    const recorded = agent.getHistory();
    expect(recorded).toHaveLength(2);
    const [userTurn, assistantTurn] = recorded;
    expect(userTurn).toEqual({ role: 'user', content: 'Hi' });
    expect(assistantTurn).toEqual({ role: 'assistant', content: 'Hello!' });
  });

  // After one exchange, reset() is expected to leave getHistory() empty.
  it('resets conversation history', async () => {
    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: createMockClient(),
    });

    await agent.process('Hi');
    agent.reset();

    const historyAfterReset = agent.getHistory();
    expect(historyAfterReset).toHaveLength(0);
  });

  // When a session object is supplied, both turns of the exchange should be
  // written to it through addMessage, user first and assistant second.
  it('uses session when provided', async () => {
    const addMessage = vi.fn();
    const session = {
      id: 'test-session',
      getHistory: vi.fn().mockReturnValue([]),
      addMessage,
      clear: vi.fn(),
      replaceHistory: vi.fn(),
    };

    const agent = new NativeAgent({
      modelClient: createMockClient(),
      systemPrompt: 'You are helpful.',
      session,
    });

    await agent.process('Hi');

    const expectedWrites = [
      { role: 'user', content: 'Hi' },
      { role: 'assistant', content: 'Hello!' },
    ];
    expect(addMessage).toHaveBeenCalledTimes(expectedWrites.length);
    expectedWrites.forEach((message, index) => {
      expect(addMessage).toHaveBeenNthCalledWith(index + 1, message);
    });
  });
});

/**
 * Minimal tool fixture shared by the tool-loop tests: it reports success and
 * returns the `text` argument unchanged as its output.
 */
const echoTool: Tool = {
  name: 'test.echo',
  description: 'Echo',
  inputSchema: {
    type: 'object',
    properties: { text: { type: 'string' } },
    required: ['text'],
  },
  execute: async (args) => {
    const input = args as { text: string };
    return { success: true, output: input.text };
  },
};

describe('NativeAgent tool loop', () => {
  // Two-turn script: the model first asks for test.echo, then answers in plain
  // text. The agent should return that text after exactly two chat calls.
  it('executes tool calls and feeds results back', async () => {
    let turn = 0;
    const mockClient: ModelClient = {
      chat: vi.fn().mockImplementation(() => {
        turn += 1;
        const isFirstTurn = turn === 1;
        return isFirstTurn
          ? {
              // Turn 1: request a tool invocation
              content: '',
              stopReason: 'tool_use',
              usage: { inputTokens: 10, outputTokens: 5 },
              toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hello' } }],
            }
          : {
              // Any later turn: final text answer
              content: 'The tool returned: hello',
              stopReason: 'end_turn',
              usage: { inputTokens: 15, outputTokens: 10 },
            };
      }),
    };

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
    });

    const finalText = await agent.process('echo hello');

    expect(finalText).toBe('The tool returned: hello');
    expect(mockClient.chat).toHaveBeenCalledTimes(2);
  });

  // The model asks for a tool on every turn, with different args each time
  // (so the calls are never identical). The test expects the run to end after
  // maxIterations chat calls with a message mentioning "max iterations".
  it('respects max iterations when tool calls vary', async () => {
    let turn = 0;
    const mockClient: ModelClient = {
      chat: vi.fn().mockImplementation(() => {
        turn += 1;
        const toolCall = { id: `call_${turn}`, name: 'test.echo', args: { text: `attempt_${turn}` } };
        return {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [toolCall],
        };
      }),
    };

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
      maxIterations: 3,
    });

    const outcome = await agent.process('loop forever');

    expect(outcome).toContain('max iterations');
    expect(mockClient.chat).toHaveBeenCalledTimes(3);
  });

  // The model keeps calling test.echo with new args until a nudge shows up in
  // the request it receives; from then on it answers in plain text. The test
  // expects that answer to be returned after five chat calls in total.
  it('nudges model after same tool called too many times with different args', async () => {
    // Structural view of the only request fields this mock inspects.
    // NOTE(review): assumed shape, based on how the nudge was detected before — confirm against the real request type.
    type ContentBlock = { content?: unknown };
    type ChatRequestLike = { messages: Array<{ content: string | ContentBlock[] }> };

    const nudgeMarker = 'do NOT call it again';

    let callCount = 0;
    const mockClient: ModelClient = {
      // async so the mock resolves a promise, like the mockResolvedValue-based mocks in this file.
      chat: vi.fn().mockImplementation(async (req: ChatRequestLike) => {
        callCount++;
        // After nudge message, model should respond with text
        const lastContent = req.messages[req.messages.length - 1]?.content;
        // Array.isArray already rules out plain-string content; only string block
        // contents can carry the nudge text.
        const hasNudge =
          Array.isArray(lastContent) &&
          lastContent.some(
            (block) => typeof block.content === 'string' && block.content.includes(nudgeMarker),
          );
        if (hasNudge) {
          return {
            content: 'Here is what I found from my searches.',
            stopReason: 'end_turn',
            usage: { inputTokens: 10, outputTokens: 5 },
          };
        }
        return {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [{ id: `call_${callCount}`, name: 'test.echo', args: { text: `query_${callCount}` } }],
        };
      }),
    };

    const registry = new ToolRegistry();
    registry.register(echoTool);
    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      modelClient: mockClient,
      systemPrompt: 'You are helpful.',
      toolRegistry: registry,
      toolExecutor: executor,
      maxIterations: 10,
    });

    const response = await agent.process('search a lot');
    // Model should have responded after receiving the nudge
    expect(response).toBe('Here is what I found from my searches.');
    // 4 tool calls + 1 final response = 5 chat calls
    expect(mockClient.chat).toHaveBeenCalledTimes(5);
  });

  // Simulates a model stuck in a loop (e.g. a local LLM): every chat call
  // yields the very same tool request.
  it('detects repeated identical tool calls and breaks the loop', async () => {
    const stuckReply = {
      content: '',
      stopReason: 'tool_use',
      usage: { inputTokens: 10, outputTokens: 5 },
      toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'same thing' } }],
    };
    const mockClient: ModelClient = {
      chat: vi.fn().mockResolvedValue(stuckReply),
    };

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
      maxIterations: 10,
    });

    const outcome = await agent.process('search for news');

    expect(outcome).toContain('Tool loop detected');
    // The last tool result is expected to be part of the message
    expect(outcome).toContain('same thing');
    // Expected to stop after 3 consecutive identical calls, well before the cap of 10
    expect(mockClient.chat).toHaveBeenCalledTimes(3);
  });

  // An agent built without toolRegistry/toolExecutor should still return the model's text.
  it('works without tools (backward compatible)', async () => {
    const plainReply = {
      content: 'Hello!',
      stopReason: 'end_turn',
      usage: { inputTokens: 10, outputTokens: 5 },
    };
    const mockClient: ModelClient = {
      chat: vi.fn().mockResolvedValue(plainReply),
    };

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
    });

    const reply = await agent.process('Hi');
    expect(reply).toBe('Hello!');
  });

  // One tool call should produce two onToolUse notifications: a 'start' event
  // carrying the args, then an 'end' event carrying the tool's result.
  it('calls onToolUse callback on start and end', async () => {
    let turn = 0;
    const mockClient: ModelClient = {
      chat: vi.fn().mockImplementation(() => {
        turn += 1;
        if (turn !== 1) {
          return {
            content: 'Done',
            stopReason: 'end_turn',
            usage: { inputTokens: 15, outputTokens: 10 },
          };
        }
        return {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hi' } }],
        };
      }),
    };

    const onToolUse = vi.fn();
    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
      onToolUse,
    });

    await agent.process('echo hi');

    const startEvent = expect.objectContaining({
      type: 'start',
      tool: 'test.echo',
      args: { text: 'hi' },
    });
    const endEvent = expect.objectContaining({
      type: 'end',
      tool: 'test.echo',
      result: expect.objectContaining({ success: true, output: 'hi' }),
    });

    expect(onToolUse).toHaveBeenCalledTimes(2);
    expect(onToolUse).toHaveBeenNthCalledWith(1, startEvent);
    expect(onToolUse).toHaveBeenNthCalledWith(2, endEvent);
  });

  // Walks through three exchanges and inspects the `system` field sent to the
  // model each time: unchanged for the first two, then carrying an inventory
  // note once a second tool has been registered.
  it('injects tool inventory note when history exists and fingerprint changes', async () => {
    let turn = 0;
    const mockClient: ModelClient = {
      chat: vi.fn().mockImplementation(() => {
        turn += 1;
        return {
          content: `Response ${turn}`,
          stopReason: 'end_turn',
          usage: { inputTokens: 10, outputTokens: 5 },
        };
      }),
    };
    // Reads the `system` field of the request passed to the n-th (0-based) chat call.
    const systemSentAt = (callIndex: number) =>
      (mockClient.chat as ReturnType<typeof vi.fn>).mock.calls[callIndex][0].system;

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
    });

    // Exchange 1: no prior history, so the prompt is expected untouched
    await agent.process('Hi');
    expect(systemSentAt(0)).toBe('You are helpful.');

    // Exchange 2: history exists, tool set unchanged, still no note expected
    await agent.process('Hello again');
    expect(systemSentAt(1)).toBe('You are helpful.');

    // Register a second tool so the tool set differs from the earlier exchanges
    const greetTool: Tool = {
      name: 'test.greet',
      description: 'Greet',
      inputSchema: { type: 'object', properties: {} },
      execute: async () => ({ success: true, output: 'hi' }),
    };
    registry.register(greetTool);

    // Exchange 3: history exists AND the tool set changed, so the note is expected
    await agent.process('What can you do?');
    const systemAfterChange = systemSentAt(2);
    expect(systemAfterChange).toContain('[Tool inventory updated');
    for (const listedName of ['test_echo', 'test_greet']) {
      expect(systemAfterChange).toContain(listedName);
    }
  });

  // On the very first exchange the system prompt should reach the model unmodified.
  it('does not inject tool inventory note on fresh session', async () => {
    const plainReply = {
      content: 'Hello!',
      stopReason: 'end_turn',
      usage: { inputTokens: 10, outputTokens: 5 },
    };
    const mockClient: ModelClient = {
      chat: vi.fn().mockResolvedValue(plainReply),
    };

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
    });

    // First message ever: history holds only the user message that was just added
    await agent.process('Hi');

    const firstRequest = (mockClient.chat as ReturnType<typeof vi.fn>).mock.calls[0][0];
    const sentSystem = firstRequest.system;
    expect(sentSystem).toBe('You are helpful.');
    expect(sentSystem).not.toContain('Tool inventory updated');
  });

  // After a tool is added, the inventory note should appear on the next request
  // only; the request after that should carry the plain system prompt again.
  it('only injects tool inventory note once per fingerprint change', async () => {
    let turn = 0;
    const mockClient: ModelClient = {
      chat: vi.fn().mockImplementation(() => {
        turn += 1;
        return {
          content: `Response ${turn}`,
          stopReason: 'end_turn',
          usage: { inputTokens: 10, outputTokens: 5 },
        };
      }),
    };
    // Reads the `system` field of the request passed to the n-th (0-based) chat call.
    const systemSentAt = (callIndex: number) =>
      (mockClient.chat as ReturnType<typeof vi.fn>).mock.calls[callIndex][0].system;

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
    });

    // Two exchanges to build up history
    await agent.process('Hi');
    await agent.process('Hello');

    // Change the tool set by registering a second tool
    const greetTool: Tool = {
      name: 'test.greet',
      description: 'Greet',
      inputSchema: { type: 'object', properties: {} },
      execute: async () => ({ success: true, output: 'hi' }),
    };
    registry.register(greetTool);

    // First request after the change: note expected
    await agent.process('What tools?');
    expect(systemSentAt(2)).toContain('[Tool inventory updated');

    // Next request with the same tool set: no note expected
    await agent.process('Anything else?');
    expect(systemSentAt(3)).toBe('You are helpful.');
  });

  // After two exchanges and a reset(), the next request is expected to carry no
  // inventory note in its system prompt.
  it('resets tool fingerprint on reset()', async () => {
    const plainReply = {
      content: 'Hello!',
      stopReason: 'end_turn',
      usage: { inputTokens: 10, outputTokens: 5 },
    };
    const mockClient: ModelClient = {
      chat: vi.fn().mockResolvedValue(plainReply),
    };

    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const registry = new ToolRegistry();
    registry.register(echoTool);
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      systemPrompt: 'You are helpful.',
      modelClient: mockClient,
      toolExecutor: executor,
      toolRegistry: registry,
    });

    // Two exchanges to build history before resetting
    await agent.process('Hi');
    await agent.process('Hello');

    agent.reset();

    // The post-reset message is the third chat call overall (index 2)
    await agent.process('Hi again');
    const requestAfterReset = (mockClient.chat as ReturnType<typeof vi.fn>).mock.calls[2][0];
    expect(requestAfterReset.system).not.toContain('Tool inventory updated');
  });

  // The model requests a tool on the first call; every later call fails. The
  // agent is expected to turn that failure into an error message that is both
  // returned to the caller and stored as the last history entry.
  it('catches model errors in tool loop and returns error message', async () => {
    const mockClient: ModelClient = {
      chat: vi
        .fn()
        // First call: model requests tool use
        .mockResolvedValueOnce({
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hi' } }],
        })
        // Every later call: fail by rejecting the returned promise rather than
        // throwing synchronously out of chat(), matching the promise-returning
        // mocks used elsewhere in this file.
        .mockRejectedValue(new Error('Connection reset by peer')),
    };

    const registry = new ToolRegistry();
    registry.register(echoTool);
    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      modelClient: mockClient,
      systemPrompt: 'You are helpful.',
      toolRegistry: registry,
      toolExecutor: executor,
    });

    const response = await agent.process('echo hi');
    expect(response).toContain('Error in tool loop');
    expect(response).toContain('Connection reset by peer');

    // Error should be persisted to history
    const history = agent.getHistory();
    const lastEntry = history[history.length - 1];
    expect(lastEntry.role).toBe('assistant');
    expect(lastEntry.content).toContain('Error in tool loop');
  });

  // The first model reply carries two tool calls at once; the second reply is
  // the final text. Besides the final text and the chat call count, the test
  // checks that the tool was really executed once per requested call.
  it('handles multiple tool calls in single response', async () => {
    let callCount = 0;
    const mockClient: ModelClient = {
      chat: vi.fn().mockImplementation(() => {
        callCount++;
        if (callCount === 1) {
          return {
            content: '',
            stopReason: 'tool_use',
            usage: { inputTokens: 10, outputTokens: 5 },
            toolCalls: [
              { id: 'call_1', name: 'test.echo', args: { text: 'first' } },
              { id: 'call_2', name: 'test.echo', args: { text: 'second' } },
            ],
          };
        }
        return {
          content: 'Got both results',
          stopReason: 'end_turn',
          usage: { inputTokens: 15, outputTokens: 10 },
        };
      }),
    };

    // Wrap the echo tool's execute in a spy so each invocation can be observed.
    // Without it the test would still pass if one of the two calls were dropped.
    const executeSpy = vi.fn(echoTool.execute);
    const registry = new ToolRegistry();
    registry.register({ ...echoTool, execute: executeSpy });
    const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
    const executor = new ToolExecutor(registry, hooks);

    const agent = new NativeAgent({
      modelClient: mockClient,
      systemPrompt: 'You are helpful.',
      toolRegistry: registry,
      toolExecutor: executor,
    });

    const response = await agent.process('echo both');
    expect(response).toBe('Got both results');
    expect(mockClient.chat).toHaveBeenCalledTimes(2);

    // Both requested calls must have reached the tool. The texts are sorted
    // because this test makes no claim about execution order.
    expect(executeSpy).toHaveBeenCalledTimes(2);
    const echoedTexts = executeSpy.mock.calls
      .map((call) => (call[0] as { text: string }).text)
      .sort();
    expect(echoedTexts).toEqual(['first', 'second']);
  });
});