// Source: flynn/src/backends/native/agent.test.ts (TypeScript, 752 lines, 25 KiB)

import { describe, it, expect, vi } from 'vitest';
import { NativeAgent } from './agent.js';
import type { ModelClient, ChatRequest, ChatResponse } from '../../models/types.js';
import { ToolRegistry, ToolExecutor } from '../../tools/index.js';
import { HookEngine } from '../../hooks/index.js';
import type { Tool } from '../../tools/index.js';
describe('NativeAgent', () => {
/**
 * Builds a ModelClient stub whose chat() always resolves to the same
 * 'Hello!' end-of-turn reply. A fresh spy is created on every call.
 */
const createMockClient = (): ModelClient => {
  const cannedReply = {
    content: 'Hello!',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  } satisfies ChatResponse;
  return { chat: vi.fn().mockResolvedValue(cannedReply) };
};
it('processes messages and maintains history', async () => {
  // One round trip against the canned 'Hello!' client; no tools, no session.
  const client = createMockClient();
  const agent = new NativeAgent({ modelClient: client, systemPrompt: 'You are helpful.' });

  const reply = await agent.process('Hi');
  expect(reply).toBe('Hello!');

  // The request must carry the user turn, the system prompt and an abort signal.
  const expectedRequest = expect.objectContaining({
    messages: [{ role: 'user', content: 'Hi' }],
    system: 'You are helpful.',
    signal: expect.any(AbortSignal),
  });
  expect(client.chat).toHaveBeenCalledWith(expectedRequest);

  // Afterwards the history holds the user turn followed by the assistant reply.
  const transcript = agent.getHistory();
  expect(transcript).toHaveLength(2);
  expect(transcript[0]).toEqual({ role: 'user', content: 'Hi' });
  expect(transcript[1]).toEqual({ role: 'assistant', content: 'Hello!' });
});
it('resets conversation history', async () => {
  const client = createMockClient();
  const agent = new NativeAgent({ modelClient: client, systemPrompt: 'You are helpful.' });

  // Populate the history with one exchange, then wipe it.
  await agent.process('Hi');
  agent.reset();

  const remaining = agent.getHistory();
  expect(remaining).toHaveLength(0);
});
it('uses session when provided', async () => {
  // Session double: reports an empty history and no stored config; every method is a spy.
  const sessionStub = {
    id: 'test-session',
    getHistory: vi.fn().mockReturnValue([]),
    addMessage: vi.fn(),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn().mockReturnValue(undefined),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };
  const client = createMockClient();
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    session: sessionStub,
  });

  await agent.process('Hi');

  // Exactly two writes reach the session: the user turn first, the assistant reply second.
  const { addMessage } = sessionStub;
  expect(addMessage).toHaveBeenCalledTimes(2);
  expect(addMessage).toHaveBeenNthCalledWith(1, { role: 'user', content: 'Hi' });
  expect(addMessage).toHaveBeenNthCalledWith(2, { role: 'assistant', content: 'Hello!' });
});
it('supports cancellation during single-turn model wait', async () => {
  // Gate that keeps the mocked model call pending until the test opens it.
  let release!: () => void;
  const blocked = new Promise<void>((resolve) => {
    release = resolve;
  });
  // Settles as soon as chat() has actually been entered, so the cancel below is
  // issued while the model call is in flight rather than after an arbitrary
  // single microtask tick.
  let markChatEntered!: () => void;
  const chatEntered = new Promise<void>((resolve) => {
    markChatEntered = resolve;
  });
  const mockClient: ModelClient = {
    chat: vi.fn(async () => {
      markChatEntered();
      await blocked;
      return {
        content: 'Late response',
        stopReason: 'end_turn',
        usage: { inputTokens: 10, outputTokens: 5 },
      } satisfies ChatResponse;
    }),
  };
  const agent = new NativeAgent({
    modelClient: mockClient,
    systemPrompt: 'You are helpful.',
  });
  const pending = agent.process('Please wait');
  try {
    // Racing against `pending` makes an early rejection fail the test right away
    // instead of hanging until the runner's timeout.
    await Promise.race([chatEntered, pending]);
    expect(agent.isCancellable()).toBe(true);
    agent.cancel();
  } finally {
    // Always open the gate so a failed assertion cannot leave the mock call stuck.
    release();
  }
  const response = await pending;
  expect(response).toBe('Operation cancelled by user.');
  expect(agent.isCancellable()).toBe(false);
  // The cancellation notice is also recorded as the last assistant turn.
  const history = agent.getHistory();
  expect(history[history.length - 1]).toEqual({ role: 'assistant', content: 'Operation cancelled by user.' });
});
it('returns fallback text when model response is empty', async () => {
  const fallbackText = 'I could not generate a response for that. Please try again.';
  // The model ends its turn normally but with an empty content string.
  const emptyReply = {
    content: '',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  const client: ModelClient = { chat: vi.fn().mockResolvedValue(emptyReply) };
  const agent = new NativeAgent({ modelClient: client, systemPrompt: 'You are helpful.' });

  const reply = await agent.process('Hi');

  expect(reply).toBe(fallbackText);
  // The fallback is also what gets stored as the final assistant turn.
  const transcript = agent.getHistory();
  const lastTurn = transcript[transcript.length - 1];
  expect(lastTurn).toEqual({ role: 'assistant', content: fallbackText });
});
it('times out single-turn model calls', async () => {
  // Each chat() call hands back a promise that never settles.
  const neverSettles = (): Promise<ChatResponse> => new Promise<ChatResponse>(() => {});
  const client: ModelClient = { chat: vi.fn().mockImplementation(neverSettles) };
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    modelTimeoutMs: 10,
  });

  // With no tools configured, the timeout is expected to surface as a rejection.
  const outcome = agent.process('Hi');
  await expect(outcome).rejects.toThrow('Model request timed out after 10ms');
});
});
// Minimal tool fixture: succeeds and returns its required `text` argument as the output.
const echoTool: Tool = {
  name: 'test.echo',
  description: 'Echo',
  inputSchema: {
    type: 'object',
    properties: { text: { type: 'string' } },
    required: ['text'],
  },
  execute: async (args) => {
    const { text } = args as { text: string };
    return { success: true, output: text };
  },
};
describe('NativeAgent tool loop', () => {
it('executes tool calls and feeds results back', async () => {
  // Scripted model: the first reply requests test.echo, every later reply is plain text.
  const toolRequest = {
    content: '',
    stopReason: 'tool_use',
    usage: { inputTokens: 10, outputTokens: 5 },
    toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hello' } }],
  };
  const finalAnswer = {
    content: 'The tool returned: hello',
    stopReason: 'end_turn',
    usage: { inputTokens: 15, outputTokens: 10 },
  };
  const client: ModelClient = {
    chat: vi.fn().mockReturnValueOnce(toolRequest).mockReturnValue(finalAnswer),
  };

  // Tool plumbing: a registry holding only the echo tool, and hooks with empty rule lists.
  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  const reply = await agent.process('echo hello');

  expect(reply).toBe('The tool returned: hello');
  // One model call for the tool request, one for the final text.
  expect(client.chat).toHaveBeenCalledTimes(2);
});
it('respects max iterations when tool calls vary', async () => {
  // The model never stops asking for the tool, but id and args differ on every
  // turn so that identical-call loop detection is not what ends the run.
  let attempt = 0;
  const client: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      attempt += 1;
      const toolCall = {
        id: `call_${attempt}`,
        name: 'test.echo',
        args: { text: `attempt_${attempt}` },
      };
      return {
        content: '',
        stopReason: 'tool_use',
        usage: { inputTokens: 10, outputTokens: 5 },
        toolCalls: [toolCall],
      };
    }),
  };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
    maxIterations: 3,
  });

  const reply = await agent.process('loop forever');

  // The run stops at the configured cap and says so in the reply.
  expect(reply).toContain('max iterations');
  expect(client.chat).toHaveBeenCalledTimes(3);
});
it('nudges model after same tool called too many times with different args', async () => {
  const nudgeMarker = 'do NOT call it again';
  let turn = 0;
  const client: ModelClient = {
    chat: vi.fn().mockImplementation((req: ChatRequest) => {
      turn += 1;
      // Inspect only the newest message: once it carries the nudge text, answer in prose.
      const newest = req.messages[req.messages.length - 1];
      const nudged = typeof newest?.content === 'string'
        && newest.content.includes(nudgeMarker);
      if (!nudged) {
        // Keep requesting the same tool, with a fresh id and query each turn.
        return {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [{ id: `call_${turn}`, name: 'test.echo', args: { text: `query_${turn}` } }],
        };
      }
      return {
        content: 'Here is what I found from my searches.',
        stopReason: 'end_turn',
        usage: { inputTokens: 10, outputTokens: 5 },
      };
    }),
  };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
    maxIterations: 10,
  });

  const reply = await agent.process('search a lot');

  // The prose answer is only produced after the nudge has been seen.
  expect(reply).toBe('Here is what I found from my searches.');
  // 4 tool calls + 1 final response = 5 chat calls
  expect(client.chat).toHaveBeenCalledTimes(5);
});
it('detects repeated identical tool calls and breaks the loop', async () => {
  // Every reply is the exact same tool call, imitating a local LLM stuck in a loop.
  const stuckReply = {
    content: '',
    stopReason: 'tool_use',
    usage: { inputTokens: 10, outputTokens: 5 },
    toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'same thing' } }],
  };
  const client: ModelClient = { chat: vi.fn().mockResolvedValue(stuckReply) };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
    maxIterations: 10,
  });

  const reply = await agent.process('search for news');

  expect(reply).toContain('Tool loop detected');
  // The reply also carries the last tool result.
  expect(reply).toContain('same thing');
  // Stops after 3 consecutive identical calls, well short of the cap of 10.
  expect(client.chat).toHaveBeenCalledTimes(3);
});
it('surfaces warning when model emits textual tool_use block without structured tool calls', async () => {
  // The tool_use JSON is embedded in plain content; there is no toolCalls field,
  // and the named tool (gmail_read) is not the one registered below.
  const textualToolUse = {
    content: 'Let me read the full email to evaluate legitimacy:{"type":"tool_use","id":"call_123","name":"gmail_read","input":{"id":"abc"}}',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  const client: ModelClient = { chat: vi.fn().mockResolvedValue(textualToolUse) };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  const reply = await agent.process('read latest email');

  // The reply warns, identifies the tool, and still includes the raw JSON.
  expect(reply).toContain('Tool call was emitted as plain text and was not executed.');
  expect(reply).toContain('Tool: gmail_read (id: call_123)');
  expect(reply).toContain('"type":"tool_use"');

  // That same text is stored as the last assistant turn.
  const transcript = agent.getHistory();
  const lastTurn = transcript[transcript.length - 1];
  expect(lastTurn).toEqual({ role: 'assistant', content: reply });
});
it('recovers and executes valid textual tool_use JSON for registered tools', async () => {
  // First reply: tool_use JSON inside plain text, naming the tool as test_echo.
  const textualToolUse = {
    content: 'Running tool now: {"type":"tool_use","id":"call_123","name":"test_echo","input":{"text":"hello"}}',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  // Every later reply: an ordinary text answer.
  const finalAnswer = {
    content: 'The tool returned: hello',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  const client: ModelClient = {
    chat: vi.fn().mockReturnValueOnce(textualToolUse).mockReturnValue(finalAnswer),
  };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  const reply = await agent.process('echo hello');

  // A second model call shows the conversation continued past the textual tool call.
  expect(reply).toBe('The tool returned: hello');
  expect(client.chat).toHaveBeenCalledTimes(2);
});
it('works without tools (backward compatible)', async () => {
  // No toolRegistry or toolExecutor is passed to the agent here.
  const plainReply = {
    content: 'Hello!',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  const client: ModelClient = { chat: vi.fn().mockResolvedValue(plainReply) };
  const agent = new NativeAgent({ modelClient: client, systemPrompt: 'You are helpful.' });

  const reply = await agent.process('Hi');

  expect(reply).toBe('Hello!');
});
it('calls onToolUse callback on start and end', async () => {
  // Scripted model: one test.echo request, then plain text on every later call.
  const toolRequest = {
    content: '',
    stopReason: 'tool_use',
    usage: { inputTokens: 10, outputTokens: 5 },
    toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hi' } }],
  };
  const finalAnswer = {
    content: 'Done',
    stopReason: 'end_turn',
    usage: { inputTokens: 15, outputTokens: 10 },
  };
  const client: ModelClient = {
    chat: vi.fn().mockReturnValueOnce(toolRequest).mockReturnValue(finalAnswer),
  };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const onToolUse = vi.fn();
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
    onToolUse,
  });

  await agent.process('echo hi');

  // One tool invocation yields exactly two notifications: 'start', then 'end'.
  const startEvent = expect.objectContaining({
    type: 'start',
    tool: 'test.echo',
    args: { text: 'hi' },
  });
  const endEvent = expect.objectContaining({
    type: 'end',
    tool: 'test.echo',
    result: expect.objectContaining({ success: true, output: 'hi' }),
  });
  expect(onToolUse).toHaveBeenCalledTimes(2);
  expect(onToolUse).toHaveBeenNthCalledWith(1, startEvent);
  expect(onToolUse).toHaveBeenNthCalledWith(2, endEvent);
});
it('injects tool inventory note when history exists and fingerprint changes', async () => {
  // Each model reply is plain text numbered by call order.
  let replyNo = 0;
  const client: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      replyNo += 1;
      return {
        content: `Response ${replyNo}`,
        stopReason: 'end_turn',
        usage: { inputTokens: 10, outputTokens: 5 },
      };
    }),
  };
  // Reads the `system` field of the request sent on the given (0-based) chat call.
  const systemSentOn = (callIndex: number) =>
    (client.chat as ReturnType<typeof vi.fn>).mock.calls[callIndex][0].system;

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  // Turn 1: no prior history, so the system prompt goes out untouched.
  await agent.process('Hi');
  expect(systemSentOn(0)).toBe('You are helpful.');

  // Turn 2: history exists but the tool set is unchanged, so there is still no note.
  await agent.process('Hello again');
  expect(systemSentOn(1)).toBe('You are helpful.');

  // Registering a second tool changes the tool inventory.
  const greetTool: Tool = {
    name: 'test.greet',
    description: 'Greet',
    inputSchema: { type: 'object', properties: {} },
    execute: async () => ({ success: true, output: 'hi' }),
  };
  toolRegistry.register(greetTool);

  // Turn 3: history exists and the inventory changed, so the note appears in the
  // system prompt and lists both tools in their underscore-separated form.
  await agent.process('What can you do?');
  const thirdSystem = systemSentOn(2);
  expect(thirdSystem).toContain('[Tool inventory updated');
  expect(thirdSystem).toContain('test_echo');
  expect(thirdSystem).toContain('test_greet');
});
it('does not inject tool inventory note on fresh session', async () => {
  const plainReply = {
    content: 'Hello!',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  const client: ModelClient = { chat: vi.fn().mockResolvedValue(plainReply) };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  // Very first message: the history holds only the user message that was just added.
  await agent.process('Hi');

  const [firstRequest] = (client.chat as ReturnType<typeof vi.fn>).mock.calls[0];
  const sentSystem = firstRequest.system;
  expect(sentSystem).toBe('You are helpful.');
  expect(sentSystem).not.toContain('Tool inventory updated');
});
it('only injects tool inventory note once per fingerprint change', async () => {
  // Each model reply is plain text numbered by call order.
  let replyNo = 0;
  const client: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      replyNo += 1;
      return {
        content: `Response ${replyNo}`,
        stopReason: 'end_turn',
        usage: { inputTokens: 10, outputTokens: 5 },
      };
    }),
  };
  // Reads the `system` field of the request sent on the given (0-based) chat call.
  const systemSentOn = (callIndex: number) =>
    (client.chat as ReturnType<typeof vi.fn>).mock.calls[callIndex][0].system;

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  // Two ordinary turns to build up history with the original tool set.
  await agent.process('Hi');
  await agent.process('Hello');

  // Change the inventory by registering a second tool.
  const greetTool: Tool = {
    name: 'test.greet',
    description: 'Greet',
    inputSchema: { type: 'object', properties: {} },
    execute: async () => ({ success: true, output: 'hi' }),
  };
  toolRegistry.register(greetTool);

  // First turn after the change: the note is injected.
  await agent.process('What tools?');
  expect(systemSentOn(2)).toContain('[Tool inventory updated');

  // Next turn with the same tools: the bare system prompt is sent again.
  await agent.process('Anything else?');
  expect(systemSentOn(3)).toBe('You are helpful.');
});
it('resets tool fingerprint on reset()', async () => {
  const plainReply = {
    content: 'Hello!',
    stopReason: 'end_turn',
    usage: { inputTokens: 10, outputTokens: 5 },
  };
  const client: ModelClient = { chat: vi.fn().mockResolvedValue(plainReply) };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  // Two turns build history with the echo tool registered, then the agent is reset.
  await agent.process('Hi');
  await agent.process('Hello');
  agent.reset();

  // The first message after reset() is the third chat call overall (index 2).
  // NOTE(review): the tool set never changes in this test, so this assertion would
  // presumably also pass if reset() left the fingerprint untouched; confirm
  // against the implementation.
  await agent.process('Hi again');
  const [postResetRequest] = (client.chat as ReturnType<typeof vi.fn>).mock.calls[2];
  expect(postResetRequest.system).not.toContain('Tool inventory updated');
});
it('catches model errors in tool loop and returns error message', async () => {
  // First call: the model requests a tool. Every later call throws.
  const toolRequest = {
    content: '',
    stopReason: 'tool_use',
    usage: { inputTokens: 10, outputTokens: 5 },
    toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hi' } }],
  };
  const client: ModelClient = {
    chat: vi
      .fn()
      .mockReturnValueOnce(toolRequest)
      .mockImplementation(() => {
        throw new Error('Connection reset by peer');
      }),
  };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  const reply = await agent.process('echo hi');

  // The failure comes back as reply text that includes the underlying error message.
  expect(reply).toContain('Error in tool loop');
  expect(reply).toContain('Connection reset by peer');

  // The error text is also persisted as the last assistant turn.
  const transcript = agent.getHistory();
  const lastTurn = transcript[transcript.length - 1];
  expect(lastTurn.role).toBe('assistant');
  expect(lastTurn.content).toContain('Error in tool loop');
});
it('handles multiple tool calls in single response', async () => {
  // First reply bundles two echo calls; every later reply is the final text.
  const doubleToolRequest = {
    content: '',
    stopReason: 'tool_use',
    usage: { inputTokens: 10, outputTokens: 5 },
    toolCalls: [
      { id: 'call_1', name: 'test.echo', args: { text: 'first' } },
      { id: 'call_2', name: 'test.echo', args: { text: 'second' } },
    ],
  };
  const finalAnswer = {
    content: 'Got both results',
    stopReason: 'end_turn',
    usage: { inputTokens: 15, outputTokens: 10 },
  };
  const client: ModelClient = {
    chat: vi.fn().mockReturnValueOnce(doubleToolRequest).mockReturnValue(finalAnswer),
  };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  const reply = await agent.process('echo both');

  // Both tool calls are covered by a single follow-up model call.
  expect(reply).toBe('Got both results');
  expect(client.chat).toHaveBeenCalledTimes(2);
});
it('returns fallback text when tool loop final response is empty', async () => {
  const fallbackText = 'I could not generate a response for that. Please try again.';
  // Scripted for exactly two calls: a tool request, then an empty end-of-turn reply.
  const toolRequest = {
    content: '',
    stopReason: 'tool_use',
    usage: { inputTokens: 10, outputTokens: 5 },
    toolCalls: [{ id: 'call_1', name: 'test.echo', args: { text: 'hello' } }],
  };
  const emptyFinal = {
    content: '',
    stopReason: 'end_turn',
    usage: { inputTokens: 12, outputTokens: 4 },
  };
  const chat = vi.fn().mockResolvedValueOnce(toolRequest).mockResolvedValueOnce(emptyFinal);
  const client: ModelClient = { chat };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
  });

  const reply = await agent.process('echo hello');

  expect(reply).toBe(fallbackText);
  // The fallback is what gets stored as the last assistant turn.
  const transcript = agent.getHistory();
  const lastTurn = transcript[transcript.length - 1];
  expect(lastTurn).toEqual({ role: 'assistant', content: fallbackText });
});
it('times out tool-loop model calls and returns an error message', async () => {
  // Each chat() call hands back a promise that never settles.
  const neverSettles = (): Promise<ChatResponse> => new Promise<ChatResponse>(() => {});
  const client: ModelClient = { chat: vi.fn().mockImplementation(neverSettles) };

  const toolRegistry = new ToolRegistry();
  toolRegistry.register(echoTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);
  const agent = new NativeAgent({
    modelClient: client,
    systemPrompt: 'You are helpful.',
    toolRegistry,
    toolExecutor,
    modelTimeoutMs: 10,
  });

  // With tools configured, the timeout comes back as reply text, not as a rejection.
  const reply = await agent.process('echo hello');
  expect(reply).toContain('Error in tool loop');
  expect(reply).toContain('Model request timed out after 10ms');
});
});