feat: add image.analyze tool for vision model analysis
Provides a factory createImageAnalyzeTool(modelClient) that sends a single image (given as a URL or as base64 data) to a vision-capable model and returns a textual analysis. Includes 15 tests covering URL and base64 inputs, prompt handling, input validation, and model error cases.
This commit is contained in:
@@ -0,0 +1,306 @@
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
||||
import type { ModelClient, ChatRequest, ChatResponse } from '../../models/types.js';
|
||||
import { createImageAnalyzeTool } from './image-analyze.js';
|
||||
|
||||
/**
 * Unit tests for the `image.analyze` tool.
 *
 * The model client is replaced by a vitest mock, so the tests only verify the
 * request the tool builds and how it maps the client's outcome to a result.
 */
describe('image.analyze tool', () => {
  let mockClient: ModelClient & { chat: ReturnType<typeof vi.fn> };

  const IMAGE_URL = 'https://example.com/image.jpg';
  const DEFAULT_PROMPT = 'Describe this image in detail.';

  /** Minimal successful chat response whose text is `content`. */
  const modelResponse = (content: string) => ({
    content,
    stopReason: 'end_turn',
    usage: { inputTokens: 100, outputTokens: 50 }
  });

  /** Matcher for the image content part whose source contains `source`. */
  const imagePartWith = (source: Record<string, unknown>) => ({
    type: 'image',
    source: expect.objectContaining(source)
  });

  /**
   * Matcher for a chat request whose user message carries `prompt` as its text
   * part together with an image part (any object unless `imagePart` is given).
   */
  const userRequest = (prompt: string, imagePart: unknown = expect.any(Object)) =>
    expect.objectContaining({
      messages: expect.arrayContaining([
        expect.objectContaining({
          role: 'user',
          content: expect.arrayContaining([{ type: 'text', text: prompt }, imagePart])
        })
      ])
    });

  /** Matcher for the fixed system prompt and token limit sent with every request. */
  const fixedSettings = () =>
    expect.objectContaining({
      system: expect.stringContaining('vision assistant'),
      maxTokens: 1024
    });

  beforeEach(() => {
    // A fresh mock per test keeps call counts independent.
    mockClient = {
      chat: vi.fn()
    };
  });

  afterEach(() => {
    vi.clearAllMocks();
  });

  it('has correct metadata', () => {
    const tool = createImageAnalyzeTool(mockClient);

    expect(tool.name).toBe('image.analyze');
    expect(tool.inputSchema.required).toHaveLength(0);
  });

  it('analyzes image from URL', async () => {
    mockClient.chat.mockResolvedValueOnce(
      modelResponse('This is a beautiful sunset over the ocean.')
    );

    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({ url: IMAGE_URL });

    expect(result.success).toBe(true);
    expect(result.output).toBe('This is a beautiful sunset over the ocean.');
    expect(mockClient.chat).toHaveBeenCalledTimes(1);
    expect(mockClient.chat).toHaveBeenCalledWith(
      userRequest(
        DEFAULT_PROMPT,
        imagePartWith({ type: 'url', media_type: 'image/jpeg', url: IMAGE_URL })
      )
    );
    expect(mockClient.chat).toHaveBeenCalledWith(fixedSettings());
  });

  it('analyzes image from base64 data', async () => {
    const base64Data =
      'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==';
    mockClient.chat.mockResolvedValueOnce(modelResponse('This is a sample image.'));

    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({
      data: base64Data,
      media_type: 'image/png'
    });

    expect(result.success).toBe(true);
    expect(result.output).toBe('This is a sample image.');
    expect(mockClient.chat).toHaveBeenCalledTimes(1);
    expect(mockClient.chat).toHaveBeenCalledWith(
      userRequest(
        DEFAULT_PROMPT,
        imagePartWith({ type: 'base64', media_type: 'image/png', data: base64Data })
      )
    );
    expect(mockClient.chat).toHaveBeenCalledWith(fixedSettings());
  });

  it('uses custom prompt', async () => {
    mockClient.chat.mockResolvedValueOnce(
      modelResponse('The image shows a cat sitting on a mat.')
    );

    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({
      url: 'https://example.com/cat.jpg',
      prompt: 'What is in this image?'
    });

    expect(result.success).toBe(true);
    expect(result.output).toBe('The image shows a cat sitting on a mat.');
    expect(mockClient.chat).toHaveBeenCalledWith(userRequest('What is in this image?'));
  });

  it('defaults prompt to "Describe this image in detail."', async () => {
    mockClient.chat.mockResolvedValueOnce(
      modelResponse('This is the default prompt response.')
    );

    const tool = createImageAnalyzeTool(mockClient);
    await tool.execute({ url: IMAGE_URL });

    expect(mockClient.chat).toHaveBeenCalledWith(userRequest(DEFAULT_PROMPT));
  });

  it('fails when neither url nor data is provided', async () => {
    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({});

    expect(result.success).toBe(false);
    expect(result.error).toContain('Either "url" or "data" must be provided');
    expect(mockClient.chat).not.toHaveBeenCalled();
  });

  it('fails when both url and data are provided', async () => {
    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({
      url: IMAGE_URL,
      data: 'base64data'
    });

    expect(result.success).toBe(false);
    expect(result.error).toContain('Cannot provide both "url" and "data"');
    expect(mockClient.chat).not.toHaveBeenCalled();
  });

  it('fails when data is provided without media_type', async () => {
    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({
      data: 'base64data',
      prompt: 'Test'
    });

    expect(result.success).toBe(false);
    expect(result.error).toContain('media_type is required when providing data');
    expect(mockClient.chat).not.toHaveBeenCalled();
  });

  it('fails with invalid media_type', async () => {
    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({
      data: 'base64data',
      media_type: 'image/tiff'
    });

    expect(result.success).toBe(false);
    expect(result.error).toContain('Invalid media_type');
    expect(mockClient.chat).not.toHaveBeenCalled();
  });

  it('passes valid media_types', async () => {
    const validTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];

    for (const mediaType of validTypes) {
      // Fresh mock per media type so the call count below is per iteration.
      mockClient.chat = vi.fn().mockResolvedValueOnce(modelResponse('Success'));

      const tool = createImageAnalyzeTool(mockClient);
      const result = await tool.execute({
        data: 'base64data',
        media_type: mediaType
      });

      expect(result.success).toBe(true);
      expect(mockClient.chat).toHaveBeenCalledTimes(1);
      expect(mockClient.chat).toHaveBeenCalledWith(
        userRequest(DEFAULT_PROMPT, imagePartWith({ type: 'base64', media_type: mediaType }))
      );
    }
  });

  it('handles model client errors', async () => {
    mockClient.chat.mockRejectedValueOnce(new Error('Model API error'));

    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({ url: IMAGE_URL });

    expect(result.success).toBe(false);
    expect(result.output).toBe('');
    expect(result.error).toBe('Model API error');
  });

  it('handles non-Error exceptions', async () => {
    mockClient.chat.mockRejectedValueOnce('String error');

    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({ url: IMAGE_URL });

    expect(result.success).toBe(false);
    expect(result.output).toBe('');
    expect(result.error).toBe('String error');
  });

  it('sends the vision system prompt even with a custom user prompt', async () => {
    mockClient.chat.mockResolvedValueOnce(modelResponse('Analysis complete.'));

    const tool = createImageAnalyzeTool(mockClient);
    await tool.execute({
      url: IMAGE_URL,
      prompt: 'Analyze the colors.'
    });

    expect(mockClient.chat).toHaveBeenCalledWith(
      expect.objectContaining({ system: expect.stringContaining('vision assistant') })
    );
    expect(mockClient.chat).toHaveBeenCalledWith(
      expect.objectContaining({ system: expect.stringContaining('Analyze the provided image') })
    );
  });

  it('requests at most 1024 tokens', async () => {
    mockClient.chat.mockResolvedValueOnce(modelResponse('Short response'));

    const tool = createImageAnalyzeTool(mockClient);
    await tool.execute({ url: IMAGE_URL });

    expect(mockClient.chat).toHaveBeenCalledWith(expect.objectContaining({ maxTokens: 1024 }));
  });

  it('passes through model response content', async () => {
    const expectedContent = 'Detailed analysis of the image...';
    mockClient.chat.mockResolvedValueOnce(modelResponse(expectedContent));

    const tool = createImageAnalyzeTool(mockClient);
    const result = await tool.execute({ url: IMAGE_URL });

    expect(result.success).toBe(true);
    expect(result.output).toBe(expectedContent);
  });
});
|
||||
@@ -0,0 +1,121 @@
|
||||
import type { ModelClient } from '../../models/types.js';
|
||||
import type { Tool, ToolResult } from '../types.js';
|
||||
|
||||
/**
 * Arguments accepted by the `image.analyze` tool.
 *
 * Exactly one of `url` or `data` must be supplied. `media_type` is mandatory
 * alongside `data` and optional alongside `url`.
 */
interface ImageAnalyzeArgs {
  /** URL of the image to analyze. */
  url?: string;
  /** Base64-encoded image data (alternative to `url`). */
  data?: string;
  /** MIME type of the image; must be one of VALID_MEDIA_TYPES when present. */
  media_type?: string;
  /** What to analyze or describe; DEFAULT_PROMPT is used when missing or empty. */
  prompt?: string;
}

/** Image part sent to the model: either a remote URL or inline base64 data. */
type ImageSource =
  | { type: 'url'; media_type: string; url: string }
  | { type: 'base64'; media_type: string; data: string };

const DEFAULT_PROMPT = 'Describe this image in detail.';

const VALID_MEDIA_TYPES: readonly string[] = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];

/** Media type reported for URL images when the caller does not specify one. */
const URL_FALLBACK_MEDIA_TYPE = 'image/jpeg';

const SYSTEM_PROMPT =
  "You are a vision assistant. Analyze the provided image according to the user's request. Provide detailed, helpful descriptions.";

const MAX_TOKENS = 1024;

/** Every recognised argument; each one must be a string when present. */
const STRING_FIELDS: ReadonlyArray<keyof ImageAnalyzeArgs> = ['url', 'data', 'media_type', 'prompt'];

/** Builds the failure result shape shared by all validation and runtime errors. */
function failResult(error: string): ToolResult {
  return { success: false, output: '', error };
}

/**
 * Creates the `image.analyze` tool, which forwards one image (by URL or as
 * base64 data) plus a text prompt to `modelClient.chat` and returns the
 * response content as the tool output.
 *
 * The returned tool's `execute` never rejects: validation problems and errors
 * thrown by the model client are reported as `{ success: false, output: '', error }`.
 *
 * @param modelClient - Client whose `chat` method receives the vision request.
 * @returns The tool definition (name, description, input schema, `execute`).
 */
export function createImageAnalyzeTool(modelClient: ModelClient): Tool {
  return {
    name: 'image.analyze',
    description:
      'Analyze an image using a vision model. Accepts a URL or base64-encoded image data. Returns a text description or analysis based on the provided prompt.',
    inputSchema: {
      type: 'object',
      properties: {
        url: {
          type: 'string',
          description: 'URL of the image to analyze'
        },
        data: {
          type: 'string',
          description: 'Base64-encoded image data (alternative to url)'
        },
        media_type: {
          type: 'string',
          // Derived from the constant so the schema cannot drift from validation.
          description: `MIME type of the image (required when using data). One of: ${VALID_MEDIA_TYPES.join(', ')}`
        },
        prompt: {
          type: 'string',
          description: `What to analyze or describe about the image. Default: "${DEFAULT_PROMPT}"`
        }
      },
      required: []
    },
    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      try {
        // A missing or non-object payload is treated as "no arguments" so the
        // caller gets the regular validation error rather than a TypeError text.
        const fields: Record<string, unknown> =
          typeof rawArgs === 'object' && rawArgs !== null
            ? (rawArgs as Record<string, unknown>)
            : {};

        // Copy only recognised fields, rejecting non-string values up front.
        // null/undefined count as "not provided".
        const args: ImageAnalyzeArgs = {};
        for (const field of STRING_FIELDS) {
          const value = fields[field];
          if (value == null) {
            continue;
          }
          if (typeof value !== 'string') {
            return failResult(`"${field}" must be a string`);
          }
          args[field] = value;
        }

        const { url, data, media_type: mediaType } = args;

        if (!url && !data) {
          return failResult('Either "url" or "data" must be provided');
        }

        if (url && data) {
          return failResult('Cannot provide both "url" and "data" - choose one');
        }

        if (mediaType && !VALID_MEDIA_TYPES.includes(mediaType)) {
          return failResult(
            `Invalid media_type: ${mediaType}. Must be one of: ${VALID_MEDIA_TYPES.join(', ')}`
          );
        }

        let source: ImageSource;
        if (url) {
          source = { type: 'url', media_type: mediaType || URL_FALLBACK_MEDIA_TYPE, url };
        } else if (data && mediaType) {
          source = { type: 'base64', media_type: mediaType, data };
        } else {
          // Only reachable with data but no media_type: the "neither" case
          // returned above and the url case is handled by the first branch.
          return failResult('media_type is required when providing data');
        }

        // `||` (not `??`) on purpose: an empty prompt also falls back to the default.
        const prompt = args.prompt || DEFAULT_PROMPT;

        const message = {
          role: 'user' as const,
          content: [
            { type: 'text' as const, text: prompt },
            { type: 'image' as const, source }
          ]
        };

        const response = await modelClient.chat({
          messages: [message],
          system: SYSTEM_PROMPT,
          maxTokens: MAX_TOKENS
        });

        return {
          success: true,
          output: response.content
        };
      } catch (error) {
        // The client may reject with a non-Error value; stringify it in that case.
        return failResult(error instanceof Error ? error.message : String(error));
      }
    }
  };
}
|
||||
Reference in New Issue
Block a user