Files
flynn/src/models/media.test.ts
T
William Valentin 6761dca1c2 fix: normalize message roles for local model backends (llama.cpp, Ollama)
Local backends using strict chat templates (e.g. Mistral 3) rejected
Flynn's Anthropic-style tool_use/tool_result content blocks, causing
'roles must alternate' errors. Added getMessageTextWithTools() and
normalizeMessagesForLocal() to serialize structured blocks to plain
text, drop empty messages, and merge consecutive same-role messages.
Also fixed compaction to ensure kept messages start with user role.
2026-02-10 22:04:17 -08:00

823 lines
26 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { vi } from 'vitest';
import type { Attachment } from '../channels/types.js';
import type { Message } from './types.js';
import {
isSupportedImage,
isSupportedAudio,
attachmentToImageSource,
buildUserMessage,
getMessageText,
getMessageTextWithTools,
normalizeMessagesForLocal,
hasImages,
transcribeAudio,
buildUserMessageWithAudio,
type AudioTranscriptionConfig,
mimeToExtension,
} from './media.js';
// ---------------------------------------------------------------------------
// Helpers: reusable attachment fixtures
// ---------------------------------------------------------------------------
/**
 * Builds an attachment fixture from the supplied fields.
 *
 * `mimeType` is required; any other `Attachment` field may be supplied and is
 * copied through as-is. The result is a new object (shallow copy), so the
 * fixture does not alias the argument.
 */
function makeAttachment(overrides: Partial<Attachment> & { mimeType: string }): Attachment {
  const fixture: Attachment = { ...overrides };
  return fixture;
}
/** JPEG image carried inline; the payload is the base64 encoding of "hello". */
const jpegBase64Attachment = makeAttachment({
  mimeType: 'image/jpeg',
  data: 'aGVsbG8=',
  filename: 'photo.jpg',
});

/** PNG image referenced by URL only (no inline data). */
const pngUrlAttachment = makeAttachment({
  mimeType: 'image/png',
  url: 'https://example.com/image.png',
});

/** Non-image, non-audio attachment used for the "unsupported" paths. */
const pdfAttachment = makeAttachment({
  mimeType: 'application/pdf',
  data: 'cGRm',
  filename: 'doc.pdf',
});

/** OGG voice note with a placeholder payload. */
const oggAudioAttachment = makeAttachment({
  mimeType: 'audio/ogg',
  data: 'AAAAAAAAAAAAAAAAAAAA',
  filename: 'voice.ogg',
});

/** MP3 audio with a short placeholder payload. */
const mp3AudioAttachment = makeAttachment({
  mimeType: 'audio/mpeg',
  data: 'AAAAAQAAAAAAAEAAABkAAABTQA=',
  filename: 'audio.mp3',
});

/** WAV audio; the payload starts with a RIFF/WAVE header. */
const wavAudioAttachment = makeAttachment({
  mimeType: 'audio/wav',
  data: 'UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=',
  filename: 'audio.wav',
});

/** M4A audio with a placeholder payload. */
const m4aAudioAttachment = makeAttachment({
  mimeType: 'audio/x-m4a',
  data: 'AAAAUGV0Zi4xLjAgc291cmNlIGZvciBzdGFydHBvaW50',
  filename: 'audio.m4a',
});
// ---------------------------------------------------------------------------
// 1. isSupportedImage
// ---------------------------------------------------------------------------
describe('isSupportedImage', () => {
  // MIME types the suite expects to be accepted as images.
  const acceptedTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
  // MIME types the suite expects to be rejected (other image, document, audio, text).
  const rejectedTypes = ['image/bmp', 'application/pdf', 'audio/mp3', 'text/plain'];

  it.each(acceptedTypes)('returns true for supported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedImage(attachment)).toBe(true);
  });

  it.each(rejectedTypes)('returns false for unsupported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedImage(attachment)).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// 2. attachmentToImageSource
// ---------------------------------------------------------------------------
describe('attachmentToImageSource', () => {
  // Inline data is expected to yield a base64 source with the attachment's MIME type.
  it('returns base64 ImageSource when attachment has data', () => {
    const expected = {
      type: 'base64',
      media_type: 'image/jpeg',
      data: 'aGVsbG8=',
    };
    expect(attachmentToImageSource(jpegBase64Attachment)).toEqual(expected);
  });

  // With only a URL available, a url source is expected.
  it('returns url ImageSource when attachment has url but no data', () => {
    const expected = {
      type: 'url',
      media_type: 'image/png',
      url: 'https://example.com/image.png',
    };
    expect(attachmentToImageSource(pngUrlAttachment)).toEqual(expected);
  });

  // When both representations exist, the inline data should win.
  it('prefers base64 data over url when both are present', () => {
    const dataAndUrl = makeAttachment({
      mimeType: 'image/webp',
      data: 'YWJj',
      url: 'https://example.com/img.webp',
    });
    const source = attachmentToImageSource(dataAndUrl);
    expect(source).toEqual({
      type: 'base64',
      media_type: 'image/webp',
      data: 'YWJj',
    });
  });

  // A non-image MIME type is expected to produce null.
  it('returns null for unsupported MIME type', () => {
    const source = attachmentToImageSource(pdfAttachment);
    expect(source).toBeNull();
  });

  // A supported MIME type with nothing to point at is expected to produce null.
  it('returns null when attachment has neither data nor url', () => {
    const emptyGif = makeAttachment({ mimeType: 'image/gif' });
    const source = attachmentToImageSource(emptyGif);
    expect(source).toBeNull();
  });
});
// ---------------------------------------------------------------------------
// 3. buildUserMessage
// ---------------------------------------------------------------------------
describe('buildUserMessage', () => {
  // Without an attachments argument the content stays a plain string.
  it('returns plain string content when no attachments', () => {
    expect(buildUserMessage('Hello')).toEqual({ role: 'user', content: 'Hello' });
  });

  // An empty attachment list behaves like no attachments at all.
  it('returns plain string content when attachments is empty array', () => {
    expect(buildUserMessage('Hello', [])).toEqual({ role: 'user', content: 'Hello' });
  });

  // Attachments that are not supported images do not turn the message multimodal.
  it('returns plain string content when no image attachments (PDF only)', () => {
    const message = buildUserMessage('See attached', [pdfAttachment]);
    expect(message).toEqual({ role: 'user', content: 'See attached' });
  });

  // Text plus one image: expect a text part followed by an image part.
  it('returns multimodal message with text + image parts', () => {
    const message = buildUserMessage('Look at this', [jpegBase64Attachment]);
    expect(message.role).toBe('user');
    expect(Array.isArray(message.content)).toBe(true);

    const blocks = message.content as Array<{ type: string }>;
    expect(blocks).toHaveLength(2);
    const [textBlock, imageBlock] = blocks;
    expect(textBlock).toEqual({ type: 'text', text: 'Look at this' });
    expect(imageBlock).toEqual({
      type: 'image',
      source: { type: 'base64', media_type: 'image/jpeg', data: 'aGVsbG8=' },
    });
  });

  // Empty text is left out, so only the image part remains.
  it('returns multimodal message with just image part when text is empty', () => {
    const message = buildUserMessage('', [pngUrlAttachment]);
    expect(message.role).toBe('user');

    const blocks = message.content as Array<{ type: string }>;
    expect(blocks).toHaveLength(1);
    expect(blocks[0]).toEqual({
      type: 'image',
      source: { type: 'url', media_type: 'image/png', url: 'https://example.com/image.png' },
    });
  });

  // Several images: one text part followed by one part per image.
  it('handles multiple image attachments', () => {
    const images = [jpegBase64Attachment, pngUrlAttachment];
    const message = buildUserMessage('Two images', images);

    const blocks = message.content as Array<{ type: string }>;
    expect(blocks).toHaveLength(3); // text + 2 images
    expect(blocks[0]).toEqual({ type: 'text', text: 'Two images' });
    expect(blocks[1]).toMatchObject({ type: 'image' });
    expect(blocks[2]).toMatchObject({ type: 'image' });
  });
});
// ---------------------------------------------------------------------------
// 4. getMessageText
// ---------------------------------------------------------------------------
describe('getMessageText', () => {
  // String content comes back untouched.
  it('returns string directly for string content messages', () => {
    const plain: Message = { role: 'user', content: 'plain text' };
    const text = getMessageText(plain);
    expect(text).toBe('plain text');
  });

  // Text parts are concatenated in order; the image part in between contributes nothing.
  it('extracts and joins text parts from multimodal messages', () => {
    const multimodal: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'Hello ' },
        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'x' } },
        { type: 'text', text: 'World' },
      ],
    };
    const text = getMessageText(multimodal);
    expect(text).toBe('Hello World');
  });

  // With no text parts at all the result is the empty string.
  it('returns empty string for multimodal messages with only image parts', () => {
    const imageOnly: Message = {
      role: 'user',
      content: [
        { type: 'image', source: { type: 'url', media_type: 'image/gif', url: 'https://example.com/a.gif' } },
      ],
    };
    const text = getMessageText(imageOnly);
    expect(text).toBe('');
  });
});
// ---------------------------------------------------------------------------
// 5. hasImages
// ---------------------------------------------------------------------------
describe('hasImages', () => {
  // Plain string content cannot carry an image.
  it('returns false for string content messages', () => {
    const plain: Message = { role: 'user', content: 'no images here' };
    const detected = hasImages(plain);
    expect(detected).toBe(false);
  });

  // Array content made only of text parts is still image-free.
  it('returns false for multimodal messages with only text parts', () => {
    const textOnly: Message = {
      role: 'user',
      content: [{ type: 'text', text: 'just text' }],
    };
    const detected = hasImages(textOnly);
    expect(detected).toBe(false);
  });

  // One image part among the content is enough to report true.
  it('returns true for multimodal messages with image parts', () => {
    const captioned: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'caption' },
        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
      ],
    };
    const detected = hasImages(captioned);
    expect(detected).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// 6. isSupportedAudio
// ---------------------------------------------------------------------------
describe('isSupportedAudio', () => {
  // MIME types the suite expects to be accepted as audio.
  const acceptedTypes = [
    'audio/ogg',
    'audio/mpeg',
    'audio/mp3',
    'audio/wav',
    'audio/webm',
    'audio/mp4',
    'audio/x-m4a',
  ];
  // Other audio formats and non-audio types the suite expects to be rejected.
  const rejectedTypes = [
    'audio/flac',
    'audio/aac',
    'audio/wma',
    'application/pdf',
    'image/jpeg',
    'text/plain',
  ];

  it.each(acceptedTypes)('returns true for supported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedAudio(attachment)).toBe(true);
  });

  it.each(rejectedTypes)('returns false for unsupported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedAudio(attachment)).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// 7. mimeToExtension
// ---------------------------------------------------------------------------
describe('mimeToExtension', () => {
  // Known audio MIME types paired with the extension each is expected to map to.
  // A single table replaces six copy-pasted cases; the generated test names are
  // unchanged because `%s` only consumes the first column.
  it.each([
    ['audio/ogg', 'ogg'],
    ['audio/mpeg', 'mp3'],
    ['audio/wav', 'wav'],
    ['audio/webm', 'webm'],
    ['audio/mp4', 'm4a'],
    ['audio/x-m4a', 'm4a'],
  ])('returns correct extension for %s', (mime, extension) => {
    expect(mimeToExtension(mime)).toBe(extension);
  });

  // Anything outside the table falls back to the generic "bin" extension.
  it('returns bin for unknown MIME type', () => {
    expect(mimeToExtension('audio/flac')).toBe('bin');
  });
});
// ---------------------------------------------------------------------------
// 8. transcribeAudio
// ---------------------------------------------------------------------------
describe('transcribeAudio', () => {
  const mockTranscript = 'Hello, this is a test transcription';
  const originalFetch = global.fetch;

  // Every test gets a fresh fetch mock so no real HTTP request is made;
  // the real implementation is restored afterwards.
  beforeEach(() => {
    global.fetch = vi.fn();
  });
  afterEach(() => {
    global.fetch = originalFetch;
  });

  // Positive: a successful response yields the transcript text.
  it('transcribes audio successfully with valid config', async () => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: true,
      json: async () => ({ text: mockTranscript }),
    } as Response);
    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.example.com/v1/audio/transcriptions',
      apiKey: 'test-key',
      model: 'test-model',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe(mockTranscript);
    expect(global.fetch).toHaveBeenCalledWith(
      'https://api.example.com/v1/audio/transcriptions',
      expect.objectContaining({
        method: 'POST',
        body: expect.any(FormData),
      }),
    );
  });

  // Negative: with no config there is nothing to call, so a placeholder is returned.
  it('returns placeholder message when endpoint is not configured', async () => {
    const result = await transcribeAudio(oggAudioAttachment);

    expect(result).toBe('[Audio message received but no transcription service is configured]');
    // No endpoint means no request should have been issued at all.
    expect(global.fetch).not.toHaveBeenCalled();
  });

  // Negative: a non-OK HTTP response is reported as a failed transcription.
  it('returns placeholder message when API returns error', async () => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: false,
      status: 500,
      statusText: 'Internal Server Error',
      text: async () => 'Internal Server Error',
    } as Response);
    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.example.com/v1/audio/transcriptions',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe('[Audio message transcription failed]');
  });

  // Negative: a rejected fetch is reported the same way instead of throwing.
  it('returns placeholder message on network error', async () => {
    vi.mocked(global.fetch).mockRejectedValue(new Error('Network error'));
    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.example.com/v1/audio/transcriptions',
    };

    const result = await transcribeAudio(oggAudioAttachment, config);

    expect(result).toBe('[Audio message transcription failed]');
  });

  // Positive: when the config names no model, whisper-1 is sent.
  it('uses whisper-1 model by default', async () => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: true,
      json: async () => ({ text: 'test' }),
    } as Response);
    const config: AudioTranscriptionConfig = {
      endpoint: 'https://api.openai.com/v1/audio/transcriptions',
    };

    await transcribeAudio(oggAudioAttachment, config);

    expect(global.fetch).toHaveBeenCalledWith(
      'https://api.openai.com/v1/audio/transcriptions',
      expect.objectContaining({
        body: expect.any(FormData),
      }),
    );

    // Inspect the multipart body so the test actually pins the default model.
    // NOTE(review): assumes the model is sent as the `model` form field, as in
    // the OpenAI transcription API; confirm against media.ts.
    const requestInit = vi.mocked(global.fetch).mock.calls[0]?.[1];
    const form = requestInit?.body;
    expect(form).toBeInstanceOf(FormData);
    expect((form as FormData).get('model')).toBe('whisper-1');
  });
});
// ---------------------------------------------------------------------------
// 9. buildUserMessageWithAudio
// ---------------------------------------------------------------------------
describe('buildUserMessageWithAudio', () => {
  const textMessage = 'What is 2 + 2?';
  const originalFetch = global.fetch;

  // Every test gets a fresh fetch mock so no real HTTP request is made;
  // the real implementation is restored afterwards.
  beforeEach(() => {
    global.fetch = vi.fn();
  });
  afterEach(() => {
    global.fetch = originalFetch;
  });

  /** Returns a fresh transcription config pointing at the fake endpoint. */
  const makeConfig = (): AudioTranscriptionConfig => ({
    endpoint: 'https://api.example.com/v1/audio/transcriptions',
  });

  /** Makes every mocked fetch call resolve with a successful transcript of `text`. */
  const mockTranscription = (text: string): void => {
    vi.mocked(global.fetch).mockResolvedValue({
      ok: true,
      json: async () => ({ text }),
    } as Response);
  };

  // Positive: plain text message when no attachments.
  it('returns plain text message when no attachments', async () => {
    const result = await buildUserMessageWithAudio(textMessage);
    expect(result).toEqual({ role: 'user', content: textMessage });
  });

  // Positive: includes transcription when audio attachment present.
  it('includes transcription when audio attachment is present', async () => {
    mockTranscription('The answer is 4');

    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], makeConfig());

    expect(result.role).toBe('user');
    expect(result.content).toContain('[Voice message]:');
    expect(result.content).toContain('The answer is 4');
    expect(result.content).toContain(textMessage);
  });

  // Positive: every audio attachment is transcribed, not just the first.
  it('transcribes multiple audio attachments', async () => {
    mockTranscription('The answer is 4');

    const result = await buildUserMessageWithAudio(
      textMessage,
      [oggAudioAttachment, mp3AudioAttachment],
      makeConfig(),
    );

    // Count occurrences rather than asserting containment twice: a single
    // transcript must not be enough to satisfy this test.
    const marker = '[Voice message]: The answer is 4';
    const occurrences = getMessageText(result).split(marker).length - 1;
    expect(occurrences).toBe(2);
    // One transcription request per audio attachment.
    expect(global.fetch).toHaveBeenCalledTimes(2);
  });

  // Positive: audio transcripts appear before original text.
  it('places audio transcripts before original message text', async () => {
    mockTranscription('The answer is 4');

    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment], makeConfig());

    // Normalize string content to the part-array shape so one lookup covers both.
    const content = Array.isArray(result.content)
      ? result.content
      : [{ type: 'text' as const, text: result.content }];
    const textPart = content.find((p) => p.type === 'text') as { type: 'text'; text: string } | undefined;
    expect(textPart).toBeDefined();

    const textContent = textPart?.text ?? '';
    const firstVoiceIndex = textContent.indexOf('[Voice message]:');
    const textIndex = textContent.indexOf(textMessage);
    // Both pieces must actually be present; otherwise indexOf returns -1 and the
    // ordering comparison below could pass without a transcript being there.
    expect(firstVoiceIndex).toBeGreaterThanOrEqual(0);
    expect(textIndex).toBeGreaterThanOrEqual(0);
    expect(firstVoiceIndex).toBeLessThan(textIndex);
  });

  // Positive: handles mixed image and audio attachments.
  it('handles mixed image and audio attachments', async () => {
    mockTranscription('The answer is 4');

    const result = await buildUserMessageWithAudio(
      textMessage,
      [jpegBase64Attachment, oggAudioAttachment, pngUrlAttachment],
      makeConfig(),
    );

    expect(result.role).toBe('user');
    expect(Array.isArray(result.content)).toBe(true);
    const parts = result.content as Array<{ type: string; text?: string }>;
    // One combined text part (transcript + original text) plus two image parts.
    expect(parts).toHaveLength(3);
    const textPart = parts.find((p) => p.type === 'text');
    expect(textPart?.text).toContain('[Voice message]:');
    expect(textPart?.text).toContain(textMessage);
    const imageParts = parts.filter((p) => p.type === 'image');
    expect(imageParts).toHaveLength(2);
  });

  // Positive: no transcription when audio config is missing.
  it('returns original message when audio config is missing', async () => {
    const result = await buildUserMessageWithAudio(textMessage, [oggAudioAttachment]);
    expect(result).toEqual({ role: 'user', content: textMessage });
  });

  // Positive: empty text with audio attachments.
  it('handles empty text with audio attachments', async () => {
    mockTranscription('Test');

    const result = await buildUserMessageWithAudio('', [oggAudioAttachment], makeConfig());

    expect(result.role).toBe('user');
    expect(result.content).toContain('[Voice message]:');
  });
});
// ---------------------------------------------------------------------------
// 10. getMessageTextWithTools
// ---------------------------------------------------------------------------
describe('getMessageTextWithTools', () => {
  // String content is returned unchanged.
  it('returns string directly for string content', () => {
    const plain: Message = { role: 'user', content: 'plain text' };
    const text = getMessageTextWithTools(plain);
    expect(text).toBe('plain text');
  });

  // Text parts are joined with a newline (trailing space of the first part is kept).
  it('extracts text from text-only array content', () => {
    const textOnly: Message = {
      role: 'assistant',
      content: [
        { type: 'text', text: 'Hello ' },
        { type: 'text', text: 'World' },
      ],
    };
    const text = getMessageTextWithTools(textOnly);
    expect(text).toBe('Hello \nWorld');
  });

  // A tool call is rendered as "[Calling tool: name(<JSON input>)]".
  it('serializes tool_use blocks to readable text', () => {
    const toolCall = {
      role: 'assistant',
      content: [
        { type: 'tool_use', name: 'search', input: { query: 'foo' } },
      ],
    } as unknown as Message;
    const text = getMessageTextWithTools(toolCall);
    expect(text).toBe('[Calling tool: search({"query":"foo"})]');
  });

  // A tool result is rendered as "[Tool result: <content>]".
  it('serializes tool_result blocks to readable text', () => {
    const toolResult = {
      role: 'user',
      content: [
        { type: 'tool_result', content: 'Found 3 results' },
      ],
    } as unknown as Message;
    const text = getMessageTextWithTools(toolResult);
    expect(text).toBe('[Tool result: Found 3 results]');
  });

  // Results flagged with is_error get an "(error)" marker.
  it('marks error tool_result blocks', () => {
    const failedResult = {
      role: 'user',
      content: [
        { type: 'tool_result', content: 'File not found', is_error: true },
      ],
    } as unknown as Message;
    const text = getMessageTextWithTools(failedResult);
    expect(text).toBe('[Tool result (error): File not found]');
  });

  // Different block kinds are serialized in order and separated by newlines.
  it('handles mixed content (text + tool_use + tool_result) joined with newline', () => {
    const mixed = {
      role: 'assistant',
      content: [
        { type: 'text', text: 'Let me search for that.' },
        { type: 'tool_use', name: 'web_search', input: { q: 'test' } },
        { type: 'tool_result', content: 'No results' },
      ],
    } as unknown as Message;
    const expected =
      'Let me search for that.\n[Calling tool: web_search({"q":"test"})]\n[Tool result: No results]';
    expect(getMessageTextWithTools(mixed)).toBe(expected);
  });

  // No blocks at all serializes to the empty string.
  it('returns empty string for empty array content', () => {
    const empty: Message = {
      role: 'assistant',
      content: [],
    };
    const text = getMessageTextWithTools(empty);
    expect(text).toBe('');
  });
});
// ---------------------------------------------------------------------------
// 11. normalizeMessagesForLocal
// ---------------------------------------------------------------------------
describe('normalizeMessagesForLocal', () => {
  // Already-alternating plain text needs no changes.
  it('passes through simple text messages', () => {
    const history: Message[] = [
      { role: 'user', content: 'Hello' },
      { role: 'assistant', content: 'Hi there' },
    ];
    const normalized = normalizeMessagesForLocal(undefined, history);
    expect(normalized).toEqual([
      { role: 'user', content: 'Hello' },
      { role: 'assistant', content: 'Hi there' },
    ]);
  });

  // A system prompt, when given, becomes the first message.
  it('prepends system message when provided', () => {
    const history: Message[] = [
      { role: 'user', content: 'Hello' },
    ];
    const normalized = normalizeMessagesForLocal('You are helpful.', history);
    expect(normalized).toEqual([
      { role: 'system', content: 'You are helpful.' },
      { role: 'user', content: 'Hello' },
    ]);
  });

  // No system prompt means no system entry in the output.
  it('omits system message when undefined', () => {
    const history: Message[] = [
      { role: 'user', content: 'Hello' },
    ];
    const normalized = normalizeMessagesForLocal(undefined, history);
    expect(normalized).toEqual([
      { role: 'user', content: 'Hello' },
    ]);
  });

  // Back-to-back messages from the same role collapse into one, separated by a blank line.
  it('merges consecutive same-role messages', () => {
    const history: Message[] = [
      { role: 'user', content: 'Part 1' },
      { role: 'user', content: 'Part 2' },
      { role: 'assistant', content: 'Response' },
    ];
    const normalized = normalizeMessagesForLocal(undefined, history);
    expect(normalized).toEqual([
      { role: 'user', content: 'Part 1\n\nPart 2' },
      { role: 'assistant', content: 'Response' },
    ]);
  });

  // A message whose serialized text is empty disappears from the output.
  it('drops empty messages (e.g. image-only content that serializes to "")', () => {
    const history: Message[] = [
      { role: 'user', content: 'Before' },
      {
        role: 'user',
        content: [
          { type: 'image', source: { type: 'url', media_type: 'image/png', url: 'https://example.com/img.png' } },
        ],
      },
      { role: 'assistant', content: 'After' },
    ];
    const normalized = normalizeMessagesForLocal(undefined, history);
    expect(normalized).toEqual([
      { role: 'user', content: 'Before' },
      { role: 'assistant', content: 'After' },
    ]);
  });

  it('handles realistic agent tool loop sequence', () => {
    // Conversation shape: user question → assistant tool call → user-side tool result → assistant answer.
    const history = [
      { role: 'user', content: 'What is the weather?' },
      {
        role: 'assistant',
        content: [
          { type: 'text', text: 'Let me check.' },
          { type: 'tool_use', name: 'get_weather', input: { city: 'London' } },
        ],
      },
      {
        role: 'user',
        content: [
          { type: 'tool_result', content: 'Sunny, 22°C' },
        ],
      },
      { role: 'assistant', content: 'The weather in London is sunny at 22°C.' },
    ] as unknown as Message[];
    const normalized = normalizeMessagesForLocal('You are a weather bot.', history);
    // Structured blocks come out as plain text and the roles still alternate.
    expect(normalized).toEqual([
      { role: 'system', content: 'You are a weather bot.' },
      { role: 'user', content: 'What is the weather?' },
      { role: 'assistant', content: 'Let me check.\n[Calling tool: get_weather({"city":"London"})]' },
      { role: 'user', content: '[Tool result: Sunny, 22°C]' },
      { role: 'assistant', content: 'The weather in London is sunny at 22°C.' },
    ]);
  });

  // Nothing survives when every message is empty and there is no system prompt.
  it('returns empty array when all messages are empty', () => {
    const history: Message[] = [
      { role: 'user', content: '' },
      { role: 'assistant', content: '' },
    ];
    const normalized = normalizeMessagesForLocal(undefined, history);
    expect(normalized).toEqual([]);
  });

  // The system prompt is kept even when all conversation messages are dropped.
  it('returns only system message when all messages are empty but system is set', () => {
    const history: Message[] = [
      { role: 'user', content: '' },
    ];
    const normalized = normalizeMessagesForLocal('System prompt', history);
    expect(normalized).toEqual([
      { role: 'system', content: 'System prompt' },
    ]);
  });
});