feat: add multimodal media pipeline for image support across all providers and channels

Widen Message.content from string to string | MessageContentPart[] to support
multimodal content. Add Attachment type to channel layer, media conversion
utilities, and image extraction to all channel adapters (Telegram, Discord,
Slack, WhatsApp). Update all model clients (Anthropic, OpenAI, Gemini, Bedrock)
to convert structured content to provider-specific formats. Fix downstream
consumers (tokens, compaction, TUI, local models) to handle the widened type
via getMessageText() helper.
This commit is contained in:
William Valentin
2026-02-06 17:17:21 -08:00
parent cfdd448495
commit a515912537
22 changed files with 788 additions and 37 deletions
+261
View File
@@ -0,0 +1,261 @@
import { describe, it, expect } from 'vitest';
import type { Attachment } from '../channels/types.js';
import type { Message } from './types.js';
import {
isSupportedImage,
attachmentToImageSource,
buildUserMessage,
getMessageText,
hasImages,
} from './media.js';
// ---------------------------------------------------------------------------
// Helpers: reusable attachment fixtures
// ---------------------------------------------------------------------------
/**
 * Builds an attachment fixture from the supplied fields.
 *
 * `mimeType` must always be given; any other `Attachment` field may be
 * supplied alongside it. The returned object is a fresh shallow copy of
 * `overrides`, so the caller's object is never handed back directly.
 */
function makeAttachment(overrides: Partial<Attachment> & { mimeType: string }): Attachment {
  return Object.assign({}, overrides);
}
/** JPEG image fixture carrying inline base64 data and a filename. */
const jpegBase64Attachment = makeAttachment({
  mimeType: 'image/jpeg',
  data: 'aGVsbG8=', // base64 encoding of "hello"
  filename: 'photo.jpg',
});

/** PNG image fixture referenced only by URL (no inline data). */
const pngUrlAttachment = makeAttachment({
  mimeType: 'image/png',
  url: 'https://example.com/image.png',
});

/** PDF fixture; used below as the non-image attachment case. */
const pdfAttachment = makeAttachment({
  mimeType: 'application/pdf',
  data: 'cGRm',
  filename: 'doc.pdf',
});
// ---------------------------------------------------------------------------
// 1. isSupportedImage
// ---------------------------------------------------------------------------
describe('isSupportedImage', () => {
  // MIME types the suite expects to be accepted as images.
  const supportedMimeTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
  // MIME types the suite expects to be rejected (including a non-listed image type).
  const unsupportedMimeTypes = ['image/bmp', 'application/pdf', 'audio/mp3', 'text/plain'];

  // Accept path: each listed type yields true.
  it.each(supportedMimeTypes)('returns true for supported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedImage(attachment)).toBe(true);
  });

  // Reject path: each listed type yields false.
  it.each(unsupportedMimeTypes)('returns false for unsupported type %s', (mimeType) => {
    const attachment = makeAttachment({ mimeType });
    expect(isSupportedImage(attachment)).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// 2. attachmentToImageSource
// ---------------------------------------------------------------------------
describe('attachmentToImageSource', () => {
  // Inline data is converted to a base64 source.
  it('returns base64 ImageSource when attachment has data', () => {
    const expected = {
      type: 'base64',
      media_type: 'image/jpeg',
      data: 'aGVsbG8=',
    };
    expect(attachmentToImageSource(jpegBase64Attachment)).toEqual(expected);
  });

  // With only a URL available, a url source is produced.
  it('returns url ImageSource when attachment has url but no data', () => {
    const expected = {
      type: 'url',
      media_type: 'image/png',
      url: 'https://example.com/image.png',
    };
    expect(attachmentToImageSource(pngUrlAttachment)).toEqual(expected);
  });

  // When both fields are set, the inline data wins over the URL.
  it('prefers base64 data over url when both are present', () => {
    const dataAndUrl = makeAttachment({
      mimeType: 'image/webp',
      data: 'YWJj',
      url: 'https://example.com/img.webp',
    });
    const source = attachmentToImageSource(dataAndUrl);
    expect(source).toEqual({
      type: 'base64',
      media_type: 'image/webp',
      data: 'YWJj',
    });
  });

  // Non-image MIME types are rejected with null.
  it('returns null for unsupported MIME type', () => {
    const source = attachmentToImageSource(pdfAttachment);
    expect(source).toBeNull();
  });

  // A supported MIME type without any payload also yields null.
  it('returns null when attachment has neither data nor url', () => {
    const payloadless = makeAttachment({ mimeType: 'image/gif' });
    const source = attachmentToImageSource(payloadless);
    expect(source).toBeNull();
  });
});
// ---------------------------------------------------------------------------
// 3. buildUserMessage
// ---------------------------------------------------------------------------
describe('buildUserMessage', () => {
  // Text only, attachments argument omitted: content stays a plain string.
  it('returns plain string content when no attachments', () => {
    const message = buildUserMessage('Hello');
    expect(message).toEqual({ role: 'user', content: 'Hello' });
  });

  // Text with an explicit empty attachment list: content stays a plain string.
  it('returns plain string content when attachments is empty array', () => {
    const message = buildUserMessage('Hello', []);
    expect(message).toEqual({ role: 'user', content: 'Hello' });
  });

  // Attachments present but none is an image: content stays a plain string.
  it('returns plain string content when no image attachments (PDF only)', () => {
    const message = buildUserMessage('See attached', [pdfAttachment]);
    expect(message).toEqual({ role: 'user', content: 'See attached' });
  });

  // Text plus one image: content becomes [text part, image part].
  it('returns multimodal message with text + image parts', () => {
    const message = buildUserMessage('Look at this', [jpegBase64Attachment]);
    expect(message.role).toBe('user');
    expect(Array.isArray(message.content)).toBe(true);

    const contentParts = message.content as Array<{ type: string }>;
    expect(contentParts).toHaveLength(2);

    const expectedText = { type: 'text', text: 'Look at this' };
    const expectedImage = {
      type: 'image',
      source: { type: 'base64', media_type: 'image/jpeg', data: 'aGVsbG8=' },
    };
    expect(contentParts[0]).toEqual(expectedText);
    expect(contentParts[1]).toEqual(expectedImage);
  });

  // Empty text plus one image: the text part is left out entirely.
  it('returns multimodal message with just image part when text is empty', () => {
    const message = buildUserMessage('', [pngUrlAttachment]);
    expect(message.role).toBe('user');

    const contentParts = message.content as Array<{ type: string }>;
    // Only the image part remains because the empty text is omitted.
    expect(contentParts).toHaveLength(1);

    const expectedImage = {
      type: 'image',
      source: { type: 'url', media_type: 'image/png', url: 'https://example.com/image.png' },
    };
    expect(contentParts[0]).toEqual(expectedImage);
  });

  // Text plus two images: one text part followed by two image parts.
  it('handles multiple image attachments', () => {
    const images = [jpegBase64Attachment, pngUrlAttachment];
    const message = buildUserMessage('Two images', images);

    const contentParts = message.content as Array<{ type: string }>;
    expect(contentParts).toHaveLength(3); // 1 text part + 2 image parts
    expect(contentParts[0]).toEqual({ type: 'text', text: 'Two images' });
    expect(contentParts[1]).toMatchObject({ type: 'image' });
    expect(contentParts[2]).toMatchObject({ type: 'image' });
  });
});
// ---------------------------------------------------------------------------
// 4. getMessageText
// ---------------------------------------------------------------------------
describe('getMessageText', () => {
  // String content is handed back unchanged.
  it('returns string directly for string content messages', () => {
    const textOnly: Message = { role: 'user', content: 'plain text' };
    const text = getMessageText(textOnly);
    expect(text).toBe('plain text');
  });

  // Text parts are concatenated in order; the image part between them contributes nothing.
  it('extracts and joins text parts from multimodal messages', () => {
    const mixed: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'Hello ' },
        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'x' } },
        { type: 'text', text: 'World' },
      ],
    };
    const text = getMessageText(mixed);
    expect(text).toBe('Hello World');
  });

  // With no text parts at all, the result is the empty string.
  it('returns empty string for multimodal messages with only image parts', () => {
    const imageOnly: Message = {
      role: 'user',
      content: [
        {
          type: 'image',
          source: { type: 'url', media_type: 'image/gif', url: 'https://example.com/a.gif' },
        },
      ],
    };
    const text = getMessageText(imageOnly);
    expect(text).toBe('');
  });
});
// ---------------------------------------------------------------------------
// 5. hasImages
// ---------------------------------------------------------------------------
describe('hasImages', () => {
  // Plain string content is reported as image-free.
  it('returns false for string content messages', () => {
    const textOnly: Message = { role: 'user', content: 'no images here' };
    const found = hasImages(textOnly);
    expect(found).toBe(false);
  });

  // Structured content made up solely of text parts is also image-free.
  it('returns false for multimodal messages with only text parts', () => {
    const textParts: Message = {
      role: 'user',
      content: [{ type: 'text', text: 'just text' }],
    };
    const found = hasImages(textParts);
    expect(found).toBe(false);
  });

  // An image part alongside a text part is detected.
  it('returns true for multimodal messages with image parts', () => {
    const captioned: Message = {
      role: 'user',
      content: [
        { type: 'text', text: 'caption' },
        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
      ],
    };
    const found = hasImages(captioned);
    expect(found).toBe(true);
  });
});