feat: add multimodal media pipeline for image support across all providers and channels

Widen Message.content from string to string | MessageContentPart[] to support multimodal content. Add Attachment type to channel layer, media conversion utilities, and image extraction to all channel adapters (Telegram, Discord, Slack, WhatsApp). Update all model clients (Anthropic, OpenAI, Gemini, Bedrock) to convert structured content to provider-specific formats. Fix downstream consumers (tokens, compaction, TUI, local models) to handle the widened type via getMessageText() helper.
2026-02-06 17:17:21 -08:00
parent cfdd448495
commit a515912537
22 changed files with 788 additions and 37 deletions
@@ -1,6 +1,6 @@
 import Anthropic from '@anthropic-ai/sdk';
-import type { Message } from '@anthropic-ai/sdk/resources/messages/messages.js';
-import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from './types.js';
+import type { Message as AnthropicMessage } from '@anthropic-ai/sdk/resources/messages/messages.js';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, Message, MessageContentPart } from './types.js';

 export interface AnthropicClientConfig {
  apiKey?: string;     // Falls back to ANTHROPIC_API_KEY env var
@@ -9,6 +9,42 @@ export interface AnthropicClientConfig {
  maxTokens?: number;
 }

+/**
+ * Convert Flynn message content into Anthropic content blocks.
+ *
+ * String content is returned unchanged. Array content is mapped part by part:
+ * text parts become `{ type: 'text' }` blocks and image parts become
+ * `{ type: 'image' }` blocks carrying either a base64 or a URL source.
+ *
+ * `ImageSource.data` and `ImageSource.url` are optional, so an image part may
+ * arrive without the payload its `type` implies. Rather than emitting a block
+ * with an `undefined` field, such a part falls back to the other payload if it
+ * is present, and otherwise to a text placeholder.
+ *
+ * @param content - Plain text, or an array of multimodal content parts.
+ * @returns The original string, or an array of Anthropic-style content blocks.
+ */
+function toAnthropicContent(content: string | MessageContentPart[]): string | unknown[] {
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  return content.map((part): unknown => {
+    if (part.type === 'text') {
+      return { type: 'text', text: part.text };
+    }
+    if (part.type === 'image') {
+      const { source } = part;
+      if (source.type === 'base64' && source.data) {
+        return {
+          type: 'image',
+          source: {
+            type: 'base64',
+            media_type: source.media_type,
+            data: source.data,
+          },
+        };
+      }
+      // URL-based image (also used when a base64 source has no data but does have a URL)
+      if (source.url) {
+        return {
+          type: 'image',
+          source: {
+            type: 'url',
+            url: source.url,
+          },
+        };
+      }
+      // Neither payload is usable: keep the turn well-formed with a text stand-in.
+      return { type: 'text', text: '[Image: unavailable]' };
+    }
+    // Unknown part kinds are passed through untouched.
+    return part;
+  });
+}
+
 export class AnthropicClient implements ModelClient {
  private client: Anthropic;
  private model: string;
@@ -30,7 +66,7 @@ export class AnthropicClient implements ModelClient {
      system: request.system,
      messages: request.messages.map((m) => ({
        role: m.role,
-        content: m.content,
+        content: toAnthropicContent(m.content),
      })),
    };

@@ -38,7 +74,7 @@ export class AnthropicClient implements ModelClient {
      params.tools = request.tools;
    }

-    const response = await this.client.messages.create(params as unknown as Parameters<typeof this.client.messages.create>[0]) as Message;
+    const response = await this.client.messages.create(params as unknown as Parameters<typeof this.client.messages.create>[0]) as AnthropicMessage;

    const textContent = response.content.find((c) => c.type === 'text');
    const content = textContent?.type === 'text' ? textContent.text : '';
@@ -65,8 +101,8 @@ export class AnthropicClient implements ModelClient {
      system: request.system,
      messages: request.messages.map((m) => ({
        role: m.role,
-        content: m.content,
-      })),
+        content: toAnthropicContent(m.content),
+      })) as Parameters<typeof this.client.messages.stream>[0]['messages'],
    });

    try {
@@ -11,7 +11,7 @@ import type {
  ConverseCommandInput,
  ConverseStreamCommandInput,
 } from '@aws-sdk/client-bedrock-runtime';
-import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js';

 export interface BedrockClientConfig {
  model: string;
@@ -157,11 +157,34 @@ export class BedrockClient implements ModelClient {
  }
 }

-function convertMessages(messages: { role: string; content: string }[]): BedrockMessage[] {
-  return messages.map(m => ({
-    role: m.role === 'assistant' ? 'assistant' as const : 'user' as const,
-    content: [{ text: m.content }] as ContentBlock[],
-  }));
+/**
+ * Translate Flynn messages into Bedrock Converse messages.
+ *
+ * Any role other than 'assistant' is sent as 'user'. String content becomes a
+ * single text block; array content is converted part by part.
+ */
+function convertMessages(messages: Message[]): BedrockMessage[] {
+  // Convert one content part into a Bedrock content block.
+  const toBlock = (part: MessageContentPart): ContentBlock => {
+    if (part.type === 'text') {
+      return { text: part.text } as ContentBlock;
+    }
+
+    const { source } = part;
+    if (source.type === 'base64' && source.data) {
+      // Bedrock image blocks take a format (the MIME subtype) plus raw bytes.
+      const format = source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp';
+      const binary = atob(source.data);
+      const bytes = new Uint8Array(binary.length);
+      for (let i = 0; i < binary.length; i += 1) {
+        bytes[i] = binary.charCodeAt(i);
+      }
+      return { image: { format, source: { bytes } } } as unknown as ContentBlock;
+    }
+
+    // No inline bytes available (e.g. URL-only image): substitute a text placeholder.
+    return { text: `[Image: ${source.url ?? 'unsupported'}]` } as ContentBlock;
+  };
+
+  return messages.map((m): BedrockMessage => {
+    const role: 'assistant' | 'user' = m.role === 'assistant' ? 'assistant' : 'user';
+    const content = typeof m.content === 'string'
+      ? ([{ text: m.content }] as ContentBlock[])
+      : m.content.map(toBlock);
+    return { role, content };
+  });
 }

 function convertTools(tools: ToolDefinition[]): ToolConfiguration {
@@ -1,6 +1,6 @@
 import { GoogleGenerativeAI } from '@google/generative-ai';
-import type { GenerativeModel, Content, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai';
-import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js';
+import type { GenerativeModel, Content, Part, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js';

 export interface GeminiClientConfig {
  apiKey?: string;
@@ -154,12 +154,38 @@ export class GeminiClient implements ModelClient {
  }
 }

-/** Convert Flynn's Message[] to Gemini Content[] format */
-function convertMessages(messages: { role: string; content: string }[]): Content[] {
-  return messages.map(m => ({
-    role: m.role === 'assistant' ? 'model' : 'user',
-    parts: [{ text: m.content }],
-  }));
+/**
+ * Map Flynn messages onto Gemini `Content` entries.
+ *
+ * The 'assistant' role is renamed to 'model'; every other role is sent as
+ * 'user'. String content becomes a single text part, array content is
+ * converted part by part.
+ */
+function convertMessages(messages: Message[]): Content[] {
+  // Convert one content part into a Gemini part.
+  const toPart = (part: MessageContentPart): Part => {
+    switch (part.type) {
+      case 'text':
+        return { text: part.text };
+      case 'image': {
+        const { source } = part;
+        if (source.type === 'base64' && source.data) {
+          return { inlineData: { mimeType: source.media_type, data: source.data } };
+        }
+        // Images without inline base64 data are represented by a text placeholder.
+        return { text: `[Image: ${source.url ?? 'unavailable'}]` };
+      }
+      default:
+        // Unrecognised part kinds are serialised so nothing is dropped silently.
+        return { text: JSON.stringify(part) };
+    }
+  };
+
+  return messages.map((m): Content => {
+    const role = m.role === 'assistant' ? 'model' : 'user';
+    const parts: Part[] = typeof m.content === 'string'
+      ? [{ text: m.content }]
+      : m.content.map(toPart);
+    return { role, parts };
+  });
 }

 /** Convert Flynn's ToolDefinition to Gemini FunctionDeclaration format */
@@ -7,8 +7,17 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
 export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
+export {
+  isSupportedImage,
+  attachmentToImageSource,
+  buildUserMessage,
+  getMessageText,
+  hasImages,
+} from './media.js';
 export type {
  Message,
+  MessageContentPart,
+  ImageSource,
  ChatRequest,
  ChatResponse,
  ChatStreamEvent,
@@ -1,4 +1,5 @@
 import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';
+import { getMessageText } from '../media.js';

 export interface LlamaCppClientConfig {
  endpoint: string;
@@ -40,7 +41,7 @@ export class LlamaCppClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    const headers: Record<string, string> = {
@@ -94,7 +95,7 @@ export class LlamaCppClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    const headers: Record<string, string> = {
@@ -1,5 +1,6 @@
 import { Ollama } from 'ollama';
 import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';
+import { getMessageText } from '../media.js';

 export interface OllamaClientConfig {
  host?: string;
@@ -28,7 +29,7 @@ export class OllamaClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    const response = await this.client.chat({
@@ -57,7 +58,7 @@ export class OllamaClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    try {
@@ -0,0 +1,261 @@
+import { describe, it, expect } from 'vitest';
+import type { Attachment } from '../channels/types.js';
+import type { Message } from './types.js';
+import {
+  isSupportedImage,
+  attachmentToImageSource,
+  buildUserMessage,
+  getMessageText,
+  hasImages,
+} from './media.js';
+
+// ---------------------------------------------------------------------------
+// Helpers – reusable attachment fixtures
+// ---------------------------------------------------------------------------
+
+/** Build an Attachment fixture as a shallow copy of the given fields; `mimeType` is mandatory. */
+function makeAttachment(overrides: Partial<Attachment> & { mimeType: string }): Attachment {
+  const attachment: Attachment = Object.assign({}, overrides);
+  return attachment;
+}
+
+// JPEG image carried inline as base64 data, with a filename.
+const jpegBase64Attachment: Attachment = makeAttachment({
+  mimeType: 'image/jpeg',
+  data: 'aGVsbG8=', // "hello" in base64
+  filename: 'photo.jpg',
+});
+
+// PNG image referenced only by URL (no inline data).
+const pngUrlAttachment: Attachment = makeAttachment({
+  mimeType: 'image/png',
+  url: 'https://example.com/image.png',
+});
+
+// Non-image attachment, used to exercise the unsupported-MIME paths.
+const pdfAttachment: Attachment = makeAttachment({
+  mimeType: 'application/pdf',
+  data: 'cGRm',
+  filename: 'doc.pdf',
+});
+
+// ---------------------------------------------------------------------------
+// 1. isSupportedImage
+// ---------------------------------------------------------------------------
+
+describe('isSupportedImage', () => {
+  // Every case builds an attachment that carries only a mimeType, so the
+  // result depends on the MIME type alone.
+
+  // Positive: all four supported MIME types should return true.
+  it.each([
+    'image/jpeg',
+    'image/png',
+    'image/gif',
+    'image/webp',
+  ])('returns true for supported type %s', (mime) => {
+    expect(isSupportedImage(makeAttachment({ mimeType: mime }))).toBe(true);
+  });
+
+  // Negative: unsupported MIME types should return false
+  // (one other image format plus document, audio and text types).
+  it.each([
+    'image/bmp',
+    'application/pdf',
+    'audio/mp3',
+    'text/plain',
+  ])('returns false for unsupported type %s', (mime) => {
+    expect(isSupportedImage(makeAttachment({ mimeType: mime }))).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 2. attachmentToImageSource
+// ---------------------------------------------------------------------------
+
+describe('attachmentToImageSource', () => {
+  // Covers the three possible outcomes: a base64 source, a url source, and null.
+
+  // Positive: attachment with base64 data produces a base64 ImageSource.
+  it('returns base64 ImageSource when attachment has data', () => {
+    const result = attachmentToImageSource(jpegBase64Attachment);
+
+    // The fixture's filename is not part of the expected source.
+    expect(result).toEqual({
+      type: 'base64',
+      media_type: 'image/jpeg',
+      data: 'aGVsbG8=',
+    });
+  });
+
+  // Positive: attachment with url (no data) produces a url ImageSource.
+  it('returns url ImageSource when attachment has url but no data', () => {
+    const result = attachmentToImageSource(pngUrlAttachment);
+
+    expect(result).toEqual({
+      type: 'url',
+      media_type: 'image/png',
+      url: 'https://example.com/image.png',
+    });
+  });
+
+  // Positive: when both data and url are present, base64 is preferred.
+  it('prefers base64 data over url when both are present', () => {
+    const both = makeAttachment({
+      mimeType: 'image/webp',
+      data: 'YWJj',
+      url: 'https://example.com/img.webp',
+    });
+
+    const result = attachmentToImageSource(both);
+
+    // The url is dropped entirely from the resulting source.
+    expect(result).toEqual({
+      type: 'base64',
+      media_type: 'image/webp',
+      data: 'YWJj',
+    });
+  });
+
+  // Negative: unsupported MIME type returns null.
+  it('returns null for unsupported MIME type', () => {
+    expect(attachmentToImageSource(pdfAttachment)).toBeNull();
+  });
+
+  // Negative: supported MIME but neither data nor url returns null.
+  it('returns null when attachment has neither data nor url', () => {
+    const bare = makeAttachment({ mimeType: 'image/gif' });
+
+    expect(attachmentToImageSource(bare)).toBeNull();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 3. buildUserMessage
+// ---------------------------------------------------------------------------
+
+describe('buildUserMessage', () => {
+  // Content is expected to stay a plain string whenever no supported image is
+  // attached; otherwise it is an array with the text part (if any) first,
+  // followed by one image part per supported attachment.
+
+  // Positive: plain text message when no attachments argument is provided.
+  it('returns plain string content when no attachments', () => {
+    const msg = buildUserMessage('Hello');
+
+    expect(msg).toEqual({ role: 'user', content: 'Hello' });
+  });
+
+  // Positive: plain text message when attachments is an empty array.
+  it('returns plain string content when attachments is empty array', () => {
+    const msg = buildUserMessage('Hello', []);
+
+    expect(msg).toEqual({ role: 'user', content: 'Hello' });
+  });
+
+  // Positive: plain text message when attachments contain no supported images.
+  it('returns plain string content when no image attachments (PDF only)', () => {
+    const msg = buildUserMessage('See attached', [pdfAttachment]);
+
+    expect(msg).toEqual({ role: 'user', content: 'See attached' });
+  });
+
+  // Positive: multimodal message with text + image parts when image attachment present.
+  it('returns multimodal message with text + image parts', () => {
+    const msg = buildUserMessage('Look at this', [jpegBase64Attachment]);
+
+    expect(msg.role).toBe('user');
+    expect(Array.isArray(msg.content)).toBe(true);
+
+    // Cast is safe here because the Array.isArray assertion above has passed.
+    const parts = msg.content as Array<{ type: string }>;
+    expect(parts).toHaveLength(2);
+    expect(parts[0]).toEqual({ type: 'text', text: 'Look at this' });
+    expect(parts[1]).toEqual({
+      type: 'image',
+      source: { type: 'base64', media_type: 'image/jpeg', data: 'aGVsbG8=' },
+    });
+  });
+
+  // Positive: multimodal message with just image part when text is empty.
+  it('returns multimodal message with just image part when text is empty', () => {
+    const msg = buildUserMessage('', [pngUrlAttachment]);
+
+    expect(msg.role).toBe('user');
+    const parts = msg.content as Array<{ type: string }>;
+    // Empty text is omitted, only image part
+    expect(parts).toHaveLength(1);
+    expect(parts[0]).toEqual({
+      type: 'image',
+      source: { type: 'url', media_type: 'image/png', url: 'https://example.com/image.png' },
+    });
+  });
+
+  // Positive: handles multiple image attachments.
+  it('handles multiple image attachments', () => {
+    const msg = buildUserMessage('Two images', [jpegBase64Attachment, pngUrlAttachment]);
+
+    const parts = msg.content as Array<{ type: string }>;
+    expect(parts).toHaveLength(3); // text + 2 images
+    expect(parts[0]).toEqual({ type: 'text', text: 'Two images' });
+    // Only the part kind is checked here; exact sources are covered by the cases above.
+    expect(parts[1]).toMatchObject({ type: 'image' });
+    expect(parts[2]).toMatchObject({ type: 'image' });
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 4. getMessageText
+// ---------------------------------------------------------------------------
+
+describe('getMessageText', () => {
+  // Positive: returns string directly for string content.
+  it('returns string directly for string content messages', () => {
+    const msg: Message = { role: 'user', content: 'plain text' };
+
+    expect(getMessageText(msg)).toBe('plain text');
+  });
+
+  // Positive: extracts and joins text parts from multimodal messages.
+  // Text parts are concatenated with no separator: the space in the expected
+  // result comes from the trailing space in the first fixture part.
+  it('extracts and joins text parts from multimodal messages', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Hello ' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'x' } },
+        { type: 'text', text: 'World' },
+      ],
+    };
+
+    expect(getMessageText(msg)).toBe('Hello World');
+  });
+
+  // Negative: returns empty string for multimodal messages with only image parts.
+  it('returns empty string for multimodal messages with only image parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'url', media_type: 'image/gif', url: 'https://example.com/a.gif' } },
+      ],
+    };
+
+    expect(getMessageText(msg)).toBe('');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 5. hasImages
+// ---------------------------------------------------------------------------
+
+describe('hasImages', () => {
+  // Negative: string content never has images.
+  it('returns false for string content messages', () => {
+    const msg: Message = { role: 'user', content: 'no images here' };
+
+    expect(hasImages(msg)).toBe(false);
+  });
+
+  // Negative: multimodal messages with only text parts have no images.
+  // Array-shaped content alone is not enough; an image part must be present.
+  it('returns false for multimodal messages with only text parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [{ type: 'text', text: 'just text' }],
+    };
+
+    expect(hasImages(msg)).toBe(false);
+  });
+
+  // Positive: multimodal messages with image parts are detected.
+  // The image part here is preceded by a text part.
+  it('returns true for multimodal messages with image parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'caption' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    expect(hasImages(msg)).toBe(true);
+  });
+});
@@ -0,0 +1,101 @@
+/**
+ * Media utilities for converting channel attachments to model content parts.
+ */
+
+import type { Attachment } from '../channels/types.js';
+import type { MessageContentPart, ImageSource, Message } from './types.js';
+
+/**
+ * Image MIME types the media pipeline will forward to a model.
+ * Lookups are exact string matches against these entries.
+ */
+const SUPPORTED_IMAGE_TYPES: ReadonlySet<string> = new Set<string>(
+  ['image/jpeg', 'image/png', 'image/gif', 'image/webp'],
+);
+
+/**
+ * Report whether an attachment's MIME type is one of the supported image types.
+ *
+ * @param attachment - Channel attachment to inspect; only `mimeType` is read.
+ * @returns `true` when `mimeType` is in the supported set, otherwise `false`.
+ */
+export function isSupportedImage(attachment: Attachment): boolean {
+  const { mimeType } = attachment;
+  return SUPPORTED_IMAGE_TYPES.has(mimeType);
+}
+
+/**
+ * Turn a channel Attachment into a model ImageSource.
+ *
+ * Inline base64 `data` wins over `url` when both are set.
+ *
+ * @param attachment - Channel attachment to convert.
+ * @returns A base64 or url ImageSource, or `null` when the MIME type is not a
+ *          supported image type or the attachment has neither `data` nor `url`.
+ */
+export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
+  if (!isSupportedImage(attachment)) {
+    return null;
+  }
+
+  const { mimeType, data, url } = attachment;
+
+  if (data) {
+    return { type: 'base64', media_type: mimeType, data };
+  }
+
+  return url ? { type: 'url', media_type: mimeType, url } : null;
+}
+
+/**
+ * Assemble a user Message from text and optional attachments.
+ *
+ * Attachments that do not convert to an image source are ignored. When no
+ * image remains, the message keeps plain string content; otherwise content is
+ * an array holding the text part (omitted when `text` is empty) followed by
+ * the image parts in attachment order.
+ *
+ * @param text - Message text; may be empty.
+ * @param attachments - Optional channel attachments.
+ * @returns A Message with role 'user'.
+ */
+export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
+  const imageParts = (attachments ?? []).flatMap((att): MessageContentPart[] => {
+    const source = attachmentToImageSource(att);
+    return source ? [{ type: 'image', source }] : [];
+  });
+
+  // Without images, stay on the plain-string shape for backward compatibility.
+  if (imageParts.length === 0) {
+    return { role: 'user', content: text };
+  }
+
+  // Text (when non-empty) leads, images follow.
+  const parts: MessageContentPart[] = text
+    ? [{ type: 'text', text }, ...imageParts]
+    : [...imageParts];
+
+  return { role: 'user', content: parts };
+}
+
+/**
+ * Return the textual content of a Message whatever its content shape.
+ *
+ * String content is returned as-is. For array content, the `text` of every
+ * text part is concatenated in order with no separator; non-text parts are
+ * skipped.
+ *
+ * @param message - Message to read.
+ * @returns The message text, or an empty string when there are no text parts.
+ */
+export function getMessageText(message: Message): string {
+  const { content } = message;
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  const pieces: string[] = [];
+  for (const part of content) {
+    if (part.type === 'text') {
+      pieces.push(part.text);
+    }
+  }
+  return pieces.join('');
+}
+
+/**
+ * Tell whether a message carries at least one image content part.
+ *
+ * @param message - Message to inspect.
+ * @returns `false` for string content; for array content, `true` when any
+ *          part has type 'image'.
+ */
+export function hasImages(message: Message): boolean {
+  const { content } = message;
+  // "Some part is an image" expressed as "not every part is a non-image".
+  return typeof content === 'string'
+    ? false
+    : !content.every((part) => part.type !== 'image');
+}
@@ -1,5 +1,5 @@
 import OpenAI from 'openai';
-import type { ChatRequest, ChatResponse, ModelClient } from './types.js';
+import type { ChatRequest, ChatResponse, ModelClient, MessageContentPart } from './types.js';

 export interface OpenAIClientConfig {
  apiKey?: string;
@@ -8,6 +8,31 @@ export interface OpenAIClientConfig {
  baseURL?: string;
 }

+/**
+ * Convert Flynn message content to OpenAI format.
+ * OpenAI uses { type: 'text', text } and { type: 'image_url', image_url: { url } } parts.
+ *
+ * String content is returned unchanged. Base64 images are sent as
+ * `data:<media_type>;base64,<data>` URIs and URL images are sent as-is.
+ *
+ * `ImageSource.data` and `ImageSource.url` are optional, so an image part may
+ * arrive without the payload its `type` implies. Such a part falls back to the
+ * other payload if present, and otherwise to a text placeholder, instead of
+ * producing a data URI ending in "undefined" or an undefined url.
+ *
+ * @param content - Plain text, or an array of multimodal content parts.
+ * @returns The original string, or an array of OpenAI content parts.
+ */
+function toOpenAIContent(content: string | MessageContentPart[]): string | OpenAI.ChatCompletionContentPart[] {
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  return content.map((part): OpenAI.ChatCompletionContentPart => {
+    if (part.type === 'text') {
+      return { type: 'text', text: part.text };
+    }
+    if (part.type === 'image') {
+      const { source } = part;
+      // OpenAI accepts data URIs or regular URLs
+      if (source.type === 'base64' && source.data) {
+        return {
+          type: 'image_url',
+          image_url: { url: `data:${source.media_type};base64,${source.data}` },
+        };
+      }
+      if (source.url) {
+        return { type: 'image_url', image_url: { url: source.url } };
+      }
+      // Neither payload is usable: keep the message well-formed with a text stand-in.
+      return { type: 'text', text: '[Image: unavailable]' };
+    }
+    // Fallback — shouldn't happen
+    return { type: 'text', text: JSON.stringify(part) };
+  });
+}
+
 export class OpenAIClient implements ModelClient {
  private client: OpenAI;
  private model: string;
@@ -30,7 +55,10 @@ export class OpenAIClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({
+        role: msg.role,
+        content: toOpenAIContent(msg.content),
+      } as OpenAI.ChatCompletionMessageParam);
    }

    // Build params, conditionally including tools
@@ -1,6 +1,23 @@
+/**
+ * Image source for multimodal content blocks.
+ *
+ * `data` and `url` are both optional at the type level, so consumers must
+ * check that the field matching `type` is actually present before using it.
+ */
+export interface ImageSource {
+  /** How the image is carried: inline base64 `data` or a remote `url`. */
+  type: 'base64' | 'url';
+  /** MIME type (e.g. "image/jpeg", "image/png", "image/webp", "image/gif"). */
+  media_type: string;
+  /** Base64-encoded image data (when type === 'base64'). */
+  data?: string;
+  /** Image URL (when type === 'url'). */
+  url?: string;
+}
+
+/**
+ * Individual content part within a multimodal message.
+ * A union discriminated on `type`: 'text' parts carry `text`, 'image' parts
+ * carry an ImageSource in `source`.
+ */
+export type MessageContentPart =
+  | { type: 'text'; text: string }
+  | { type: 'image'; source: ImageSource };
+
 export interface Message {
  role: 'user' | 'assistant';
-  content: string;
+  /** String for text-only messages, or array for multimodal content. */
+  content: string | MessageContentPart[];
  timestamp?: number;
 }

@@ -25,6 +42,7 @@ export interface ModelToolCall {
 // Content blocks for multi-content responses
 export type ContentBlock =
  | { type: 'text'; text: string }
+  | { type: 'image'; source: ImageSource }
  | { type: 'tool_use'; id: string; name: string; input: unknown };

 // Tool result fed back into conversation