feat: add multimodal media pipeline for image support across all providers and channels

Widen Message.content from string to string | MessageContentPart[] to support multimodal content. Add Attachment type to channel layer, media conversion utilities, and image extraction to all channel adapters (Telegram, Discord, Slack, WhatsApp). Update all model clients (Anthropic, OpenAI, Gemini, Bedrock) to convert structured content to provider-specific formats. Fix downstream consumers (tokens, compaction, TUI, local models) to handle the widened type via getMessageText() helper.
2026-02-06 17:17:21 -08:00
parent cfdd448495
commit a515912537
22 changed files with 788 additions and 37 deletions
@@ -5,6 +5,8 @@ import type { ToolRegistry } from '../../tools/registry.js';
 import type { ToolExecutor } from '../../tools/executor.js';
 import type { ToolResult } from '../../tools/types.js';
 import type { ToolPolicyContext } from '../../tools/policy.js';
+import type { Attachment } from '../../channels/types.js';
+import { buildUserMessage, getMessageText } from '../../models/media.js';

 export interface ToolUseEvent {
  type: 'start' | 'end';
@@ -61,8 +63,8 @@ export class NativeAgent {
    return this.session?.getHistory() ?? [...this.inMemoryHistory];
  }

-  async process(userMessage: string): Promise<string> {
-    const userMsg: Message = { role: 'user', content: userMessage };
+  async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
+    const userMsg = buildUserMessage(userMessage, attachments);

    if (this.session) {
      this.session.addMessage(userMsg);
@@ -5,6 +5,7 @@ import type { ToolRegistry } from '../../tools/registry.js';
 import type { ToolExecutor } from '../../tools/executor.js';
 import type { MemoryStore } from '../../memory/store.js';
 import type { ToolPolicyContext } from '../../tools/policy.js';
+import type { Attachment } from '../../channels/types.js';
 import { NativeAgent } from './agent.js';
 import type { ToolUseEvent } from './agent.js';
 import { shouldCompact } from '../../context/tokens.js';
@@ -209,10 +210,10 @@ export class AgentOrchestrator {
   * When compaction is configured, checks whether the conversation history
   * exceeds the context window threshold and compacts it before processing.
   */
-  async process(userMessage: string): Promise<string> {
+  async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
    this._injectMemoryContext();
    await this.compactIfNeeded();
-    return this._agent.process(userMessage);
+    return this._agent.process(userMessage, attachments);
  }

  /**
@@ -10,6 +10,7 @@ import { Client, GatewayIntentBits, Events } from 'discord.js';
 import type { Message as DiscordMessage } from 'discord.js';

 import type {
+  Attachment,
  InboundMessage,
  OutboundMessage,
  ChannelAdapter,
@@ -50,6 +51,20 @@ export class DiscordAdapter implements ChannelAdapter {
    this.config = config;
  }

+  /**
+   * Infer an image MIME type from the file extension of a URL.
+   *
+   * Used as a fallback when an attachment carries no `contentType`. Only the
+   * last path segment is inspected: any query string or fragment (for example
+   * signed CDN parameters such as `image.png?ex=...`) is stripped first so it
+   * cannot hide the extension.
+   *
+   * @param url - Attachment URL, absolute or relative.
+   * @returns The matching MIME type, or `null` when the URL has no extension
+   *   or the extension is not a known image type.
+   */
+  private _inferMimeTypeFromUrl(url: string): string | null {
+    // Drop "?query" and "#fragment" before looking for the extension.
+    const path = url.split(/[?#]/, 1)[0] ?? '';
+    const filename = path.slice(path.lastIndexOf('/') + 1);
+    const dot = filename.lastIndexOf('.');
+    if (dot === -1) return null;
+    const ext = filename.slice(dot + 1).toLowerCase();
+
+    // A Map (rather than a plain object) avoids matching inherited keys
+    // such as "constructor" or "toString".
+    const mimeTypes = new Map<string, string>([
+      ['png', 'image/png'],
+      ['jpg', 'image/jpeg'],
+      ['jpeg', 'image/jpeg'],
+      ['gif', 'image/gif'],
+      ['webp', 'image/webp'],
+      ['svg', 'image/svg+xml'],
+    ]);
+    return mimeTypes.get(ext) ?? null;
+  }
+
  /** Register the inbound message handler. Called by the registry before connect(). */
  onMessage(handler: (msg: InboundMessage) => void): void {
    this.messageHandler = handler;
@@ -159,6 +174,22 @@ export class DiscordAdapter implements ChannelAdapter {
    // Strip bot mention from the message text
    const text = message.content.replace(/<@!?\d+>/g, '').trim();

+    // ── Extract image attachments ──
+    const attachments: Attachment[] = [];
+    if (message.attachments && message.attachments.size > 0) {
+      for (const attachment of message.attachments.values()) {
+        const mimeType = attachment.contentType || this._inferMimeTypeFromUrl(attachment.url);
+        if (mimeType && mimeType.startsWith('image/')) {
+          attachments.push({
+            mimeType,
+            url: attachment.url,
+            filename: attachment.name,
+            size: attachment.size,
+          });
+        }
+      }
+    }
+
    // ── Reset command ──
    if (text === '!reset' || text === 'reset') {
      this.messageHandler({
@@ -180,6 +211,7 @@ export class DiscordAdapter implements ChannelAdapter {
      senderId: message.channelId,
      senderName: message.author.username,
      text,
+      attachments: attachments.length > 0 ? attachments : undefined,
      timestamp: Date.now(),
    });
  }
@@ -3,6 +3,7 @@ export type {
  ChannelStatus,
  InboundMessage,
  OutboundMessage,
+  Attachment,
  ToolStatusEvent,
  MessageHandler,
 } from './types.js';
@@ -8,6 +8,7 @@

 import { App } from '@slack/bolt';
 import type {
+  Attachment,
  InboundMessage,
  OutboundMessage,
  ChannelAdapter,
@@ -35,6 +36,14 @@ interface SlackMessageEvent {
  text?: string;
  bot_id?: string;
  subtype?: string;
+  files?: Array<{
+    id?: string;
+    mimetype?: string;
+    name?: string;
+    size?: number;
+    url_private?: string;
+    url_private_download?: string;
+  }>;
 }

 /**
@@ -160,6 +169,56 @@ export class SlackAdapter implements ChannelAdapter {
    }
  }

+  /**
+   * Fetch the image files attached to a Slack message and return them as
+   * base64-encoded Attachments.
+   *
+   * Files whose mimetype does not start with `image/`, and files without a
+   * download URL, are ignored. Each request is authorised with the bot token.
+   * A failed download (non-OK response or thrown error) is reported with
+   * `console.warn` and skipped, so one bad file never aborts the handler.
+   *
+   * @param files - The `files` array of the Slack message event, if present.
+   * @returns One Attachment per successfully downloaded image, in file order.
+   */
+  private async extractImageAttachments(
+    files?: SlackMessageEvent['files'],
+  ): Promise<Attachment[]> {
+    const collected: Attachment[] = [];
+
+    for (const entry of files ?? []) {
+      const mime = entry.mimetype;
+      // Skip anything that is not an image
+      if (mime == null || !mime.startsWith('image/')) continue;
+
+      const source = entry.url_private_download || entry.url_private;
+      if (!source) continue;
+
+      // Human-readable identifier used in the warnings below
+      const label = entry.name ?? entry.id ?? 'unknown';
+
+      try {
+        const res = await fetch(source, {
+          headers: { Authorization: `Bearer ${this.config.botToken}` },
+        });
+
+        if (res.ok) {
+          const bytes = Buffer.from(await res.arrayBuffer());
+          collected.push({
+            mimeType: mime,
+            data: bytes.toString('base64'),
+            filename: entry.name,
+            size: entry.size,
+          });
+        } else {
+          console.warn(`Slack: failed to download file ${label}: HTTP ${res.status}`);
+        }
+      } catch (error) {
+        const reason = error instanceof Error ? error.message : 'Unknown error';
+        console.warn(`Slack: error downloading file ${label}:`, reason);
+      }
+    }
+
+    return collected;
+  }
+
  /** Internal: process an inbound Slack message event. */
  private async handleMessage(message: SlackMessageEvent): Promise<void> {
    if (!this.messageHandler) return;
@@ -200,6 +259,9 @@ export class SlackAdapter implements ChannelAdapter {
      ? await this.resolveUserName(message.user)
      : undefined;

+    // Extract image attachments from Slack file uploads
+    const attachments = await this.extractImageAttachments(message.files);
+
    // Detect reset command
    if (text === '!reset' || text === 'reset') {
      this.messageHandler({
@@ -210,6 +272,7 @@ export class SlackAdapter implements ChannelAdapter {
        text: '!reset',
        timestamp: Date.now(),
        metadata: { isCommand: true, command: 'reset' },
+        ...(attachments.length > 0 && { attachments }),
      });
      return;
    }
@@ -222,6 +285,7 @@ export class SlackAdapter implements ChannelAdapter {
      senderName,
      text,
      timestamp: Date.now(),
+      ...(attachments.length > 0 && { attachments }),
    });
  }
 }
@@ -2,6 +2,7 @@ import { Bot } from 'grammy';

 import type { HookEngine } from '../../hooks/index.js';
 import type {
+  Attachment,
  InboundMessage,
  OutboundMessage,
  ChannelAdapter,
@@ -44,6 +45,26 @@ export class TelegramAdapter implements ChannelAdapter {
    this.config = config;
  }

+  /**
+   * Resolve a Telegram file ID to the file's contents, base64-encoded.
+   *
+   * Looks the file up through the bot API, then fetches it from the Telegram
+   * file endpoint using the configured bot token.
+   *
+   * @param fileId - Telegram `file_id` of the file to download.
+   * @returns The base64 payload, or `null` when the bot is not available, the
+   *   file has no path, the HTTP response is not OK, or any step throws
+   *   (thrown errors are logged with `console.error`).
+   */
+  private async downloadFileToBase64(fileId: string): Promise<string | null> {
+    try {
+      const meta = await this.bot?.api.getFile(fileId);
+      const remotePath = meta?.file_path;
+      if (!remotePath) return null;
+
+      const res = await fetch(
+        `https://api.telegram.org/file/bot${this.config.botToken}/${remotePath}`,
+      );
+      if (!res.ok) return null;
+
+      const bytes = await res.arrayBuffer();
+      return Buffer.from(bytes).toString('base64');
+    } catch (error) {
+      console.error(`Failed to download file ${fileId}:`, error);
+      return null;
+    }
+  }
+
  /** Register the inbound message handler. Called by the registry before connect(). */
  onMessage(handler: (msg: InboundMessage) => void): void {
    this.messageHandler = handler;
@@ -164,6 +185,84 @@ export class TelegramAdapter implements ChannelAdapter {
      });
    });

+    // ── Photo message handler ──
+
+    this.bot.on('message:photo', async (ctx) => {
+      if (!this.messageHandler) return;
+
+      const photo = ctx.message.photo;
+      if (!photo || photo.length === 0) return;
+
+      const largestPhoto = photo[photo.length - 1];
+
+      await ctx.replyWithChatAction('typing');
+
+      const imageData = await this.downloadFileToBase64(largestPhoto.file_id);
+      if (!imageData) {
+        console.error(`Failed to download photo ${largestPhoto.file_id}`);
+        return;
+      }
+
+      const caption = ctx.message.caption ?? '';
+
+      this.messageHandler({
+        id: String(ctx.message.message_id),
+        channel: 'telegram',
+        senderId: String(ctx.chat.id),
+        senderName: ctx.from?.first_name,
+        text: caption,
+        attachments: [
+          {
+            mimeType: 'image/jpeg',
+            data: imageData,
+            filename: `photo_${largestPhoto.file_unique_id}.jpg`,
+            size: largestPhoto.file_size,
+          },
+        ],
+        timestamp: Date.now(),
+      });
+    });
+
+    // ── Image document handler ──
+
+    this.bot.on('message:document', async (ctx) => {
+      if (!this.messageHandler) return;
+
+      const document = ctx.message.document;
+      if (!document) return;
+
+      const mimeType = document.mime_type ?? '';
+      if (!mimeType.startsWith('image/')) return;
+
+      await ctx.replyWithChatAction('typing');
+
+      const fileData = await this.downloadFileToBase64(document.file_id);
+      if (!fileData) {
+        console.error(`Failed to download document ${document.file_id}`);
+        return;
+      }
+
+      const caption = ctx.message.caption ?? '';
+      const filename = document.file_name ?? document.file_unique_id;
+
+      this.messageHandler({
+        id: String(ctx.message.message_id),
+        channel: 'telegram',
+        senderId: String(ctx.chat.id),
+        senderName: ctx.from?.first_name,
+        text: caption,
+        attachments: [
+          {
+            mimeType,
+            data: fileData,
+            filename,
+            size: document.file_size,
+          },
+        ],
+        timestamp: Date.now(),
+      });
+    });
+
    // ── Start long polling ──

    this.bot.start({
@@ -6,6 +6,20 @@
 * the ChannelAdapter interface to provide a uniform messaging API.
 */

+/**
+ * Media attachment received from or sent to a channel.
+ *
+ * `data` and `url` are both optional; an attachment may carry either, both,
+ * or (as far as the type is concerned) neither. Consumers should check which
+ * one is present before using it.
+ */
+export interface Attachment {
+  /** MIME type (e.g. "image/jpeg", "audio/ogg", "application/pdf"). */
+  mimeType: string;
+  /** Base64-encoded file contents (preferred for model APIs). */
+  data?: string;
+  /** URL from which the attachment can be downloaded (alternative to `data`). */
+  url?: string;
+  /** Original filename, if available. */
+  filename?: string;
+  /** File size in bytes, if known. */
+  size?: number;
+}
+
 /** Inbound message received from a channel platform. */
 export interface InboundMessage {
  /** Platform message ID. */
@@ -18,6 +32,8 @@ export interface InboundMessage {
  senderName?: string;
  /** Message text. */
  text: string;
+  /** Media attachments (images, audio, documents). */
+  attachments?: Attachment[];
  /** ID of message being replied to. */
  replyTo?: string;
  /** Unix ms. */
@@ -9,6 +9,7 @@

 import { Client, LocalAuth } from 'whatsapp-web.js';
 import type {
+  Attachment,
  InboundMessage,
  OutboundMessage,
  ChannelAdapter,
@@ -37,6 +38,12 @@ interface WhatsAppMessage {
  fromMe: boolean;
  author?: string;
  _data?: { notifyName?: string };
+  /** Whether this message contains media (image, video, audio, document). */
+  hasMedia?: boolean;
+  /** Message type (e.g. "image", "video", "chat"). */
+  type?: string;
+  /** Download the media attached to this message. */
+  downloadMedia?: () => Promise<{ mimetype: string; data: string; filename?: string } | null>;
 }

 /**
@@ -149,7 +156,7 @@ export class WhatsAppAdapter implements ChannelAdapter {
  }

  /** Internal: process an inbound WhatsApp message. */
-  private handleMessage(message: WhatsAppMessage): void {
+  private async handleMessage(message: WhatsAppMessage): Promise<void> {
    if (!this.messageHandler) return;

    // Ignore messages from the bot itself
@@ -204,6 +211,26 @@ export class WhatsAppAdapter implements ChannelAdapter {

    const senderName = message._data?.notifyName;

+    // Extract image attachments if the message has media
+    const attachments: Attachment[] = [];
+    if (message.hasMedia) {
+      try {
+        const media = await (message as any).downloadMedia();
+        if (media && typeof media.mimetype === 'string' && media.mimetype.startsWith('image/')) {
+          attachments.push({
+            mimeType: media.mimetype,
+            data: media.data,
+            filename: media.filename,
+          });
+        }
+      } catch (error) {
+        console.error(
+          'Failed to download WhatsApp media:',
+          error instanceof Error ? error.message : 'Unknown error',
+        );
+      }
+    }
+
    // Detect reset command
    if (text === '!reset' || text === 'reset') {
      this.messageHandler({
@@ -214,6 +241,7 @@ export class WhatsAppAdapter implements ChannelAdapter {
        text: '!reset',
        timestamp: Date.now(),
        metadata: { isCommand: true, command: 'reset' },
+        ...(attachments.length > 0 ? { attachments } : {}),
      });
      return;
    }
@@ -226,6 +254,7 @@ export class WhatsAppAdapter implements ChannelAdapter {
      senderName,
      text,
      timestamp: Date.now(),
+      ...(attachments.length > 0 ? { attachments } : {}),
    });
  }
 }
@@ -3,6 +3,7 @@ import type { AgentOrchestrator } from '../backends/native/orchestrator.js';
 import type { MemoryStore } from '../memory/store.js';
 import { COMPACTION_SYSTEM_PROMPT, MEMORY_EXTRACTION_PROMPT } from '../backends/native/prompts.js';
 import { estimateMessageTokens } from './tokens.js';
+import { getMessageText } from '../models/media.js';

 export interface CompactionConfig {
  /** Percentage of context window that triggers compaction (default: 80). */
@@ -52,7 +53,7 @@ export async function compactHistory(opts: {
  const toCompact = messages.slice(0, -keepCount);
  const toKeep = messages.slice(-keepCount);

-  const formattedConversation = toCompact.map((msg) => `${msg.role}: ${msg.content}`).join('\n\n');
+  const formattedConversation = toCompact.map((msg) => `${msg.role}: ${getMessageText(msg)}`).join('\n\n');

  const tier = orchestrator.getDelegationTier('compaction');

@@ -1,4 +1,5 @@
 import type { Message } from '../models/types.js';
+import { getMessageText } from '../models/media.js';

 /**
 * Approximate overhead tokens per message (role marker, separators, etc.).
@@ -43,7 +44,7 @@ export function estimateTokens(text: string): number {
 */
 export function estimateMessageTokens(messages: Message[]): number {
  return messages.reduce(
-    (sum, msg) => sum + estimateTokens(msg.content) + MESSAGE_OVERHEAD_TOKENS,
+    (sum, msg) => sum + estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS,
    0,
  );
 }
@@ -348,7 +348,7 @@ function createMessageRouter(deps: {
    }

    try {
-      const response = await agent.process(msg.text);
+      const response = await agent.process(msg.text, msg.attachments);
      await reply({ text: response, replyTo: msg.id });
    } catch (error) {
      console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
@@ -1,6 +1,7 @@
 import React, { memo } from 'react';
 import { Box, Text, Static } from 'ink';
 import type { Message } from '../../../models/types.js';
+import { getMessageText } from '../../../models/media.js';
 import { renderMarkdown } from '../markdown.js';

 export interface MessageListProps {
@@ -61,8 +62,8 @@ const MessageItem = memo(function MessageItem({
        {/* Content */}
        <Text wrap="wrap">
          {message.role === 'assistant'
-            ? renderMarkdown(message.content)
-            : message.content}
+            ? renderMarkdown(getMessageText(message))
+            : getMessageText(message)}
        </Text>
      </Box>
    </Box>
@@ -1,6 +1,6 @@
 import Anthropic from '@anthropic-ai/sdk';
-import type { Message } from '@anthropic-ai/sdk/resources/messages/messages.js';
-import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from './types.js';
+import type { Message as AnthropicMessage } from '@anthropic-ai/sdk/resources/messages/messages.js';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, Message, MessageContentPart } from './types.js';

 export interface AnthropicClientConfig {
  apiKey?: string;     // Falls back to ANTHROPIC_API_KEY env var
@@ -9,6 +9,42 @@ export interface AnthropicClientConfig {
  maxTokens?: number;
 }

+/**
+ * Convert Flynn message content to the shape expected by the Anthropic
+ * Messages API.
+ *
+ * - Plain strings are returned unchanged.
+ * - Text parts become `{ type: 'text', text }` blocks.
+ * - Image parts become `{ type: 'image', source }` blocks: a base64 source
+ *   when the part is base64-typed and actually carries data, otherwise a URL
+ *   source when a URL is present.
+ * - An image part with no usable payload degrades to a text placeholder
+ *   instead of sending `undefined` data or URL to the API.
+ * - Any other part is passed through untouched.
+ *
+ * @param content - Message content: a string or an array of content parts.
+ * @returns The string itself, or an array of Anthropic content blocks.
+ */
+function toAnthropicContent(content: string | MessageContentPart[]): string | unknown[] {
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  return content.map((part) => {
+    if (part.type === 'text') {
+      return { type: 'text', text: part.text };
+    }
+    if (part.type === 'image') {
+      const { source } = part;
+      if (source.type === 'base64' && source.data) {
+        return {
+          type: 'image',
+          source: {
+            type: 'base64',
+            media_type: source.media_type,
+            data: source.data,
+          },
+        };
+      }
+      // URL-based image (also used when a base64 source is missing its data)
+      if (source.url) {
+        return {
+          type: 'image',
+          source: {
+            type: 'url',
+            url: source.url,
+          },
+        };
+      }
+      // Neither data nor URL: keep the request valid with a text placeholder.
+      return { type: 'text', text: '[Image: unavailable]' };
+    }
+    return part;
+  });
+}
+
 export class AnthropicClient implements ModelClient {
  private client: Anthropic;
  private model: string;
@@ -30,7 +66,7 @@ export class AnthropicClient implements ModelClient {
      system: request.system,
      messages: request.messages.map((m) => ({
        role: m.role,
-        content: m.content,
+        content: toAnthropicContent(m.content),
      })),
    };

@@ -38,7 +74,7 @@ export class AnthropicClient implements ModelClient {
      params.tools = request.tools;
    }

-    const response = await this.client.messages.create(params as unknown as Parameters<typeof this.client.messages.create>[0]) as Message;
+    const response = await this.client.messages.create(params as unknown as Parameters<typeof this.client.messages.create>[0]) as AnthropicMessage;

    const textContent = response.content.find((c) => c.type === 'text');
    const content = textContent?.type === 'text' ? textContent.text : '';
@@ -65,8 +101,8 @@ export class AnthropicClient implements ModelClient {
      system: request.system,
      messages: request.messages.map((m) => ({
        role: m.role,
-        content: m.content,
-      })),
+        content: toAnthropicContent(m.content),
+      })) as Parameters<typeof this.client.messages.stream>[0]['messages'],
    });

    try {
@@ -11,7 +11,7 @@ import type {
  ConverseCommandInput,
  ConverseStreamCommandInput,
 } from '@aws-sdk/client-bedrock-runtime';
-import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js';

 export interface BedrockClientConfig {
  model: string;
@@ -157,11 +157,34 @@ export class BedrockClient implements ModelClient {
  }
 }

-function convertMessages(messages: { role: string; content: string }[]): BedrockMessage[] {
-  return messages.map(m => ({
-    role: m.role === 'assistant' ? 'assistant' as const : 'user' as const,
-    content: [{ text: m.content }] as ContentBlock[],
-  }));
+/**
+ * Convert Flynn messages to Bedrock Converse messages.
+ *
+ * Every role other than `assistant` is sent as `user`. String content becomes
+ * a single text block. For multimodal content, text parts become text blocks
+ * and base64 image parts become image blocks with decoded bytes, provided the
+ * MIME subtype maps to a format Bedrock accepts (`jpeg`, `png`, `gif`,
+ * `webp`; `jpg` is normalised to `jpeg`). Images in any other format, and
+ * images that only have a URL, are replaced by a text placeholder.
+ *
+ * @param messages - Conversation messages in Flynn's format.
+ * @returns The equivalent Bedrock messages, one per input message.
+ */
+function convertMessages(messages: Message[]): BedrockMessage[] {
+  // Image formats accepted by the Converse API image block.
+  const supportedFormats = new Set(['jpeg', 'png', 'gif', 'webp']);
+
+  return messages.map(m => {
+    const role = m.role === 'assistant' ? 'assistant' as const : 'user' as const;
+
+    if (typeof m.content === 'string') {
+      return { role, content: [{ text: m.content }] as ContentBlock[] };
+    }
+
+    // Multimodal content: convert each part
+    const blocks: ContentBlock[] = m.content.map((part: MessageContentPart) => {
+      if (part.type === 'text') {
+        return { text: part.text } as ContentBlock;
+      }
+
+      const { source } = part;
+      // Image part — Bedrock uses { image: { format, source: { bytes } } }
+      if (source.type === 'base64' && source.data) {
+        const subtype = source.media_type.split('/')[1]?.toLowerCase();
+        const format = subtype === 'jpg' ? 'jpeg' : subtype;
+        if (format !== undefined && supportedFormats.has(format)) {
+          return {
+            image: {
+              format,
+              source: { bytes: Uint8Array.from(atob(source.data), c => c.charCodeAt(0)) },
+            },
+          } as unknown as ContentBlock;
+        }
+        // Unsupported subtype (e.g. svg+xml): do not send an invalid format value.
+        return { text: `[Image: unsupported format ${source.media_type}]` } as ContentBlock;
+      }
+      // URL images not natively supported by Bedrock — fall back to text placeholder
+      return { text: `[Image: ${source.url ?? 'unsupported'}]` } as ContentBlock;
+    });
+
+    return { role, content: blocks };
+  });
+}

 function convertTools(tools: ToolDefinition[]): ToolConfiguration {
@@ -1,6 +1,6 @@
 import { GoogleGenerativeAI } from '@google/generative-ai';
-import type { GenerativeModel, Content, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai';
-import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js';
+import type { GenerativeModel, Content, Part, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js';

 export interface GeminiClientConfig {
  apiKey?: string;
@@ -154,12 +154,38 @@ export class GeminiClient implements ModelClient {
  }
 }

-/** Convert Flynn's Message[] to Gemini Content[] format */
-function convertMessages(messages: { role: string; content: string }[]): Content[] {
-  return messages.map(m => ({
-    role: m.role === 'assistant' ? 'model' : 'user',
-    parts: [{ text: m.content }],
-  }));
+/**
+ * Translate Flynn messages into Gemini `Content` entries.
+ *
+ * The `assistant` role maps to `model`; every other role maps to `user`.
+ * String content yields a single text part. Structured content is converted
+ * part by part: text stays text, base64 images become `inlineData`, and
+ * images without base64 data are represented by a text placeholder carrying
+ * the URL (they are not fetched here). Any other part is serialised as JSON
+ * text.
+ */
+function convertMessages(messages: Message[]): Content[] {
+  const toPart = (piece: MessageContentPart): Part => {
+    switch (piece.type) {
+      case 'text':
+        return { text: piece.text };
+      case 'image': {
+        const { source } = piece;
+        if (source.type === 'base64' && source.data) {
+          return { inlineData: { mimeType: source.media_type, data: source.data } };
+        }
+        // No inline bytes available: describe the image instead of attaching it.
+        return { text: `[Image: ${source.url ?? 'unavailable'}]` };
+      }
+      default:
+        return { text: JSON.stringify(piece) };
+    }
+  };
+
+  return messages.map((message) => {
+    const role = message.role === 'assistant' ? 'model' : 'user';
+    const { content } = message;
+    const parts: Part[] = typeof content === 'string' ? [{ text: content }] : content.map(toPart);
+    return { role, parts };
+  });
+}

 /** Convert Flynn's ToolDefinition to Gemini FunctionDeclaration format */
@@ -7,8 +7,17 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
 export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
+export {
+  isSupportedImage,
+  attachmentToImageSource,
+  buildUserMessage,
+  getMessageText,
+  hasImages,
+} from './media.js';
 export type {
  Message,
+  MessageContentPart,
+  ImageSource,
  ChatRequest,
  ChatResponse,
  ChatStreamEvent,
@@ -1,4 +1,5 @@
 import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';
+import { getMessageText } from '../media.js';

 export interface LlamaCppClientConfig {
  endpoint: string;
@@ -40,7 +41,7 @@ export class LlamaCppClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    const headers: Record<string, string> = {
@@ -94,7 +95,7 @@ export class LlamaCppClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    const headers: Record<string, string> = {
@@ -1,5 +1,6 @@
 import { Ollama } from 'ollama';
 import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';
+import { getMessageText } from '../media.js';

 export interface OllamaClientConfig {
  host?: string;
@@ -28,7 +29,7 @@ export class OllamaClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    const response = await this.client.chat({
@@ -57,7 +58,7 @@ export class OllamaClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({ role: msg.role, content: getMessageText(msg) });
    }

    try {
@@ -0,0 +1,261 @@
+import { describe, it, expect } from 'vitest';
+import type { Attachment } from '../channels/types.js';
+import type { Message } from './types.js';
+import {
+  isSupportedImage,
+  attachmentToImageSource,
+  buildUserMessage,
+  getMessageText,
+  hasImages,
+} from './media.js';
+
+// ---------------------------------------------------------------------------
+// Helpers – reusable attachment fixtures
+// ---------------------------------------------------------------------------
+
+function makeAttachment(overrides: Partial<Attachment> & { mimeType: string }): Attachment {
+  return { ...overrides };
+}
+
+const jpegBase64Attachment: Attachment = makeAttachment({
+  mimeType: 'image/jpeg',
+  data: 'aGVsbG8=', // "hello" in base64
+  filename: 'photo.jpg',
+});
+
+const pngUrlAttachment: Attachment = makeAttachment({
+  mimeType: 'image/png',
+  url: 'https://example.com/image.png',
+});
+
+const pdfAttachment: Attachment = makeAttachment({
+  mimeType: 'application/pdf',
+  data: 'cGRm',
+  filename: 'doc.pdf',
+});
+
+// ---------------------------------------------------------------------------
+// 1. isSupportedImage
+// ---------------------------------------------------------------------------
+
+describe('isSupportedImage', () => {
+  // Positive: all four supported MIME types should return true.
+  it.each([
+    'image/jpeg',
+    'image/png',
+    'image/gif',
+    'image/webp',
+  ])('returns true for supported type %s', (mime) => {
+    expect(isSupportedImage(makeAttachment({ mimeType: mime }))).toBe(true);
+  });
+
+  // Negative: unsupported MIME types should return false.
+  it.each([
+    'image/bmp',
+    'application/pdf',
+    'audio/mp3',
+    'text/plain',
+  ])('returns false for unsupported type %s', (mime) => {
+    expect(isSupportedImage(makeAttachment({ mimeType: mime }))).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 2. attachmentToImageSource
+// ---------------------------------------------------------------------------
+
+describe('attachmentToImageSource', () => {
+  // Positive: attachment with base64 data produces a base64 ImageSource.
+  it('returns base64 ImageSource when attachment has data', () => {
+    const result = attachmentToImageSource(jpegBase64Attachment);
+
+    expect(result).toEqual({
+      type: 'base64',
+      media_type: 'image/jpeg',
+      data: 'aGVsbG8=',
+    });
+  });
+
+  // Positive: attachment with url (no data) produces a url ImageSource.
+  it('returns url ImageSource when attachment has url but no data', () => {
+    const result = attachmentToImageSource(pngUrlAttachment);
+
+    expect(result).toEqual({
+      type: 'url',
+      media_type: 'image/png',
+      url: 'https://example.com/image.png',
+    });
+  });
+
+  // Positive: when both data and url are present, base64 is preferred.
+  it('prefers base64 data over url when both are present', () => {
+    const both = makeAttachment({
+      mimeType: 'image/webp',
+      data: 'YWJj',
+      url: 'https://example.com/img.webp',
+    });
+
+    const result = attachmentToImageSource(both);
+
+    expect(result).toEqual({
+      type: 'base64',
+      media_type: 'image/webp',
+      data: 'YWJj',
+    });
+  });
+
+  // Negative: unsupported MIME type returns null.
+  it('returns null for unsupported MIME type', () => {
+    expect(attachmentToImageSource(pdfAttachment)).toBeNull();
+  });
+
+  // Negative: supported MIME but neither data nor url returns null.
+  it('returns null when attachment has neither data nor url', () => {
+    const bare = makeAttachment({ mimeType: 'image/gif' });
+
+    expect(attachmentToImageSource(bare)).toBeNull();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 3. buildUserMessage
+// ---------------------------------------------------------------------------
+
+describe('buildUserMessage', () => {
+  // Positive: plain text message when no attachments argument is provided.
+  it('returns plain string content when no attachments', () => {
+    const msg = buildUserMessage('Hello');
+
+    expect(msg).toEqual({ role: 'user', content: 'Hello' });
+  });
+
+  // Positive: plain text message when attachments is an empty array.
+  it('returns plain string content when attachments is empty array', () => {
+    const msg = buildUserMessage('Hello', []);
+
+    expect(msg).toEqual({ role: 'user', content: 'Hello' });
+  });
+
+  // Positive: plain text message when attachments contain no supported images.
+  it('returns plain string content when no image attachments (PDF only)', () => {
+    const msg = buildUserMessage('See attached', [pdfAttachment]);
+
+    expect(msg).toEqual({ role: 'user', content: 'See attached' });
+  });
+
+  // Positive: multimodal message with text + image parts when image attachment present.
+  it('returns multimodal message with text + image parts', () => {
+    const msg = buildUserMessage('Look at this', [jpegBase64Attachment]);
+
+    expect(msg.role).toBe('user');
+    expect(Array.isArray(msg.content)).toBe(true);
+
+    const parts = msg.content as Array<{ type: string }>;
+    expect(parts).toHaveLength(2);
+    expect(parts[0]).toEqual({ type: 'text', text: 'Look at this' });
+    expect(parts[1]).toEqual({
+      type: 'image',
+      source: { type: 'base64', media_type: 'image/jpeg', data: 'aGVsbG8=' },
+    });
+  });
+
+  // Positive: multimodal message with just image part when text is empty.
+  it('returns multimodal message with just image part when text is empty', () => {
+    const msg = buildUserMessage('', [pngUrlAttachment]);
+
+    expect(msg.role).toBe('user');
+    const parts = msg.content as Array<{ type: string }>;
+    // Empty text is omitted, only image part
+    expect(parts).toHaveLength(1);
+    expect(parts[0]).toEqual({
+      type: 'image',
+      source: { type: 'url', media_type: 'image/png', url: 'https://example.com/image.png' },
+    });
+  });
+
+  // Positive: handles multiple image attachments.
+  it('handles multiple image attachments', () => {
+    const msg = buildUserMessage('Two images', [jpegBase64Attachment, pngUrlAttachment]);
+
+    const parts = msg.content as Array<{ type: string }>;
+    expect(parts).toHaveLength(3); // text + 2 images
+    expect(parts[0]).toEqual({ type: 'text', text: 'Two images' });
+    expect(parts[1]).toMatchObject({ type: 'image' });
+    expect(parts[2]).toMatchObject({ type: 'image' });
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 4. getMessageText
+// ---------------------------------------------------------------------------
+
+describe('getMessageText', () => {
+  // Positive: returns string directly for string content.
+  it('returns string directly for string content messages', () => {
+    const msg: Message = { role: 'user', content: 'plain text' };
+
+    expect(getMessageText(msg)).toBe('plain text');
+  });
+
+  // Positive: extracts and joins text parts from multimodal messages.
+  it('extracts and joins text parts from multimodal messages', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Hello ' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'x' } },
+        { type: 'text', text: 'World' },
+      ],
+    };
+
+    expect(getMessageText(msg)).toBe('Hello World');
+  });
+
+  // Negative: returns empty string for multimodal messages with only image parts.
+  it('returns empty string for multimodal messages with only image parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'url', media_type: 'image/gif', url: 'https://example.com/a.gif' } },
+      ],
+    };
+
+    expect(getMessageText(msg)).toBe('');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 5. hasImages
+// ---------------------------------------------------------------------------
+
+describe('hasImages', () => {
+  // Negative: string content never has images.
+  it('returns false for string content messages', () => {
+    const msg: Message = { role: 'user', content: 'no images here' };
+
+    expect(hasImages(msg)).toBe(false);
+  });
+
+  // Negative: multimodal messages with only text parts have no images.
+  it('returns false for multimodal messages with only text parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [{ type: 'text', text: 'just text' }],
+    };
+
+    expect(hasImages(msg)).toBe(false);
+  });
+
+  // Positive: multimodal messages with image parts are detected.
+  it('returns true for multimodal messages with image parts', () => {
+    const msg: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'caption' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    expect(hasImages(msg)).toBe(true);
+  });
+});
@@ -0,0 +1,101 @@
+/**
+ * Media utilities for converting channel attachments to model content parts.
+ */
+
+import type { Attachment } from '../channels/types.js';
+import type { MessageContentPart, ImageSource, Message } from './types.js';
+
+/** MIME types that vision models generally accept; membership is tested by exact string match. */
+const SUPPORTED_IMAGE_TYPES = new Set([
+  'image/jpeg',
+  'image/png',
+  'image/gif',
+  'image/webp',
+]);
+
+/** Check whether an attachment is a supported image: its mimeType must exactly match an entry of SUPPORTED_IMAGE_TYPES. */
+export function isSupportedImage(attachment: Attachment): boolean {
+  return SUPPORTED_IMAGE_TYPES.has(attachment.mimeType);
+}
+
+/**
+ * Map a channel Attachment onto a model ImageSource.
+ * Inline base64 data takes precedence over a URL. Returns null when the MIME
+ * type is unsupported or the attachment carries neither data nor a URL.
+ */
+export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
+  // Unsupported MIME types are rejected before any payload is inspected.
+  if (!isSupportedImage(attachment)) {
+    return null;
+  }
+
+  const { mimeType, data, url } = attachment;
+
+  // Truthiness checks: an empty-string payload is treated as absent.
+  if (data) {
+    return { type: 'base64', media_type: mimeType, data };
+  }
+
+  if (url) {
+    return { type: 'url', media_type: mimeType, url };
+  }
+
+  return null;
+}
+
+/**
+ * Assemble the user Message for some text plus optional channel attachments.
+ * Attachments that do not convert to an image source are silently dropped.
+ *
+ * @param text - Message text; when empty, multimodal content gets no text part.
+ * @param attachments - Optional channel attachments to turn into image parts.
+ * @returns A plain string Message when no image part results, otherwise a Message
+ *   whose content lists the text part (if any) followed by the image parts.
+ */
+export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
+  const images: MessageContentPart[] = [];
+
+  for (const attachment of attachments || []) {
+    const source = attachmentToImageSource(attachment);
+    if (source) {
+      images.push({ type: 'image', source });
+    }
+  }
+
+  // Nothing usable as an image: keep the plain string form (preserves backward compat).
+  if (images.length === 0) {
+    return { role: 'user', content: text };
+  }
+
+  // Multimodal form: text first (skipped when empty), then the images.
+  const content: MessageContentPart[] = text
+    ? [{ type: 'text', text }, ...images]
+    : images;
+
+  return { role: 'user', content };
+}
+
+/** Return a Message's text: string content as-is, or its text parts joined in order with no separator. */
+export function getMessageText(message: Message): string {
+  if (typeof message.content === 'string') {
+    return message.content;
+  }
+  // Gather text parts in order; non-text parts (e.g. images) contribute nothing.
+  const texts: string[] = [];
+  for (const part of message.content) {
+    if (part.type === 'text') {
+      texts.push(part.text);
+    }
+  }
+  return texts.join('');
+}
+
+/**
+ * Report whether a Message carries at least one image part.
+ * String content never does; array content is scanned for a part of type 'image'.
+ */
+export function hasImages(message: Message): boolean {
+  const { content } = message;
+  // Only array content can hold image parts.
+  return typeof content !== 'string' && content.some((part) => part.type === 'image');
+}
@@ -1,5 +1,5 @@
 import OpenAI from 'openai';
-import type { ChatRequest, ChatResponse, ModelClient } from './types.js';
+import type { ChatRequest, ChatResponse, ModelClient, MessageContentPart } from './types.js';

 export interface OpenAIClientConfig {
  apiKey?: string;
@@ -8,6 +8,31 @@ export interface OpenAIClientConfig {
  baseURL?: string;
 }

+/**
+ * Convert Flynn content to OpenAI format: strings pass through, parts become { type: 'text', text }
+ * or { type: 'image_url', image_url: { url } }. Throws Error if an image part lacks its data/url payload.
+ */
+function toOpenAIContent(content: string | MessageContentPart[]): string | OpenAI.ChatCompletionContentPart[] {
+  if (typeof content === 'string') {
+    return content;
+  }
+  return content.map((part): OpenAI.ChatCompletionContentPart => {
+    if (part.type === 'text') {
+      return { type: 'text', text: part.text };
+    }
+    if (part.type === 'image') {
+      const { type, media_type, data, url } = part.source;
+      const payload = type === 'base64' ? data : url;
+      if (!payload) { // fail fast: would otherwise send "base64,undefined" or an undefined URL
+        throw new Error(`Image source of type "${type}" is missing its payload`);
+      }
+      const imageUrl = type === 'base64' ? `data:${media_type};base64,${payload}` : payload; // data URI or regular URL
+      return { type: 'image_url', image_url: { url: imageUrl } };
+    }
+    return { type: 'text', text: JSON.stringify(part) }; // Fallback — shouldn't happen
+  });
+}
+
 export class OpenAIClient implements ModelClient {
  private client: OpenAI;
  private model: string;
@@ -30,7 +55,10 @@ export class OpenAIClient implements ModelClient {
    }

    for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({
+        role: msg.role,
+        content: toOpenAIContent(msg.content),
+      } as OpenAI.ChatCompletionMessageParam);
    }

    // Build params, conditionally including tools
@@ -1,6 +1,23 @@
+/** Image source for multimodal content blocks: either inline base64 data or a URL, selected by `type`. */
+export interface ImageSource {
+  type: 'base64' | 'url';
+  /** MIME type (e.g. "image/jpeg", "image/png", "image/webp", "image/gif"). */
+  media_type: string;
+  /** Base64-encoded image data (when type === 'base64'); optional in the type, so presence is not enforced. */
+  data?: string;
+  /** Image URL (when type === 'url'); optional in the type, so presence is not enforced. */
+  url?: string;
+}
+
+/** Individual content part within a multimodal message, discriminated by `type`: a text segment or an image with its source. */
+export type MessageContentPart =
+  | { type: 'text'; text: string }
+  | { type: 'image'; source: ImageSource };
+
 export interface Message {
  role: 'user' | 'assistant';
-  content: string;
+  /** Plain string for text-only messages, or an array of MessageContentPart for multimodal content. */
+  content: string | MessageContentPart[];
  timestamp?: number;
 }

@@ -25,6 +42,7 @@ export interface ModelToolCall {
 // Content blocks for multi-content responses
 export type ContentBlock =
  | { type: 'text'; text: string }
+  | { type: 'image'; source: ImageSource }
  | { type: 'tool_use'; id: string; name: string; input: unknown };

 // Tool result fed back into conversation