feat: add multimodal media pipeline for image support across all providers and channels

Widen Message.content from string to string | MessageContentPart[] to support multimodal content. Add Attachment type to channel layer, media conversion utilities, and image extraction to all channel adapters (Telegram, Discord, Slack, WhatsApp). Update all model clients (Anthropic, OpenAI, Gemini, Bedrock) to convert structured content to provider-specific formats. Fix downstream consumers (tokens, compaction, TUI, local models) to handle the widened type via getMessageText() helper.
2026-02-06 17:17:21 -08:00
parent cfdd448495
commit a515912537
22 changed files with 788 additions and 37 deletions
@@ -0,0 +1,101 @@
+/**
+ * Media utilities for converting channel attachments to model content parts.
+ */
+
+import type { Attachment } from '../channels/types.js';
+import type { MessageContentPart, ImageSource, Message } from './types.js';
+
+/**
+ * Image MIME types that vision models generally accept.
+ * Membership is tested with an exact string match on the attachment's MIME type.
+ */
+const SUPPORTED_IMAGE_TYPES: ReadonlySet<string> = new Set<string>([
+  'image/jpeg',
+  'image/png',
+  'image/gif',
+  'image/webp',
+]);
+
+/**
+ * Report whether an attachment's MIME type is one of the supported image types.
+ *
+ * @param attachment - Channel attachment whose `mimeType` is looked up.
+ * @returns `true` when `attachment.mimeType` is a member of `SUPPORTED_IMAGE_TYPES`
+ *   (exact string match), otherwise `false`.
+ */
+export function isSupportedImage(attachment: Attachment): boolean {
+  const { mimeType } = attachment;
+  return SUPPORTED_IMAGE_TYPES.has(mimeType);
+}
+
+/**
+ * Turn a channel Attachment into a model ImageSource.
+ *
+ * Inline base64 `data` takes priority; `url` is only used when no data is present.
+ *
+ * @param attachment - Channel attachment to convert.
+ * @returns A `base64` or `url` image source carrying the attachment's MIME type,
+ *   or `null` when the MIME type is unsupported or neither `data` nor `url` is set.
+ */
+export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
+  // Unsupported MIME types never produce an image source.
+  if (!isSupportedImage(attachment)) {
+    return null;
+  }
+
+  const mediaType = attachment.mimeType;
+
+  const inlineData = attachment.data;
+  if (inlineData) {
+    return { type: 'base64', media_type: mediaType, data: inlineData };
+  }
+
+  const remoteUrl = attachment.url;
+  if (remoteUrl) {
+    return { type: 'url', media_type: mediaType, url: remoteUrl };
+  }
+
+  // Supported type, but nothing to point the model at.
+  return null;
+}
+
+/**
+ * Assemble a user Message from text plus optional channel attachments.
+ *
+ * Attachments that `attachmentToImageSource` cannot convert are skipped.
+ *
+ * @param text - The user's text; omitted from structured content when empty.
+ * @param attachments - Optional channel attachments to convert into image parts.
+ * @returns A message whose `content` is the plain `text` string when no image
+ *   parts were produced, otherwise an array with the text part (if any) first,
+ *   followed by the image parts in attachment order.
+ */
+export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
+  const imageParts: MessageContentPart[] = [];
+  for (const attachment of attachments ?? []) {
+    const source = attachmentToImageSource(attachment);
+    if (source) {
+      imageParts.push({ type: 'image', source });
+    }
+  }
+
+  if (imageParts.length > 0) {
+    // Multimodal content: text (when non-empty) leads, images follow.
+    const textParts: MessageContentPart[] = text ? [{ type: 'text', text }] : [];
+    return { role: 'user', content: [...textParts, ...imageParts] };
+  }
+
+  // No usable images: keep the plain-string form for backward compatibility.
+  return { role: 'user', content: text };
+}
+
+/**
+ * Return the textual content of a Message, whichever content format it uses.
+ *
+ * @param message - Message whose `content` is either a string or an array of parts.
+ * @returns The string itself for string content; for array content, the `text`
+ *   of every `text` part joined in order with no separator (non-text parts are ignored).
+ */
+export function getMessageText(message: Message): string {
+  const { content } = message;
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  const textParts = content.filter(
+    (part): part is { type: 'text'; text: string } => part.type === 'text',
+  );
+  const pieces = textParts.map((part) => part.text);
+  return pieces.join('');
+}
+
+/**
+ * Tell whether a message carries at least one image content part.
+ *
+ * @param message - Message to inspect.
+ * @returns `false` for plain-string content; for array content, `true` when any
+ *   part has `type === 'image'`.
+ */
+export function hasImages(message: Message): boolean {
+  const { content } = message;
+  return typeof content !== 'string' && content.some((part) => part.type === 'image');
+}