diff --git a/src/backends/native/agent.ts b/src/backends/native/agent.ts index 7c75766..f1f3f6f 100644 --- a/src/backends/native/agent.ts +++ b/src/backends/native/agent.ts @@ -5,6 +5,8 @@ import type { ToolRegistry } from '../../tools/registry.js'; import type { ToolExecutor } from '../../tools/executor.js'; import type { ToolResult } from '../../tools/types.js'; import type { ToolPolicyContext } from '../../tools/policy.js'; +import type { Attachment } from '../../channels/types.js'; +import { buildUserMessage, getMessageText } from '../../models/media.js'; export interface ToolUseEvent { type: 'start' | 'end'; @@ -61,8 +63,8 @@ export class NativeAgent { return this.session?.getHistory() ?? [...this.inMemoryHistory]; } - async process(userMessage: string): Promise { - const userMsg: Message = { role: 'user', content: userMessage }; + async process(userMessage: string, attachments?: Attachment[]): Promise { + const userMsg = buildUserMessage(userMessage, attachments); if (this.session) { this.session.addMessage(userMsg); diff --git a/src/backends/native/orchestrator.ts b/src/backends/native/orchestrator.ts index 7392719..9d52338 100644 --- a/src/backends/native/orchestrator.ts +++ b/src/backends/native/orchestrator.ts @@ -5,6 +5,7 @@ import type { ToolRegistry } from '../../tools/registry.js'; import type { ToolExecutor } from '../../tools/executor.js'; import type { MemoryStore } from '../../memory/store.js'; import type { ToolPolicyContext } from '../../tools/policy.js'; +import type { Attachment } from '../../channels/types.js'; import { NativeAgent } from './agent.js'; import type { ToolUseEvent } from './agent.js'; import { shouldCompact } from '../../context/tokens.js'; @@ -209,10 +210,10 @@ export class AgentOrchestrator { * When compaction is configured, checks whether the conversation history * exceeds the context window threshold and compacts it before processing. 
*/ - async process(userMessage: string): Promise { + async process(userMessage: string, attachments?: Attachment[]): Promise { this._injectMemoryContext(); await this.compactIfNeeded(); - return this._agent.process(userMessage); + return this._agent.process(userMessage, attachments); } /** diff --git a/src/channels/discord/adapter.ts b/src/channels/discord/adapter.ts index 72e5484..48f2fc0 100644 --- a/src/channels/discord/adapter.ts +++ b/src/channels/discord/adapter.ts @@ -10,6 +10,7 @@ import { Client, GatewayIntentBits, Events } from 'discord.js'; import type { Message as DiscordMessage } from 'discord.js'; import type { + Attachment, InboundMessage, OutboundMessage, ChannelAdapter, @@ -50,6 +51,20 @@ export class DiscordAdapter implements ChannelAdapter { this.config = config; } + /** Infer MIME type from URL if contentType is not provided. */ + private _inferMimeTypeFromUrl(url: string): string | null { + const ext = url.split('.').pop()?.toLowerCase(); + const mimeTypes: Record = { + png: 'image/png', + jpg: 'image/jpeg', + jpeg: 'image/jpeg', + gif: 'image/gif', + webp: 'image/webp', + svg: 'image/svg+xml', + }; + return mimeTypes[ext || ''] || null; + } + /** Register the inbound message handler. Called by the registry before connect(). 
*/ onMessage(handler: (msg: InboundMessage) => void): void { this.messageHandler = handler; @@ -159,6 +174,22 @@ export class DiscordAdapter implements ChannelAdapter { // Strip bot mention from the message text const text = message.content.replace(/<@!?\d+>/g, '').trim(); + // ── Extract image attachments ── + const attachments: Attachment[] = []; + if (message.attachments && message.attachments.size > 0) { + for (const attachment of message.attachments.values()) { + const mimeType = attachment.contentType || this._inferMimeTypeFromUrl(attachment.url); + if (mimeType && mimeType.startsWith('image/')) { + attachments.push({ + mimeType, + url: attachment.url, + filename: attachment.name, + size: attachment.size, + }); + } + } + } + // ── Reset command ── if (text === '!reset' || text === 'reset') { this.messageHandler({ @@ -180,6 +211,7 @@ export class DiscordAdapter implements ChannelAdapter { senderId: message.channelId, senderName: message.author.username, text, + attachments: attachments.length > 0 ? 
attachments : undefined, timestamp: Date.now(), }); } diff --git a/src/channels/index.ts b/src/channels/index.ts index 0608b21..a397ec2 100644 --- a/src/channels/index.ts +++ b/src/channels/index.ts @@ -3,6 +3,7 @@ export type { ChannelStatus, InboundMessage, OutboundMessage, + Attachment, ToolStatusEvent, MessageHandler, } from './types.js'; diff --git a/src/channels/slack/adapter.ts b/src/channels/slack/adapter.ts index 203860a..1d1f641 100644 --- a/src/channels/slack/adapter.ts +++ b/src/channels/slack/adapter.ts @@ -8,6 +8,7 @@ import { App } from '@slack/bolt'; import type { + Attachment, InboundMessage, OutboundMessage, ChannelAdapter, @@ -35,6 +36,14 @@ interface SlackMessageEvent { text?: string; bot_id?: string; subtype?: string; + files?: Array<{ + id?: string; + mimetype?: string; + name?: string; + size?: number; + url_private?: string; + url_private_download?: string; + }>; } /** @@ -160,6 +169,56 @@ export class SlackAdapter implements ChannelAdapter { } } + /** + * Download image files from a Slack message and convert to base64 Attachments. + * Non-image files are skipped. Download errors are logged but don't crash the handler. + */ + private async extractImageAttachments( + files?: SlackMessageEvent['files'], + ): Promise { + if (!files || files.length === 0) return []; + + const attachments: Attachment[] = []; + + for (const file of files) { + // Only process image files + if (!file.mimetype?.startsWith('image/')) continue; + + const downloadUrl = file.url_private_download || file.url_private; + if (!downloadUrl) continue; + + try { + const response = await fetch(downloadUrl, { + headers: { Authorization: `Bearer ${this.config.botToken}` }, + }); + + if (!response.ok) { + console.warn( + `Slack: failed to download file ${file.name ?? file.id ?? 
'unknown'}: HTTP ${response.status}`, + ); + continue; + } + + const arrayBuffer = await response.arrayBuffer(); + const base64 = Buffer.from(arrayBuffer).toString('base64'); + + attachments.push({ + mimeType: file.mimetype, + data: base64, + filename: file.name, + size: file.size, + }); + } catch (error) { + console.warn( + `Slack: error downloading file ${file.name ?? file.id ?? 'unknown'}:`, + error instanceof Error ? error.message : 'Unknown error', + ); + } + } + + return attachments; + } + /** Internal: process an inbound Slack message event. */ private async handleMessage(message: SlackMessageEvent): Promise { if (!this.messageHandler) return; @@ -200,6 +259,9 @@ export class SlackAdapter implements ChannelAdapter { ? await this.resolveUserName(message.user) : undefined; + // Extract image attachments from Slack file uploads + const attachments = await this.extractImageAttachments(message.files); + // Detect reset command if (text === '!reset' || text === 'reset') { this.messageHandler({ @@ -210,6 +272,7 @@ export class SlackAdapter implements ChannelAdapter { text: '!reset', timestamp: Date.now(), metadata: { isCommand: true, command: 'reset' }, + ...(attachments.length > 0 && { attachments }), }); return; } @@ -222,6 +285,7 @@ export class SlackAdapter implements ChannelAdapter { senderName, text, timestamp: Date.now(), + ...(attachments.length > 0 && { attachments }), }); } } diff --git a/src/channels/telegram/adapter.ts b/src/channels/telegram/adapter.ts index f46d30c..dec5739 100644 --- a/src/channels/telegram/adapter.ts +++ b/src/channels/telegram/adapter.ts @@ -2,6 +2,7 @@ import { Bot } from 'grammy'; import type { HookEngine } from '../../hooks/index.js'; import type { + Attachment, InboundMessage, OutboundMessage, ChannelAdapter, @@ -44,6 +45,26 @@ export class TelegramAdapter implements ChannelAdapter { this.config = config; } + /** Download a file from Telegram API and convert to base64. 
*/ + private async downloadFileToBase64(fileId: string): Promise { + try { + const file = await this.bot?.api.getFile(fileId); + if (!file || !file.file_path) return null; + + const token = this.config.botToken; + const url = `https://api.telegram.org/file/bot${token}/${file.file_path}`; + + const response = await fetch(url); + if (!response.ok) return null; + + const buffer = Buffer.from(await response.arrayBuffer()); + return buffer.toString('base64'); + } catch (error) { + console.error(`Failed to download file ${fileId}:`, error); + return null; + } + } + /** Register the inbound message handler. Called by the registry before connect(). */ onMessage(handler: (msg: InboundMessage) => void): void { this.messageHandler = handler; @@ -164,6 +185,84 @@ export class TelegramAdapter implements ChannelAdapter { }); }); + // ── Photo message handler ── + + this.bot.on('message:photo', async (ctx) => { + if (!this.messageHandler) return; + + const photo = ctx.message.photo; + if (!photo || photo.length === 0) return; + + const largestPhoto = photo[photo.length - 1]; + + await ctx.replyWithChatAction('typing'); + + const imageData = await this.downloadFileToBase64(largestPhoto.file_id); + if (!imageData) { + console.error(`Failed to download photo ${largestPhoto.file_id}`); + return; + } + + const caption = ctx.message.caption ?? ''; + + this.messageHandler({ + id: String(ctx.message.message_id), + channel: 'telegram', + senderId: String(ctx.chat.id), + senderName: ctx.from?.first_name, + text: caption, + attachments: [ + { + mimeType: 'image/jpeg', + data: imageData, + filename: `photo_${largestPhoto.file_unique_id}.jpg`, + size: largestPhoto.file_size, + }, + ], + timestamp: Date.now(), + }); + }); + + // ── Image document handler ── + + this.bot.on('message:document', async (ctx) => { + if (!this.messageHandler) return; + + const document = ctx.message.document; + if (!document) return; + + const mimeType = document.mime_type ?? 
''; + if (!mimeType.startsWith('image/')) return; + + await ctx.replyWithChatAction('typing'); + + const fileData = await this.downloadFileToBase64(document.file_id); + if (!fileData) { + console.error(`Failed to download document ${document.file_id}`); + return; + } + + const caption = ctx.message.caption ?? ''; + const filename = document.file_name ?? document.file_unique_id; + + this.messageHandler({ + id: String(ctx.message.message_id), + channel: 'telegram', + senderId: String(ctx.chat.id), + senderName: ctx.from?.first_name, + text: caption, + attachments: [ + { + mimeType, + data: fileData, + filename, + size: document.file_size, + }, + ], + timestamp: Date.now(), + }); + }); + // ── Start long polling ── this.bot.start({ diff --git a/src/channels/types.ts b/src/channels/types.ts index f8ee818..3eebb5b 100644 --- a/src/channels/types.ts +++ b/src/channels/types.ts @@ -6,6 +6,20 @@ * the ChannelAdapter interface to provide a uniform messaging API. */ +/** Media attachment received from or sent to a channel. */ +export interface Attachment { + /** MIME type (e.g. "image/jpeg", "audio/ogg", "application/pdf"). */ + mimeType: string; + /** Base64-encoded data (preferred for model APIs). */ + data?: string; + /** URL to download the attachment (alternative to data). */ + url?: string; + /** Original filename, if available. */ + filename?: string; + /** File size in bytes, if known. */ + size?: number; +} + /** Inbound message received from a channel platform. */ export interface InboundMessage { /** Platform message ID. */ @@ -18,6 +32,8 @@ export interface InboundMessage { senderName?: string; /** Message text. */ text: string; + /** Media attachments (images, audio, documents). */ + attachments?: Attachment[]; /** ID of message being replied to. */ replyTo?: string; /** Unix ms. 
*/ diff --git a/src/channels/whatsapp/adapter.ts b/src/channels/whatsapp/adapter.ts index 62c05a7..b7a7221 100644 --- a/src/channels/whatsapp/adapter.ts +++ b/src/channels/whatsapp/adapter.ts @@ -9,6 +9,7 @@ import { Client, LocalAuth } from 'whatsapp-web.js'; import type { + Attachment, InboundMessage, OutboundMessage, ChannelAdapter, @@ -37,6 +38,12 @@ interface WhatsAppMessage { fromMe: boolean; author?: string; _data?: { notifyName?: string }; + /** Whether this message contains media (image, video, audio, document). */ + hasMedia?: boolean; + /** Message type (e.g. "image", "video", "chat"). */ + type?: string; + /** Download the media attached to this message. */ + downloadMedia?: () => Promise<{ mimetype: string; data: string; filename?: string } | null>; } /** @@ -149,7 +156,7 @@ export class WhatsAppAdapter implements ChannelAdapter { } /** Internal: process an inbound WhatsApp message. */ - private handleMessage(message: WhatsAppMessage): void { + private async handleMessage(message: WhatsAppMessage): Promise { if (!this.messageHandler) return; // Ignore messages from the bot itself @@ -204,6 +211,26 @@ export class WhatsAppAdapter implements ChannelAdapter { const senderName = message._data?.notifyName; + // Extract image attachments if the message has media + const attachments: Attachment[] = []; + if (message.hasMedia) { + try { + const media = await (message as any).downloadMedia(); + if (media && typeof media.mimetype === 'string' && media.mimetype.startsWith('image/')) { + attachments.push({ + mimeType: media.mimetype, + data: media.data, + filename: media.filename, + }); + } + } catch (error) { + console.error( + 'Failed to download WhatsApp media:', + error instanceof Error ? 
error.message : 'Unknown error', + ); + } + } + // Detect reset command if (text === '!reset' || text === 'reset') { this.messageHandler({ @@ -214,6 +241,7 @@ export class WhatsAppAdapter implements ChannelAdapter { text: '!reset', timestamp: Date.now(), metadata: { isCommand: true, command: 'reset' }, + ...(attachments.length > 0 ? { attachments } : {}), }); return; } @@ -226,6 +254,7 @@ export class WhatsAppAdapter implements ChannelAdapter { senderName, text, timestamp: Date.now(), + ...(attachments.length > 0 ? { attachments } : {}), }); } } diff --git a/src/context/compaction.ts b/src/context/compaction.ts index 8fcbd03..441cf17 100644 --- a/src/context/compaction.ts +++ b/src/context/compaction.ts @@ -3,6 +3,7 @@ import type { AgentOrchestrator } from '../backends/native/orchestrator.js'; import type { MemoryStore } from '../memory/store.js'; import { COMPACTION_SYSTEM_PROMPT, MEMORY_EXTRACTION_PROMPT } from '../backends/native/prompts.js'; import { estimateMessageTokens } from './tokens.js'; +import { getMessageText } from '../models/media.js'; export interface CompactionConfig { /** Percentage of context window that triggers compaction (default: 80). */ @@ -52,7 +53,7 @@ export async function compactHistory(opts: { const toCompact = messages.slice(0, -keepCount); const toKeep = messages.slice(-keepCount); - const formattedConversation = toCompact.map((msg) => `${msg.role}: ${msg.content}`).join('\n\n'); + const formattedConversation = toCompact.map((msg) => `${msg.role}: ${getMessageText(msg)}`).join('\n\n'); const tier = orchestrator.getDelegationTier('compaction'); diff --git a/src/context/tokens.ts b/src/context/tokens.ts index 9bf278d..b3358a6 100644 --- a/src/context/tokens.ts +++ b/src/context/tokens.ts @@ -1,4 +1,5 @@ import type { Message } from '../models/types.js'; +import { getMessageText } from '../models/media.js'; /** * Approximate overhead tokens per message (role marker, separators, etc.). 
@@ -43,7 +44,7 @@ export function estimateTokens(text: string): number { */ export function estimateMessageTokens(messages: Message[]): number { return messages.reduce( - (sum, msg) => sum + estimateTokens(msg.content) + MESSAGE_OVERHEAD_TOKENS, + (sum, msg) => sum + estimateTokens(getMessageText(msg)) + MESSAGE_OVERHEAD_TOKENS, 0, ); } diff --git a/src/daemon/index.ts b/src/daemon/index.ts index dc2dbb0..8141f6c 100644 --- a/src/daemon/index.ts +++ b/src/daemon/index.ts @@ -348,7 +348,7 @@ function createMessageRouter(deps: { } try { - const response = await agent.process(msg.text); + const response = await agent.process(msg.text, msg.attachments); await reply({ text: response, replyTo: msg.id }); } catch (error) { console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error); diff --git a/src/frontends/tui/components/MessageList.tsx b/src/frontends/tui/components/MessageList.tsx index ad2518d..517f1c1 100644 --- a/src/frontends/tui/components/MessageList.tsx +++ b/src/frontends/tui/components/MessageList.tsx @@ -1,6 +1,7 @@ import React, { memo } from 'react'; import { Box, Text, Static } from 'ink'; import type { Message } from '../../../models/types.js'; +import { getMessageText } from '../../../models/media.js'; import { renderMarkdown } from '../markdown.js'; export interface MessageListProps { @@ -61,8 +62,8 @@ const MessageItem = memo(function MessageItem({ {/* Content */} {message.role === 'assistant' - ? renderMarkdown(message.content) - : message.content} + ? 
renderMarkdown(getMessageText(message)) + : getMessageText(message)} diff --git a/src/models/anthropic.ts b/src/models/anthropic.ts index a92bc86..08aac81 100644 --- a/src/models/anthropic.ts +++ b/src/models/anthropic.ts @@ -1,6 +1,6 @@ import Anthropic from '@anthropic-ai/sdk'; -import type { Message } from '@anthropic-ai/sdk/resources/messages/messages.js'; -import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from './types.js'; +import type { Message as AnthropicMessage } from '@anthropic-ai/sdk/resources/messages/messages.js'; +import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, Message, MessageContentPart } from './types.js'; export interface AnthropicClientConfig { apiKey?: string; // Falls back to ANTHROPIC_API_KEY env var @@ -9,6 +9,42 @@ export interface AnthropicClientConfig { maxTokens?: number; } +/** + * Convert Flynn MessageContentPart[] to Anthropic ContentBlockParam format. + */ +function toAnthropicContent(content: string | MessageContentPart[]): string | unknown[] { + if (typeof content === 'string') { + return content; + } + + return content.map(part => { + if (part.type === 'text') { + return { type: 'text', text: part.text }; + } + if (part.type === 'image') { + if (part.source.type === 'base64') { + return { + type: 'image', + source: { + type: 'base64', + media_type: part.source.media_type, + data: part.source.data!, + }, + }; + } + // URL-based image + return { + type: 'image', + source: { + type: 'url', + url: part.source.url!, + }, + }; + } + return part; + }); +} + export class AnthropicClient implements ModelClient { private client: Anthropic; private model: string; @@ -30,7 +66,7 @@ export class AnthropicClient implements ModelClient { system: request.system, messages: request.messages.map((m) => ({ role: m.role, - content: m.content, + content: toAnthropicContent(m.content), })), }; @@ -38,7 +74,7 @@ export class AnthropicClient implements ModelClient { params.tools = request.tools; } - const 
response = await this.client.messages.create(params as unknown as Parameters[0]) as Message; + const response = await this.client.messages.create(params as unknown as Parameters[0]) as AnthropicMessage; const textContent = response.content.find((c) => c.type === 'text'); const content = textContent?.type === 'text' ? textContent.text : ''; @@ -65,8 +101,8 @@ export class AnthropicClient implements ModelClient { system: request.system, messages: request.messages.map((m) => ({ role: m.role, - content: m.content, - })), + content: toAnthropicContent(m.content), + })) as Parameters[0]['messages'], }); try { diff --git a/src/models/bedrock.ts b/src/models/bedrock.ts index 9fd2cb8..0f1001f 100644 --- a/src/models/bedrock.ts +++ b/src/models/bedrock.ts @@ -11,7 +11,7 @@ import type { ConverseCommandInput, ConverseStreamCommandInput, } from '@aws-sdk/client-bedrock-runtime'; -import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js'; +import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js'; export interface BedrockClientConfig { model: string; @@ -157,11 +157,34 @@ export class BedrockClient implements ModelClient { } } -function convertMessages(messages: { role: string; content: string }[]): BedrockMessage[] { - return messages.map(m => ({ - role: m.role === 'assistant' ? 'assistant' as const : 'user' as const, - content: [{ text: m.content }] as ContentBlock[], - })); +function convertMessages(messages: Message[]): BedrockMessage[] { + return messages.map(m => { + const role = m.role === 'assistant' ? 
'assistant' as const : 'user' as const; + + if (typeof m.content === 'string') { + return { role, content: [{ text: m.content }] as ContentBlock[] }; + } + + // Multimodal content: convert each part + const blocks: ContentBlock[] = m.content.map((part: MessageContentPart) => { + if (part.type === 'text') { + return { text: part.text } as ContentBlock; + } + // Image part — Bedrock uses { image: { format, source: { bytes } } } + if (part.source.type === 'base64' && part.source.data) { + return { + image: { + format: part.source.media_type.split('/')[1] as 'jpeg' | 'png' | 'gif' | 'webp', + source: { bytes: Uint8Array.from(atob(part.source.data), c => c.charCodeAt(0)) }, + }, + } as unknown as ContentBlock; + } + // URL images not natively supported by Bedrock — fall back to text placeholder + return { text: `[Image: ${part.source.url ?? 'unsupported'}]` } as ContentBlock; + }); + + return { role, content: blocks }; + }); } function convertTools(tools: ToolDefinition[]): ToolConfiguration { diff --git a/src/models/gemini.ts b/src/models/gemini.ts index ccb58ce..474d835 100644 --- a/src/models/gemini.ts +++ b/src/models/gemini.ts @@ -1,6 +1,6 @@ import { GoogleGenerativeAI } from '@google/generative-ai'; -import type { GenerativeModel, Content, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai'; -import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js'; +import type { GenerativeModel, Content, Part, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai'; +import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js'; export interface GeminiClientConfig { apiKey?: string; @@ -154,12 +154,38 @@ export class GeminiClient implements ModelClient { } } -/** Convert Flynn's Message[] to Gemini Content[] format */ -function convertMessages(messages: { role: string; content: 
string }[]): Content[] { - return messages.map(m => ({ - role: m.role === 'assistant' ? 'model' : 'user', - parts: [{ text: m.content }], - })); +/** Convert Flynn's Message[] to Gemini Content[] format, including multimodal parts */ +function convertMessages(messages: Message[]): Content[] { + return messages.map(m => { + const role = m.role === 'assistant' ? 'model' : 'user'; + + if (typeof m.content === 'string') { + return { role, parts: [{ text: m.content }] }; + } + + // Multimodal content — convert each part + const parts: Part[] = m.content.map(part => { + if (part.type === 'text') { + return { text: part.text }; + } + if (part.type === 'image') { + if (part.source.type === 'base64' && part.source.data) { + return { + inlineData: { + mimeType: part.source.media_type, + data: part.source.data, + }, + }; + } + // URL-based images — Gemini doesn't natively support URL refs in inline data, + // so we pass as a text description. In production, you'd want to fetch + base64 encode. + return { text: `[Image: ${part.source.url ?? 
'unavailable'}]` }; + } + return { text: JSON.stringify(part) }; + }); + + return { role, parts }; + }); } /** Convert Flynn's ToolDefinition to Gemini FunctionDeclaration format */ diff --git a/src/models/index.ts b/src/models/index.ts index 0dc8583..d53e278 100644 --- a/src/models/index.ts +++ b/src/models/index.ts @@ -7,8 +7,17 @@ export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js'; export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js'; export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js'; export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js'; +export { + isSupportedImage, + attachmentToImageSource, + buildUserMessage, + getMessageText, + hasImages, +} from './media.js'; export type { Message, + MessageContentPart, + ImageSource, ChatRequest, ChatResponse, ChatStreamEvent, diff --git a/src/models/local/llamacpp.ts b/src/models/local/llamacpp.ts index 6665d1d..d3c50bf 100644 --- a/src/models/local/llamacpp.ts +++ b/src/models/local/llamacpp.ts @@ -1,4 +1,5 @@ import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js'; +import { getMessageText } from '../media.js'; export interface LlamaCppClientConfig { endpoint: string; @@ -40,7 +41,7 @@ export class LlamaCppClient implements ModelClient { } for (const msg of request.messages) { - messages.push({ role: msg.role, content: msg.content }); + messages.push({ role: msg.role, content: getMessageText(msg) }); } const headers: Record = { @@ -94,7 +95,7 @@ export class LlamaCppClient implements ModelClient { } for (const msg of request.messages) { - messages.push({ role: msg.role, content: msg.content }); + messages.push({ role: msg.role, content: getMessageText(msg) }); } const headers: Record = { diff --git a/src/models/local/ollama.ts b/src/models/local/ollama.ts index 74e5d0d..bce5cdd 100644 --- a/src/models/local/ollama.ts +++ b/src/models/local/ollama.ts @@ -1,5 +1,6 @@ import { 
Ollama } from 'ollama'; import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js'; +import { getMessageText } from '../media.js'; export interface OllamaClientConfig { host?: string; @@ -28,7 +29,7 @@ export class OllamaClient implements ModelClient { } for (const msg of request.messages) { - messages.push({ role: msg.role, content: msg.content }); + messages.push({ role: msg.role, content: getMessageText(msg) }); } const response = await this.client.chat({ @@ -57,7 +58,7 @@ export class OllamaClient implements ModelClient { } for (const msg of request.messages) { - messages.push({ role: msg.role, content: msg.content }); + messages.push({ role: msg.role, content: getMessageText(msg) }); } try { diff --git a/src/models/media.test.ts b/src/models/media.test.ts new file mode 100644 index 0000000..8ac73d2 --- /dev/null +++ b/src/models/media.test.ts @@ -0,0 +1,261 @@ +import { describe, it, expect } from 'vitest'; +import type { Attachment } from '../channels/types.js'; +import type { Message } from './types.js'; +import { + isSupportedImage, + attachmentToImageSource, + buildUserMessage, + getMessageText, + hasImages, +} from './media.js'; + +// --------------------------------------------------------------------------- +// Helpers – reusable attachment fixtures +// --------------------------------------------------------------------------- + +function makeAttachment(overrides: Partial & { mimeType: string }): Attachment { + return { ...overrides }; +} + +const jpegBase64Attachment: Attachment = makeAttachment({ + mimeType: 'image/jpeg', + data: 'aGVsbG8=', // "hello" in base64 + filename: 'photo.jpg', +}); + +const pngUrlAttachment: Attachment = makeAttachment({ + mimeType: 'image/png', + url: 'https://example.com/image.png', +}); + +const pdfAttachment: Attachment = makeAttachment({ + mimeType: 'application/pdf', + data: 'cGRm', + filename: 'doc.pdf', +}); + +// 
--------------------------------------------------------------------------- +// 1. isSupportedImage +// --------------------------------------------------------------------------- + +describe('isSupportedImage', () => { + // Positive: all four supported MIME types should return true. + it.each([ + 'image/jpeg', + 'image/png', + 'image/gif', + 'image/webp', + ])('returns true for supported type %s', (mime) => { + expect(isSupportedImage(makeAttachment({ mimeType: mime }))).toBe(true); + }); + + // Negative: unsupported MIME types should return false. + it.each([ + 'image/bmp', + 'application/pdf', + 'audio/mp3', + 'text/plain', + ])('returns false for unsupported type %s', (mime) => { + expect(isSupportedImage(makeAttachment({ mimeType: mime }))).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// 2. attachmentToImageSource +// --------------------------------------------------------------------------- + +describe('attachmentToImageSource', () => { + // Positive: attachment with base64 data produces a base64 ImageSource. + it('returns base64 ImageSource when attachment has data', () => { + const result = attachmentToImageSource(jpegBase64Attachment); + + expect(result).toEqual({ + type: 'base64', + media_type: 'image/jpeg', + data: 'aGVsbG8=', + }); + }); + + // Positive: attachment with url (no data) produces a url ImageSource. + it('returns url ImageSource when attachment has url but no data', () => { + const result = attachmentToImageSource(pngUrlAttachment); + + expect(result).toEqual({ + type: 'url', + media_type: 'image/png', + url: 'https://example.com/image.png', + }); + }); + + // Positive: when both data and url are present, base64 is preferred. 
+ it('prefers base64 data over url when both are present', () => { + const both = makeAttachment({ + mimeType: 'image/webp', + data: 'YWJj', + url: 'https://example.com/img.webp', + }); + + const result = attachmentToImageSource(both); + + expect(result).toEqual({ + type: 'base64', + media_type: 'image/webp', + data: 'YWJj', + }); + }); + + // Negative: unsupported MIME type returns null. + it('returns null for unsupported MIME type', () => { + expect(attachmentToImageSource(pdfAttachment)).toBeNull(); + }); + + // Negative: supported MIME but neither data nor url returns null. + it('returns null when attachment has neither data nor url', () => { + const bare = makeAttachment({ mimeType: 'image/gif' }); + + expect(attachmentToImageSource(bare)).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// 3. buildUserMessage +// --------------------------------------------------------------------------- + +describe('buildUserMessage', () => { + // Positive: plain text message when no attachments argument is provided. + it('returns plain string content when no attachments', () => { + const msg = buildUserMessage('Hello'); + + expect(msg).toEqual({ role: 'user', content: 'Hello' }); + }); + + // Positive: plain text message when attachments is an empty array. + it('returns plain string content when attachments is empty array', () => { + const msg = buildUserMessage('Hello', []); + + expect(msg).toEqual({ role: 'user', content: 'Hello' }); + }); + + // Positive: plain text message when attachments contain no supported images. + it('returns plain string content when no image attachments (PDF only)', () => { + const msg = buildUserMessage('See attached', [pdfAttachment]); + + expect(msg).toEqual({ role: 'user', content: 'See attached' }); + }); + + // Positive: multimodal message with text + image parts when image attachment present. 
+ it('returns multimodal message with text + image parts', () => { + const msg = buildUserMessage('Look at this', [jpegBase64Attachment]); + + expect(msg.role).toBe('user'); + expect(Array.isArray(msg.content)).toBe(true); + + const parts = msg.content as Array<{ type: string }>; + expect(parts).toHaveLength(2); + expect(parts[0]).toEqual({ type: 'text', text: 'Look at this' }); + expect(parts[1]).toEqual({ + type: 'image', + source: { type: 'base64', media_type: 'image/jpeg', data: 'aGVsbG8=' }, + }); + }); + + // Positive: multimodal message with just image part when text is empty. + it('returns multimodal message with just image part when text is empty', () => { + const msg = buildUserMessage('', [pngUrlAttachment]); + + expect(msg.role).toBe('user'); + const parts = msg.content as Array<{ type: string }>; + // Empty text is omitted, only image part + expect(parts).toHaveLength(1); + expect(parts[0]).toEqual({ + type: 'image', + source: { type: 'url', media_type: 'image/png', url: 'https://example.com/image.png' }, + }); + }); + + // Positive: handles multiple image attachments. + it('handles multiple image attachments', () => { + const msg = buildUserMessage('Two images', [jpegBase64Attachment, pngUrlAttachment]); + + const parts = msg.content as Array<{ type: string }>; + expect(parts).toHaveLength(3); // text + 2 images + expect(parts[0]).toEqual({ type: 'text', text: 'Two images' }); + expect(parts[1]).toMatchObject({ type: 'image' }); + expect(parts[2]).toMatchObject({ type: 'image' }); + }); +}); + +// --------------------------------------------------------------------------- +// 4. getMessageText +// --------------------------------------------------------------------------- + +describe('getMessageText', () => { + // Positive: returns string directly for string content. 
+  it('returns string directly for string content messages', () => {
+    const text = getMessageText({ role: 'user', content: 'plain text' });
+
+    expect(text).toBe('plain text');
+  });
+
+  // Positive: text parts are concatenated in order; the image part between them is skipped.
+  it('extracts and joins text parts from multimodal messages', () => {
+    const mixedMessage: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'Hello ' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'x' } },
+        { type: 'text', text: 'World' },
+      ],
+    };
+
+    expect(getMessageText(mixedMessage)).toBe('Hello World');
+  });
+
+  // Negative: with no text parts at all, the extracted text is the empty string.
+  it('returns empty string for multimodal messages with only image parts', () => {
+    const imageOnlyMessage: Message = {
+      role: 'user',
+      content: [
+        { type: 'image', source: { type: 'url', media_type: 'image/gif', url: 'https://example.com/a.gif' } },
+      ],
+    };
+
+    expect(getMessageText(imageOnlyMessage)).toBe('');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// 5. hasImages
+// ---------------------------------------------------------------------------
+
+describe('hasImages', () => {
+  // Negative: plain string content cannot carry image parts.
+  it('returns false for string content messages', () => {
+    const detected = hasImages({ role: 'user', content: 'no images here' });
+
+    expect(detected).toBe(false);
+  });
+
+  // Negative: a parts array made only of text parts contains no image.
+  it('returns false for multimodal messages with only text parts', () => {
+    const textOnlyMessage: Message = {
+      role: 'user',
+      content: [{ type: 'text', text: 'just text' }],
+    };
+
+    expect(hasImages(textOnlyMessage)).toBe(false);
+  });
+
+  // Positive: multimodal messages with image parts are detected.
+  it('returns true for multimodal messages with image parts', () => {
+    const captionedImage: Message = {
+      role: 'user',
+      content: [
+        { type: 'text', text: 'caption' },
+        { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: 'abc' } },
+      ],
+    };
+
+    expect(hasImages(captionedImage)).toBe(true);
+  });
+});
diff --git a/src/models/media.ts b/src/models/media.ts
new file mode 100644
index 0000000..796d946
--- /dev/null
+++ b/src/models/media.ts
@@ -0,0 +1,91 @@
+/**
+ * Helpers for turning channel attachments into model message content.
+ */
+
+import type { Attachment } from '../channels/types.js';
+import type { MessageContentPart, ImageSource, Message } from './types.js';
+
+/** Image MIME types that are converted to image parts; all others are ignored. */
+const SUPPORTED_IMAGE_TYPES = new Set([
+  'image/jpeg',
+  'image/png',
+  'image/gif',
+  'image/webp',
+]);
+
+/** Report whether the attachment's mimeType is in SUPPORTED_IMAGE_TYPES (exact match). */
+export function isSupportedImage(attachment: Attachment): boolean {
+  const { mimeType } = attachment;
+  return SUPPORTED_IMAGE_TYPES.has(mimeType);
+}
+
+/**
+ * Map a channel Attachment to a model ImageSource.
+ * Inline base64 data takes precedence over a URL.
+ * @returns null when the MIME type is unsupported or neither data nor url is set.
+ */
+export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
+  if (!isSupportedImage(attachment)) {
+    return null;
+  }
+
+  const { mimeType, data, url } = attachment;
+  if (data) {
+    return { type: 'base64', media_type: mimeType, data };
+  }
+  return url ? { type: 'url', media_type: mimeType, url } : null;
+}
+
+/**
+ * Assemble the user Message for a piece of text plus optional attachments.
+ * Only attachments that convert to an image source are kept.
+ * With no usable image the content stays a plain string; otherwise it is a
+ * parts array holding the text part (omitted when text is empty) and then the images.
+ */
+export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
+  const imageParts: MessageContentPart[] = [];
+  for (const attachment of attachments ?? []) {
+    const source = attachmentToImageSource(attachment);
+    if (source !== null) {
+      imageParts.push({ type: 'image', source });
+    }
+  }
+
+  // Text-only path: keep the plain string shape (preserves backward compat).
+  if (imageParts.length === 0) {
+    return { role: 'user', content: text };
+  }
+
+  // Multimodal path: optional text part first, images after it.
+  const textParts: MessageContentPart[] = text ? [{ type: 'text', text }] : [];
+  return { role: 'user', content: [...textParts, ...imageParts] };
+}
+
+/**
+ * Return the textual content of a Message in either content format.
+ * String content is returned unchanged; for a parts array the text parts
+ * are joined with no separator and non-text parts are ignored.
+ */
+export function getMessageText(message: Message): string {
+  const { content } = message;
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  const fragments: string[] = [];
+  for (const part of content) {
+    if (part.type === 'text') {
+      fragments.push(part.text);
+    }
+  }
+  return fragments.join('');
+}
+
+/**
+ * Tell whether a message carries at least one image part.
+ * String content never does.
+ */
+export function hasImages(message: Message): boolean {
+  const { content } = message;
+  return typeof content !== 'string' && content.some((part) => part.type === 'image');
+}
diff --git a/src/models/openai.ts b/src/models/openai.ts
index 51faeac..dfdd032 100644
--- a/src/models/openai.ts
+++ b/src/models/openai.ts
@@ -1,5 +1,5 @@
 import OpenAI from 'openai';
-import type { ChatRequest, ChatResponse, ModelClient } from './types.js';
+import type { ChatRequest, ChatResponse, ModelClient, MessageContentPart } from './types.js';
 
 export interface OpenAIClientConfig {
   apiKey?: string;
@@ -8,6 +8,31 @@ export interface OpenAIClientConfig {
   baseURL?: string;
 }
 
+/**
+ * Convert Flynn message content to OpenAI format.
+ * OpenAI uses { type: 'text', text } and { type: 'image_url', image_url: { url } } parts.
+ * @throws Error if an image source lacks its payload, instead of sending the literal text "undefined".
+ */
+function toOpenAIContent(content: string | MessageContentPart[]): string | OpenAI.ChatCompletionContentPart[] {
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  return content.map((part): OpenAI.ChatCompletionContentPart => {
+    if (part.type === 'text') {
+      return { type: 'text', text: part.text };
+    }
+    if (part.type === 'image') {
+      const { type, media_type: mediaType, data, url } = part.source; // OpenAI accepts data URIs or regular URLs
+      const imageUrl = type === 'base64' ? (data ? `data:${mediaType};base64,${data}` : undefined) : url;
+      if (!imageUrl) throw new Error(`Image source of type "${type}" is missing its payload`);
+      return { type: 'image_url', image_url: { url: imageUrl } };
+    }
+    // Fallback — shouldn't happen
+    return { type: 'text', text: JSON.stringify(part) };
+  });
+}
+
 export class OpenAIClient implements ModelClient {
   private client: OpenAI;
   private model: string;
@@ -30,7 +55,10 @@ export class OpenAIClient implements ModelClient {
     }
 
     for (const msg of request.messages) {
-      messages.push({ role: msg.role, content: msg.content });
+      messages.push({
+        role: msg.role,
+        content: toOpenAIContent(msg.content),
+      } as OpenAI.ChatCompletionMessageParam);
     }
 
     // Build params, conditionally including tools
diff --git a/src/models/types.ts b/src/models/types.ts
index 9fbae44..152bd37 100644
--- a/src/models/types.ts
+++ b/src/models/types.ts
@@ -1,6 +1,23 @@
+/** Image source for multimodal content blocks. */
+export interface ImageSource {
+  type: 'base64' | 'url';
+  /** MIME type (e.g. "image/jpeg", "image/png", "image/webp", "image/gif"). */
+  media_type: string;
+  /** Base64-encoded image data (when type === 'base64'). */
+  data?: string;
+  /** Image URL (when type === 'url'). */
+  url?: string;
+}
+
+/** Individual content part within a multimodal message. */
+export type MessageContentPart =
+  | { type: 'text'; text: string }
+  | { type: 'image'; source: ImageSource };
+
 export interface Message {
   role: 'user' | 'assistant';
-  content: string;
+  /** String for text-only messages, or array for multimodal content. */
+  content: string | MessageContentPart[];
   timestamp?: number;
 }
 
@@ -25,6 +42,7 @@ export interface ModelToolCall {
 // Content blocks for multi-content responses
 export type ContentBlock =
   | { type: 'text'; text: string }
+  | { type: 'image'; source: ImageSource }
   | { type: 'tool_use'; id: string; name: string; input: unknown };
 
 // Tool result fed back into conversation