feat: add multimodal media pipeline for image support across all providers and channels
Widen Message.content from string to string | MessageContentPart[] to support multimodal content. Add Attachment type to channel layer, media conversion utilities, and image extraction to all channel adapters (Telegram, Discord, Slack, WhatsApp). Update all model clients (Anthropic, OpenAI, Gemini, Bedrock) to convert structured content to provider-specific formats. Fix downstream consumers (tokens, compaction, TUI, local models) to handle the widened type via getMessageText() helper.
This commit is contained in:
+34
-8
@@ -1,6 +1,6 @@
|
||||
import { GoogleGenerativeAI } from '@google/generative-ai';
|
||||
import type { GenerativeModel, Content, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai';
|
||||
import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition } from './types.js';
|
||||
import type { GenerativeModel, Content, Part, FunctionDeclaration, FunctionDeclarationSchema } from '@google/generative-ai';
|
||||
import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient, ModelToolCall, ToolDefinition, Message, MessageContentPart } from './types.js';
|
||||
|
||||
export interface GeminiClientConfig {
|
||||
apiKey?: string;
|
||||
@@ -154,12 +154,38 @@ export class GeminiClient implements ModelClient {
|
||||
}
|
||||
}
|
||||
|
||||
/** Convert Flynn's Message[] to Gemini Content[] format */
|
||||
function convertMessages(messages: { role: string; content: string }[]): Content[] {
|
||||
return messages.map(m => ({
|
||||
role: m.role === 'assistant' ? 'model' : 'user',
|
||||
parts: [{ text: m.content }],
|
||||
}));
|
||||
/**
 * Translate Flynn's Message[] into Gemini's Content[] format, including multimodal parts.
 *
 * Role mapping: `assistant` becomes `model`; every other role is sent as `user`.
 *
 * Content mapping:
 * - a plain string becomes a single text part;
 * - a `text` part stays a text part;
 * - an `image` part with a base64 source and non-empty data becomes `inlineData`;
 * - any other `image` part becomes a `[Image: <url>]` text placeholder
 *   (`unavailable` when the source carries no URL);
 * - any other part type is serialized with JSON.stringify into a text part.
 *
 * @param messages Conversation messages to convert.
 * @returns One Content entry per input message, in the same order.
 */
function convertMessages(messages: Message[]): Content[] {
  // Maps a single structured content part onto its Gemini equivalent.
  const toPart = (piece: MessageContentPart): Part => {
    switch (piece.type) {
      case 'text':
        return { text: piece.text };

      case 'image': {
        const { source } = piece;

        // Anything that is not base64 with actual data (e.g. URL-based images) is
        // not sent as inline data; it is passed along as a text description instead.
        // A fuller implementation would fetch the URL and base64-encode the bytes.
        if (source.type !== 'base64' || !source.data) {
          return { text: `[Image: ${source.url ?? 'unavailable'}]` };
        }

        return {
          inlineData: {
            mimeType: source.media_type,
            data: source.data,
          },
        };
      }

      default:
        // Unrecognized part kinds are forwarded as their JSON text.
        return { text: JSON.stringify(piece) };
    }
  };

  const converted: Content[] = [];

  for (const message of messages) {
    const role = message.role === 'assistant' ? 'model' : 'user';
    const { content } = message;

    const parts: Part[] =
      typeof content === 'string'
        ? [{ text: content }]
        : content.map(piece => toPart(piece));

    converted.push({ role, parts });
  }

  return converted;
}
|
||||
|
||||
/** Convert Flynn's ToolDefinition to Gemini FunctionDeclaration format */
|
||||
|
||||
Reference in New Issue
Block a user