// flynn/src/models/media.ts

/**
 * Media utilities for converting channel attachments to model content parts.
 */

import type { Attachment } from '../channels/types.js';
import type { MessageContentPart, ImageSource, Message } from './types.js';

/**
 * Image MIME types this module treats as acceptable vision-model input.
 * Membership is tested by exact string match.
 */
const SUPPORTED_IMAGE_TYPES = new Set<string>(
  ['image/jpeg', 'image/png', 'image/gif', 'image/webp'],
);

/**
 * Audio MIME types this module recognises as transcribable attachments.
 * Membership is tested by exact string match.
 */
const SUPPORTED_AUDIO_TYPES = new Set<string>([
  'audio/ogg',
  'audio/mpeg', 'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4', 'audio/x-m4a',
]);

/**
 * Report whether an attachment's MIME type is one of the supported image types.
 *
 * @param attachment - Attachment whose `mimeType` is looked up (exact string match).
 * @returns `true` when the MIME type is in `SUPPORTED_IMAGE_TYPES`, otherwise `false`.
 */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}

/**
 * Report whether an attachment's MIME type is one of the supported audio types.
 *
 * @param attachment - Attachment whose `mimeType` is looked up (exact string match).
 * @returns `true` when the MIME type is in `SUPPORTED_AUDIO_TYPES`, otherwise `false`.
 */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}

/**
 * Lookup table from bare, lower-case audio MIME type to file extension.
 * A Map is used instead of an object literal so that keys inherited from
 * Object.prototype ("constructor", "toString", "__proto__", ...) can never match.
 */
const AUDIO_MIME_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert MIME type to file extension.
 *
 * The input is normalised before lookup: any parameters after the first `;`
 * (e.g. `audio/ogg; codecs=opus`) are dropped, surrounding whitespace is
 * trimmed, and the type is lower-cased.
 *
 * @param mime - MIME type string, optionally carrying parameters.
 * @returns The extension without a leading dot, or `'bin'` for any type not in the table.
 */
export function mimeToExtension(mime: string): string {
  // split() always yields at least one element; the `?? ''` only satisfies strict index checks.
  const essence = (mime.split(';', 1)[0] ?? '').trim().toLowerCase();
  return AUDIO_MIME_EXTENSIONS.get(essence) ?? 'bin';
}

/**
 * Turn a channel Attachment into a model ImageSource.
 *
 * Inline base64 `data` takes priority; `url` is used only when no (non-empty)
 * data is present.
 *
 * @param attachment - Attachment to convert.
 * @returns A base64 or URL image source, or `null` when the attachment is not a
 *   supported image or carries neither data nor a URL.
 */
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
  if (!isSupportedImage(attachment)) return null;

  const { mimeType, data, url } = attachment;

  if (data) {
    return { type: 'base64', media_type: mimeType, data };
  }

  return url ? { type: 'url', media_type: mimeType, url } : null;
}

/**
 * Assemble a user Message from text plus optional attachments.
 *
 * Attachments that convert to an image source become `image` content parts;
 * all others are ignored. When no image part results, the message content is
 * the plain `text` string. Otherwise the content is an array holding the text
 * part (omitted when `text` is empty) followed by the image parts in
 * attachment order.
 *
 * @param text - Message text.
 * @param attachments - Optional channel attachments.
 * @returns A Message with role `'user'`.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const images = (attachments ?? []).flatMap((att): MessageContentPart[] => {
    const source = attachmentToImageSource(att);
    return source ? [{ type: 'image', source }] : [];
  });

  // Plain-string content keeps text-only messages backward compatible.
  if (images.length === 0) {
    return { role: 'user', content: text };
  }

  // Text (if any) leads, images follow.
  const content: MessageContentPart[] = text
    ? [{ type: 'text', text }, ...images]
    : images;

  return { role: 'user', content };
}

/**
 * Return the textual content of a Message, whichever content format it uses.
 *
 * @param message - Message whose content is a string or an array of content parts.
 * @returns The string content as-is, or the `text` of every `text` part joined
 *   with no separator (non-text parts are skipped).
 */
export function getMessageText(message: Message): string {
  const { content } = message;
  if (typeof content === 'string') {
    return content;
  }

  const textParts = content.filter(
    (part): part is { type: 'text'; text: string } => part.type === 'text',
  );
  const fragments = textParts.map(({ text }) => text);
  return fragments.join('');
}

/**
 * Configuration for audio transcription via Whisper-compatible API.
 *
 * Every field is optional. `transcribeAudio` in this module skips the network
 * call and returns a placeholder string when `endpoint` is not set.
 */
export interface AudioTranscriptionConfig {
  /**
   * Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions").
   * The audio is POSTed to this URL as multipart form data.
   */
  endpoint?: string;
  /** API key for the transcription service; when set it is sent as an `Authorization: Bearer <key>` header. */
  apiKey?: string;
  /** Model name sent in the form's `model` field (default: "whisper-1") */
  model?: string;
}

/**
 * Transcribe an audio attachment to text using the OpenAI-compatible
 * /v1/audio/transcriptions API.
 *
 * The attachment's base64 `data` is decoded and POSTed as multipart form data
 * (fields `file` and `model`) to `config.endpoint`. Failures are logged with
 * `console.error` and reported through a placeholder string rather than thrown,
 * so callers can always embed the result in a message.
 *
 * @param attachment - Audio attachment; must carry inline base64 `data`.
 * @param config - Transcription settings. Without `config.endpoint` no request is made.
 * @returns The transcript text, or one of two placeholder strings:
 *   - `'[Audio message received but no transcription service is configured]'`
 *     when no endpoint is configured;
 *   - `'[Audio message transcription failed]'` when the attachment has no data,
 *     the request fails, the service answers with a non-2xx status, or the
 *     response body has no string `text` field.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }

  try {
    // Guard explicitly instead of asserting non-null: a URL-only attachment
    // would otherwise surface as an opaque Buffer.from TypeError.
    if (!attachment.data) {
      throw new Error('attachment has no inline base64 data');
    }

    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    // Content-Type is left to fetch so the multipart boundary is set correctly.
    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // Validate the payload shape; an unexpected body must not leak `undefined`
    // into a value typed (and later rendered) as a string.
    const payload: unknown = await res.json();
    const transcript =
      typeof payload === 'object' && payload !== null
        ? (payload as { text?: unknown }).text
        : undefined;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a "text" string');
    }
    return transcript;
  } catch (error) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : 'Unknown error',
    );
    return '[Audio message transcription failed]';
  }
}

/**
 * Build a multimodal Message from text + attachments, with optional audio transcription.
 *
 * Audio: when `audioConfig.endpoint` is set, every supported audio attachment
 * is transcribed and the transcripts are placed in front of `text`, each as
 * `[Voice message]: <transcript>` followed by a blank line, in the same order
 * as the attachments. Transcriptions are started concurrently. Without an
 * endpoint, audio attachments are ignored.
 *
 * Images: attachments that convert to an image source become `image` content
 * parts, exactly as in `buildUserMessage`.
 *
 * @param text - Message text typed by the user (may be empty).
 * @param attachments - Optional channel attachments.
 * @param audioConfig - Optional transcription settings.
 * @returns A Message with role `'user'`: plain string content when there are
 *   no image parts, otherwise an array with the text part (omitted when the
 *   final text is empty) followed by the image parts.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const allAttachments = attachments ?? [];

  // Transcribe audio attachments and put the transcripts ahead of the text
  // (only if an endpoint is configured).
  let processedText = text;
  if (audioConfig?.endpoint) {
    const audioAttachments = allAttachments.filter(a => isSupportedAudio(a));
    // transcribeAudio reports failures via placeholder strings instead of
    // rejecting, so Promise.all cannot short-circuit here. It also preserves
    // input order, which keeps voice messages in the order they were sent.
    const transcripts = await Promise.all(
      audioAttachments.map(a => transcribeAudio(a, audioConfig)),
    );
    const voicePrefix = transcripts
      .map(transcript => `[Voice message]: ${transcript}\n\n`)
      .join('');
    processedText = `${voicePrefix}${text}`;
  }

  // Convert image attachments to content parts. attachmentToImageSource
  // returns null for anything that is not a supported image, so no separate
  // pre-filter is needed.
  const imageParts: MessageContentPart[] = [];
  for (const att of allAttachments) {
    const source = attachmentToImageSource(att);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No images — return simple text message (which may include transcripts)
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first, then images
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}

/**
 * Tell whether a message carries at least one image content part.
 *
 * @param message - Message whose content is a string or an array of content parts.
 * @returns `false` for string content; for array content, `true` when any part
 *   has type `'image'`.
 */
export function hasImages(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some(({ type }) => type === 'image');
}