// flynn/src/models/media.ts

/**
 * Media utilities for converting channel attachments to model content parts.
 */

import type { Attachment } from '../channels/types.js';
import type { MessageContentPart, ImageSource, AudioSource, Message } from './types.js';

/**
 * Image MIME types treated as valid vision input.
 * Membership is an exact string match; the set is never mutated after creation.
 */
const SUPPORTED_IMAGE_TYPES: ReadonlySet<string> = new Set<string>([
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
]);

/**
 * Audio MIME types treated as valid audio input.
 * Membership is an exact string match; the set is never mutated after creation.
 */
const SUPPORTED_AUDIO_TYPES: ReadonlySet<string> = new Set<string>([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);

/**
 * Tell whether an attachment's MIME type is one of the supported image types.
 *
 * @param attachment - Channel attachment; only its `mimeType` is inspected.
 * @returns `true` when `mimeType` is a member of SUPPORTED_IMAGE_TYPES.
 */
export function isSupportedImage(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_IMAGE_TYPES.has(mimeType);
}

/**
 * Tell whether an attachment's MIME type is one of the supported audio types.
 *
 * @param attachment - Channel attachment; only its `mimeType` is inspected.
 * @returns `true` when `mimeType` is a member of SUPPORTED_AUDIO_TYPES.
 */
export function isSupportedAudio(attachment: Attachment): boolean {
  const { mimeType } = attachment;
  return SUPPORTED_AUDIO_TYPES.has(mimeType);
}

/** Audio MIME type -> conventional file extension (no leading dot). Built once at module load. */
const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map<string, string>([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/**
 * Convert a MIME type to a file extension (without the leading dot).
 *
 * Only the audio types in the table above are recognised; every other input
 * maps to `'bin'`. The lookup uses a Map rather than a plain object so that
 * strings naming inherited Object members (e.g. "constructor", "toString")
 * fall through to `'bin'` instead of returning an inherited function.
 *
 * @param mime - MIME type, matched exactly (case-sensitive, parameters not stripped).
 * @returns The mapped extension, or `'bin'` when the type is unknown.
 */
export function mimeToExtension(mime: string): string {
  return AUDIO_EXTENSION_BY_MIME.get(mime) ?? 'bin';
}

/**
 * Turn a channel Attachment into a model ImageSource.
 *
 * Inline base64 `data` wins over `url` when both are present.
 *
 * @param attachment - Attachment to convert.
 * @returns A base64 or URL image source, or `null` when the MIME type is not a
 *   supported image type or the attachment carries neither `data` nor `url`.
 */
export function attachmentToImageSource(attachment: Attachment): ImageSource | null {
  const { mimeType, data, url } = attachment;

  if (isSupportedImage(attachment)) {
    if (data) {
      return { type: 'base64', media_type: mimeType, data };
    }
    if (url) {
      return { type: 'url', media_type: mimeType, url };
    }
  }

  return null;
}

/**
 * Turn a channel Attachment into a model AudioSource.
 *
 * Audio is only passed along as inline base64 `data`; a URL alone is not enough.
 *
 * @param attachment - Attachment to convert.
 * @returns The audio source, or `null` when the MIME type is not a supported
 *   audio type or the attachment has no `data`.
 */
export function attachmentToAudioSource(attachment: Attachment): AudioSource | null {
  const { mimeType, data } = attachment;

  if (isSupportedAudio(attachment) && data) {
    return { media_type: mimeType, data };
  }

  return null;
}

/**
 * Build a user Message from text plus optional attachments.
 *
 * Each attachment is tried as an image first, then as audio; attachments that
 * convert to neither are ignored. When no media part results, the message
 * keeps plain string content. Otherwise the content is an array with the text
 * part (if `text` is non-empty) followed by the media parts in attachment order.
 *
 * @param text - Message text; may be empty.
 * @param attachments - Optional channel attachments.
 * @returns A `role: 'user'` message with string or structured content.
 */
export function buildUserMessage(text: string, attachments?: Attachment[]): Message {
  const media: MessageContentPart[] = [];

  for (const attachment of attachments ?? []) {
    const image = attachmentToImageSource(attachment);
    if (image) {
      media.push({ type: 'image', source: image });
    } else {
      const audio = attachmentToAudioSource(attachment);
      if (audio) {
        media.push({ type: 'audio', source: audio });
      }
    }
  }

  // Text-only messages stay as a bare string for backward compatibility.
  if (media.length === 0) {
    return { role: 'user', content: text };
  }

  // Multimodal layout: optional text part first, media afterwards.
  const content: MessageContentPart[] = text
    ? [{ type: 'text', text }, ...media]
    : [...media];

  return { role: 'user', content };
}

/**
 * Return the textual content of a Message, whatever its content format.
 *
 * String content is returned unchanged. For structured content, the `text` of
 * every text part is concatenated in order with no separator; all other part
 * types are ignored.
 *
 * @param message - Message to read.
 * @returns The message's text (empty string when there are no text parts).
 */
export function getMessageText(message: Message): string {
  const { content } = message;
  if (typeof content === 'string') {
    return content;
  }

  const pieces: string[] = [];
  for (const part of content) {
    if (part.type === 'text') {
      pieces.push(part.text);
    }
  }
  return pieces.join('');
}

/** JSON-serialise a value for embedding in a prompt, falling back to String() when JSON cannot represent it. */
function stringifyForPrompt(value: unknown): string {
  try {
    // JSON.stringify yields undefined (not a throw) for undefined/functions/symbols.
    return JSON.stringify(value) ?? String(value);
  } catch {
    // e.g. circular structures or BigInt values
    return String(value);
  }
}

/**
 * Flatten a tool_result `content` payload to text.
 * Strings pass through, null/undefined become '', arrays contribute the text
 * of their text blocks (other items are JSON-serialised) joined by newlines,
 * and any other value is JSON-serialised.
 */
function toolResultContentToText(content: unknown): string {
  if (content === null || content === undefined) {
    return '';
  }
  if (typeof content === 'string') {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .map((item: unknown) => {
        if (typeof item === 'string') {
          return item;
        }
        const candidate = item as { type?: unknown; text?: unknown } | null;
        if (candidate && candidate.type === 'text' && typeof candidate.text === 'string') {
          return candidate.text;
        }
        return stringifyForPrompt(item);
      })
      .join('\n');
  }
  return stringifyForPrompt(content);
}

/**
 * Serialize a message's content to a plain string, including tool_use and
 * tool_result structured blocks that getMessageText() would discard.
 * This is needed for local model backends (llama.cpp, Ollama) whose chat
 * templates don't understand Anthropic-style structured content blocks.
 *
 * Rendering per block type (blocks are joined with a single newline):
 * - text        -> the text itself
 * - tool_use    -> `[Calling tool: name(jsonArgs)]`
 * - tool_result -> `[Tool result: content]` or `[Tool result (error): content]`;
 *                  non-string content (arrays of blocks, objects) is flattened
 *                  to text instead of being coerced to "[object Object]"
 * - audio       -> `[Voice message]: transcript`, or `[Audio attachment]` when
 *                  the source has no transcript
 * Blocks of any other type are skipped.
 *
 * @param message - Message to serialize.
 * @returns String content unchanged, or the rendered blocks joined by '\n'.
 */
export function getMessageTextWithTools(message: Message): string {
  if (typeof message.content === 'string') {
    return message.content;
  }

  const parts: string[] = [];
  for (const block of message.content as Record<string, unknown>[]) {
    if (block.type === 'text' && typeof block.text === 'string') {
      parts.push(block.text);
    } else if (block.type === 'tool_use') {
      const name = block.name as string;
      parts.push(`[Calling tool: ${name}(${stringifyForPrompt(block.input)})]`);
    } else if (block.type === 'tool_result') {
      const content = toolResultContentToText(block.content);
      const isError = block.is_error ? ' (error)' : '';
      parts.push(`[Tool result${isError}: ${content}]`);
    } else if (block.type === 'audio') {
      const source = block.source as Record<string, unknown>;
      if (source?.transcript) {
        parts.push(`[Voice message]: ${source.transcript}`);
      } else {
        parts.push('[Audio attachment]');
      }
    }
  }
  return parts.join('\n');
}

/**
 * Flattened chat message: a role plus plain-string content, with no structured
 * content parts. This is the element type returned by normalizeMessagesForLocal().
 */
interface SimpleMessage {
  /** Who the message is attributed to. */
  role: 'system' | 'user' | 'assistant';
  /** Message body as plain text. */
  content: string;
}

/**
 * Flatten a conversation for local model backends that require strict role
 * alternation (system? -> user -> assistant -> user -> ...).
 *
 * Steps:
 * 1. Emits the system prompt (when non-empty) as the first message.
 * 2. Serializes each message, including tool_use/tool_result blocks, to text.
 * 3. Drops messages whose serialized text is empty.
 * 4. Merges consecutive messages of the same role, separated by a blank line.
 *
 * @param system - Optional system prompt.
 * @param messages - Conversation messages in order.
 * @returns Plain-text messages in which no two neighbours share a role.
 */
export function normalizeMessagesForLocal(
  system: string | undefined,
  messages: Message[],
): SimpleMessage[] {
  const normalized: SimpleMessage[] = system ? [{ role: 'system', content: system }] : [];

  for (const message of messages) {
    const serialized = getMessageTextWithTools(message);
    if (serialized === '') {
      // Nothing to say — skip so it cannot break alternation.
      continue;
    }

    const previous: SimpleMessage | undefined = normalized[normalized.length - 1];
    if (previous !== undefined && previous.role === message.role) {
      // Same speaker twice in a row: fold into the earlier entry.
      previous.content = `${previous.content}\n\n${serialized}`;
    } else {
      normalized.push({ role: message.role, content: serialized });
    }
  }

  return normalized;
}

/**
 * Configuration for audio transcription via Whisper-compatible API.
 * All fields are optional; see transcribeAudio() for how each one is used.
 */
export interface AudioTranscriptionConfig {
  /**
   * Whisper-compatible API endpoint (e.g. "https://api.openai.com/v1/audio/transcriptions").
   * When absent, transcribeAudio() makes no request and returns a placeholder string.
   */
  endpoint?: string;
  /**
   * API key for the transcription service. When set it is sent as an
   * `Authorization: Bearer <key>` header; when absent no Authorization header is sent.
   */
  apiKey?: string;
  /** Model name sent as the `model` form field (default: "whisper-1") */
  model?: string;
}

/**
 * Transcribe an audio attachment to text using the OpenAI-compatible /v1/audio/transcriptions API.
 *
 * The attachment's base64 `data` is decoded and POSTed as multipart form data
 * (`file` and `model` fields). Failures are logged and reported through a
 * placeholder string rather than thrown.
 *
 * @param attachment - Audio attachment; must carry base64 `data`.
 * @param config - Transcription settings. Without an `endpoint` no request is made.
 * @returns The transcript, or one of two placeholders:
 *   - '[Audio message received but no transcription service is configured]'
 *     when no endpoint is configured;
 *   - '[Audio message transcription failed]' when the attachment has no data,
 *     the request fails, the response status is not OK, or the response body
 *     does not contain a string `text` field.
 */
export async function transcribeAudio(
  attachment: Attachment,
  config?: AudioTranscriptionConfig,
): Promise<string> {
  if (!config?.endpoint) {
    return '[Audio message received but no transcription service is configured]';
  }
  if (!attachment.data) {
    return '[Audio message transcription failed]';
  }

  try {
    const audioBuffer = Buffer.from(attachment.data, 'base64');
    const ext = mimeToExtension(attachment.mimeType);
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], { type: attachment.mimeType }), `audio.${ext}`);
    formData.append('model', config.model ?? 'whisper-1');

    // Authorization is optional so unauthenticated local servers also work.
    const headers: Record<string, string> = {};
    if (config.apiKey) {
      headers['Authorization'] = `Bearer ${config.apiKey}`;
    }

    const res = await fetch(config.endpoint, { method: 'POST', body: formData, headers });
    if (!res.ok) {
      throw new Error(`Transcription failed: ${res.status} ${res.statusText}`);
    }

    // The body is external input: verify its shape instead of asserting it, so
    // a malformed response takes the failure path rather than returning
    // `undefined` disguised as a string.
    const json: unknown = await res.json();
    const transcript = (json as { text?: unknown } | null)?.text;
    if (typeof transcript !== 'string') {
      throw new Error('Transcription response did not contain a "text" string');
    }
    return transcript;
  } catch (error) {
    console.error(
      `Failed to transcribe audio (${attachment.mimeType}):`,
      error instanceof Error ? error.message : 'Unknown error',
    );
    return '[Audio message transcription failed]';
  }
}

/**
 * Build a user Message from text + attachments, transcribing audio to text.
 *
 * When `audioConfig.endpoint` is set, every supported audio attachment is
 * transcribed and the transcripts are placed before the user's text, one
 * `[Voice message]: ...` paragraph each, in attachment order. Without an
 * endpoint, audio attachments are ignored. Supported image attachments are
 * converted to image content parts.
 *
 * @param text - Message text; may be empty.
 * @param attachments - Optional channel attachments.
 * @param audioConfig - Optional transcription settings.
 * @returns A `role: 'user'` message. Content is a plain string when there are
 *   no image parts; otherwise an array with the text part (if non-empty)
 *   followed by the image parts.
 */
export async function buildUserMessageWithAudio(
  text: string,
  attachments?: Attachment[],
  audioConfig?: AudioTranscriptionConfig,
): Promise<Message> {
  const allAttachments = attachments ?? [];

  // Transcribe audio in attachment order. Each transcript is collected first
  // and the whole prefix is built once, so several voice messages keep their
  // original order instead of ending up reversed.
  const voiceParagraphs: string[] = [];
  if (audioConfig?.endpoint) {
    for (const attachment of allAttachments) {
      if (!isSupportedAudio(attachment)) {
        continue;
      }
      const transcript = await transcribeAudio(attachment, audioConfig);
      voiceParagraphs.push(`[Voice message]: ${transcript}\n\n`);
    }
  }
  const processedText = voiceParagraphs.join('') + text;

  // attachmentToImageSource() returns null for anything that is not a
  // supported image, so no separate pre-filter is needed.
  const imageParts: MessageContentPart[] = [];
  for (const attachment of allAttachments) {
    const source = attachmentToImageSource(attachment);
    if (source) {
      imageParts.push({ type: 'image', source });
    }
  }

  // No images — audio (if any) is already folded into the text, so a simple
  // string message is enough.
  if (imageParts.length === 0) {
    return { role: 'user', content: processedText };
  }

  // Build multimodal content: text first, then images
  const parts: MessageContentPart[] = [];
  if (processedText) {
    parts.push({ type: 'text', text: processedText });
  }
  parts.push(...imageParts);

  return { role: 'user', content: parts };
}

/**
 * Report whether a message carries at least one image content part.
 *
 * @param message - Message to inspect.
 * @returns `false` for string content; otherwise whether any part has type 'image'.
 */
export function hasImages(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some(part => part.type === 'image');
}

/**
 * Report whether a message carries at least one audio content part.
 *
 * @param message - Message to inspect.
 * @returns `false` for string content; otherwise whether any part has type 'audio'.
 */
export function hasAudio(message: Message): boolean {
  const { content } = message;
  return typeof content !== 'string' && content.some(part => part.type === 'audio');
}

/**
 * Replace every audio part of a message with a text part.
 * Used for model providers that don't support native audio input (Anthropic, Bedrock, local).
 *
 * An audio part with a transcript becomes `[Voice message]: <transcript>`; one
 * without becomes a fixed "no transcript available" notice. All other parts
 * are kept as they are, in their original order.
 *
 * @param message - Message to convert; it is not mutated.
 * @returns The same message object when its content is a string. Otherwise a
 *   copy whose content is the converted parts, collapsed to a plain string
 *   when exactly one text part remains.
 */
export function stripAudioParts(message: Message): Message {
  const { content } = message;
  if (typeof content === 'string') {
    return message;
  }

  const converted = content.map((part): MessageContentPart => {
    if (part.type !== 'audio') {
      return part;
    }
    const { transcript } = part.source;
    return {
      type: 'text',
      text: transcript
        ? `[Voice message]: ${transcript}`
        : '[Audio message received but no transcript available]',
    };
  });

  // A lone text part is equivalent to plain string content.
  const [only] = converted;
  if (converted.length === 1 && only.type === 'text') {
    return { ...message, content: only.text };
  }

  return { ...message, content: converted };
}