// flynn/src/tools/builtin/audio-transcribe.ts

import type { Tool, ToolResult } from '../types.js';

/**
 * Arguments accepted by the `audio.transcribe` tool.
 *
 * Exactly one of `data` or `url` must be supplied; `validateInput` rejects
 * calls that provide neither or both.
 */
interface AudioTranscribeArgs {
  /** Base64-encoded audio bytes. Requires `mime_type` to be set as well. */
  data?: string;
  /** http(s) URL the audio is downloaded from. */
  url?: string;
  /** MIME type of the audio; must be one of `SUPPORTED_MIME_TYPES` when given. */
  mime_type?: string;
  /** Optional language code forwarded to the transcription endpoint as `language`. */
  language?: string;
  /** Optional guidance text forwarded to the transcription endpoint as `prompt`. */
  prompt?: string;
}

/**
 * MIME types accepted for the `mime_type` argument. Any other value is
 * rejected during input validation, and the full list is echoed back in the
 * resulting error message.
 */
const SUPPORTED_MIME_TYPES = new Set([
  'audio/ogg',
  'audio/mpeg',
  'audio/mp3',
  'audio/wav',
  'audio/webm',
  'audio/mp4',
  'audio/x-m4a',
]);

/**
 * Transcription endpoint URLs keyed by provider name.
 *
 * Within this file only the `ollama` entry is read: the configured endpoint is
 * compared against it (exact string match) to decide whether to send a JSON
 * request instead of a multipart upload.
 * NOTE(review): the remaining entries are presumably presets consumed by
 * configuration code elsewhere — confirm before removing any of them.
 */
const PROVIDER_ENDPOINTS: Record<string, string> = {
  openai: 'https://api.openai.com/v1/audio/transcriptions',
  groq: 'https://api.groq.com/openai/v1/audio/transcriptions',
  ollama: 'http://localhost:11434/api/generate',
  llamacpp: 'http://localhost:8080/v1/audio/transcriptions',
};

/**
 * Checks that a caller-supplied download URL is an http(s) URL whose host is
 * not a literal loopback, unspecified, private, or link-local address.
 *
 * Only the hostname text is inspected. No DNS resolution is performed, so a
 * public name that resolves to an internal address is not detected here.
 *
 * @param url - The URL string to check.
 * @returns `{ valid: true }` when the URL is allowed, otherwise
 *   `{ valid: false, error }` with a human-readable reason.
 */
function validateUrl(url: string): { valid: boolean; error?: string } {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return { valid: false, error: `Invalid URL: ${url}` };
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return { valid: false, error: `Only http/https URLs are allowed, got ${parsed.protocol}` };
  }

  const loopbackError = 'URLs pointing to localhost are not allowed';
  const privateError = 'URLs pointing to private/internal networks are not allowed';

  // The URL parser keeps a trailing root dot on domain names ("localhost.").
  let hostname = parsed.hostname.toLowerCase().replace(/\.$/, '');

  // IPv6 literals are reported wrapped in brackets, e.g. "[::1]".
  if (hostname.startsWith('[') && hostname.endsWith(']')) {
    const ipv6 = hostname.slice(1, -1);

    // IPv4-mapped addresses are serialized as "::ffff:HHHH:HHHH"; unpack them
    // so the IPv4 range checks below apply to the embedded address.
    const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(ipv6);
    if (mapped) {
      const high = parseInt(mapped[1] ?? '0', 16);
      const low = parseInt(mapped[2] ?? '0', 16);
      hostname = `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
    } else {
      // "::1" is loopback, "::" is the unspecified address.
      if (ipv6 === '::1' || ipv6 === '::') {
        return { valid: false, error: loopbackError };
      }
      // fc00::/7 (unique local) and fe80::/10 (link-local).
      if (/^(f[cd][0-9a-f]{2}|fe[89ab][0-9a-f]):/.test(ipv6)) {
        return { valid: false, error: privateError };
      }
      return { valid: true };
    }
  }

  if (hostname === 'localhost' || hostname.endsWith('.localhost')) {
    return { valid: false, error: loopbackError };
  }

  // The URL parser normalizes every IPv4 spelling (hex, octal, single integer)
  // to dotted-decimal, so matching a dotted quad is sufficient. Requiring a
  // full quad also avoids rejecting ordinary domains such as "10.example.com".
  const quad = /^(\d{1,3})\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(hostname);
  if (quad) {
    const first = Number(quad[1]);
    const second = Number(quad[2]);

    // 127.0.0.0/8 is loopback; 0.0.0.0/8 refers to "this host".
    if (first === 127 || first === 0) {
      return { valid: false, error: loopbackError };
    }

    const isPrivate =
      first === 10 ||
      (first === 172 && second >= 16 && second <= 31) ||
      (first === 192 && second === 168) ||
      (first === 169 && second === 254);
    if (isPrivate) {
      return { valid: false, error: privateError };
    }
  }

  return { valid: true };
}

/**
 * Validates the combination of arguments passed to the tool.
 *
 * Checks run in this order: at least one source given, not both sources given,
 * `mime_type` present alongside `data`, `mime_type` (if any) is supported, and
 * finally the URL (if any) passes `validateUrl`.
 *
 * @param args - The tool arguments to check.
 * @returns `{ valid: true }` on success, otherwise `{ valid: false, error }`
 *   describing the first failed check.
 */
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
  const reject = (error: string): { valid: boolean; error?: string } => ({ valid: false, error });

  // An empty string counts as "not provided" for both sources.
  const dataGiven = !(args.data === undefined || args.data === '');
  const urlGiven = !(args.url === undefined || args.url === '');

  if (!(dataGiven || urlGiven)) {
    return reject('Either data or url must be provided');
  }
  if (dataGiven && urlGiven) {
    return reject('Only one of data or url can be provided');
  }
  if (dataGiven && !args.mime_type) {
    return reject('mime_type is required when using data');
  }

  if (args.mime_type && !SUPPORTED_MIME_TYPES.has(args.mime_type)) {
    return reject(`Unsupported MIME type: ${args.mime_type}. Supported: ${Array.from(SUPPORTED_MIME_TYPES).join(', ')}`);
  }

  if (!urlGiven) {
    return { valid: true };
  }

  // Pass the URL validator's own failure object straight through.
  const urlCheck = validateUrl(args.url as string);
  return urlCheck.valid ? { valid: true } : urlCheck;
}

/** Connection settings for the transcription backend used by the tool. */
interface AudioTranscriptionConfig {
  /** URL that transcription requests are POSTed to; the tool reports an error when it is unset. */
  endpoint?: string;
  /** When set, sent as an `Authorization: Bearer …` header on the multipart request. */
  apiKey?: string;
  /** Model name sent with the request; `'whisper-1'` is used when omitted. */
  model?: string;
}

/** Upload-filename extension for each MIME type the tool accepts. */
const MIME_TYPE_EXTENSIONS = new Map<string, string>([
  ['audio/ogg', 'ogg'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/webm', 'webm'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
]);

/** Maps a MIME type or Content-Type header value to a filename extension, if known. */
function extensionForMimeType(mimeType: string): string | undefined {
  // Drop parameters such as "; codecs=opus" before the lookup.
  const essence = (mimeType.split(';')[0] ?? '').trim().toLowerCase();
  return MIME_TYPE_EXTENSIONS.get(essence);
}

/** Extracts the extension of the last path segment of a URL, ignoring query and fragment. */
function extensionFromUrl(url: string): string | undefined {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return undefined;
  }
  const lastSegment = pathname.slice(pathname.lastIndexOf('/') + 1);
  return /\.([A-Za-z0-9]+)$/.exec(lastSegment)?.[1];
}

/**
 * Builds the `audio.transcribe` tool.
 *
 * The tool takes either base64 audio (`data` + `mime_type`) or a `url` to
 * download, and POSTs the audio to the configured endpoint as a multipart
 * upload (`file`, `model`, optional `language` and `prompt`). When the
 * endpoint equals `PROVIDER_ENDPOINTS.ollama`, a JSON request is sent instead.
 *
 * @param audioConfig - Endpoint, API key, and model. Without an `endpoint`
 *   every call returns a configuration error.
 * @returns A tool whose `execute` never rejects for expected failures: they
 *   are reported as `{ success: false, output: '', error }`.
 */
export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig): Tool {
  return {
    name: 'audio.transcribe',
    description: 'Transcribe an audio file to text using a Whisper-compatible API. Accepts base64-encoded audio data or a URL to download to audio file.',
    inputSchema: {
      type: 'object',
      properties: {
        data: {
          type: 'string',
          description: 'Base64-encoded audio data (alternative to url)',
        },
        url: {
          type: 'string',
          description: 'URL to download to audio file (alternative to data)',
        },
        mime_type: {
          type: 'string',
          description: 'MIME type of audio file (required when using data, e.g., audio/ogg, audio/mpeg, audio/wav)',
        },
        language: {
          type: 'string',
          description: 'Language code (e.g., en, es, fr) - optional',
        },
        prompt: {
          type: 'string',
          description: 'Optional text to guide transcription (OpenAI/Groq/custom only)',
        },
      },
    },

    execute: async (rawArgs: unknown): Promise<ToolResult> => {
      // Guard before touching properties: a null/primitive argument would
      // otherwise throw outside the try block and reject the promise.
      if (typeof rawArgs !== 'object' || rawArgs === null) {
        return {
          success: false,
          output: '',
          error: 'Either data or url must be provided',
        };
      }
      const args = rawArgs as AudioTranscribeArgs;

      const validation = validateInput(args);
      if (!validation.valid) {
        return {
          success: false,
          output: '',
          error: validation.error,
        };
      }

      if (!audioConfig?.endpoint) {
        return {
          success: false,
          output: '',
          error: 'Audio transcription endpoint not configured. Set audio.endpoint in config.yaml',
        };
      }

      try {
        let filename: string;
        let audioBlob: Blob;

        if (args.data) {
          const decoded = Buffer.from(args.data, 'base64');
          if (decoded.length === 0) {
            throw new Error('data does not contain any base64-encoded audio');
          }

          const mimeType = args.mime_type ?? 'audio/wav';
          filename = `audio.${extensionForMimeType(mimeType) ?? 'bin'}`;

          // Copy the bytes into a fresh array. Small Buffers are views into
          // Node's shared allocation pool, so `decoded.buffer` would hand the
          // Blob the entire pool rather than just this payload.
          audioBlob = new Blob([new Uint8Array(decoded)], { type: mimeType });
        } else if (args.url) {
          // NOTE(review): fetch follows redirects by default, and redirect
          // targets are not re-checked by validateUrl — a public URL can still
          // redirect to an internal address.
          const download = await fetch(args.url);
          if (!download.ok) {
            throw new Error(`Failed to download audio from ${args.url}: ${download.status} ${download.statusText}`);
          }
          const bytes = await download.arrayBuffer();

          // Prefer the extension in the URL path; fall back to the server's
          // Content-Type so extension-less URLs still get a usable filename.
          const headerType = download.headers.get('content-type');
          const extension =
            extensionFromUrl(args.url) ??
            (headerType ? extensionForMimeType(headerType) : undefined) ??
            'bin';
          filename = `audio.${extension}`;

          audioBlob = new Blob([bytes], { type: headerType ?? 'audio/wav' });
        } else {
          // Reached when validation passed on a non-string falsy value (e.g. null).
          throw new Error('Either data or url must be provided');
        }

        const endpoint = audioConfig.endpoint;
        const model = audioConfig.model ?? 'whisper-1';

        if (endpoint === PROVIDER_ENDPOINTS.ollama) {
          // The Ollama request carries the original base64 string or URL, not
          // the downloaded bytes. `||` (not `??`) so an empty `data` string
          // falls through to the URL.
          const ollamaResponse = await fetch(endpoint, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: model,
              audio: args.data || args.url,
              stream: false,
            }),
          });

          if (!ollamaResponse.ok) {
            throw new Error(`Ollama transcription failed: ${ollamaResponse.status}`);
          }

          const ollamaJson = await ollamaResponse.json() as { response?: string };
          return {
            success: true,
            output: ollamaJson.response ?? 'No response from Ollama',
          };
        }

        const formData = new FormData();
        formData.append('file', audioBlob, filename);
        formData.append('model', model);

        if (args.language) {
          formData.append('language', args.language);
        }

        if (args.prompt) {
          formData.append('prompt', args.prompt);
        }

        // Content-Type is left unset so fetch can add the multipart boundary.
        const fetchOptions: RequestInit = {
          method: 'POST',
          body: formData,
        };
        if (audioConfig.apiKey) {
          fetchOptions.headers = { Authorization: `Bearer ${audioConfig.apiKey}` };
        }

        const response = await fetch(endpoint, fetchOptions);

        if (!response.ok) {
          const errorText = await response.text();
          throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
        }

        // Validate the payload instead of trusting its shape, so a malformed
        // response is reported as a failure rather than a success with no text.
        const payload: unknown = await response.json();
        const text =
          typeof payload === 'object' && payload !== null
            ? (payload as { text?: unknown }).text
            : undefined;
        if (typeof text !== 'string') {
          throw new Error('Transcription response did not contain a text field');
        }

        return {
          success: true,
          output: text,
        };
      } catch (error) {
        return {
          success: false,
          output: '',
          error: error instanceof Error ? error.message : 'Unknown error occurred',
        };
      }
    },
  };
}