feat: implement tier-a4 tts voice output replies

2026-02-18 10:22:28 -08:00
parent 3eb07875f1
commit a71aa5992d
11 changed files with 482 additions and 4 deletions
@@ -10,6 +10,7 @@ export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
 export { supportsAudioInput } from './capabilities.js';
+export { synthesizeSpeechAttachment, type TtsSynthesisConfig, type TtsOutputFormat } from './tts.js';
 export {
  isSupportedImage,
  isSupportedAudio,
@@ -0,0 +1,67 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { synthesizeSpeechAttachment } from './tts.js';
+
+// Spec for synthesizeSpeechAttachment. Tests that reach the request path stub
+// globalThis.fetch so no real HTTP call is made.
+describe('synthesizeSpeechAttachment', () => {
+  const speechEndpoint = 'https://example.com/v1/audio/speech';
+
+  // Drop any spy installed by the previous test.
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('returns null when text is empty', async () => {
+    const attachment = await synthesizeSpeechAttachment('   ', { endpoint: speechEndpoint });
+
+    expect(attachment).toBeNull();
+  });
+
+  it('returns null when endpoint is missing', async () => {
+    const attachment = await synthesizeSpeechAttachment('hello', {});
+
+    expect(attachment).toBeNull();
+  });
+
+  it('returns an outbound audio attachment on success', async () => {
+    // Four raw bytes; their base64 form is asserted below.
+    const okResponse = {
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      arrayBuffer: async () => Uint8Array.from([1, 2, 3, 4]).buffer,
+    } as Response;
+    const fetchMock = vi.spyOn(globalThis, 'fetch').mockResolvedValue(okResponse);
+
+    const attachment = await synthesizeSpeechAttachment('Hello from Flynn', {
+      endpoint: speechEndpoint,
+      apiKey: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'alloy',
+      format: 'mp3',
+    });
+
+    const expectedHeaders = expect.objectContaining({
+      'Content-Type': 'application/json',
+      Authorization: 'Bearer sk-test',
+    });
+    expect(fetchMock).toHaveBeenCalledWith(
+      speechEndpoint,
+      expect.objectContaining({ method: 'POST', headers: expectedHeaders }),
+    );
+    expect(attachment).toMatchObject({ mimeType: 'audio/mpeg', data: 'AQIDBA==' });
+    expect(attachment?.filename).toMatch(/^flynn-reply-\d+\.mp3$/);
+  });
+
+  it('throws when the tts endpoint returns an error', async () => {
+    const rateLimited = {
+      ok: false,
+      status: 429,
+      statusText: 'Too Many Requests',
+      text: async () => 'rate limit',
+    } as Response;
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue(rateLimited);
+
+    const pending = synthesizeSpeechAttachment('Hello', { endpoint: speechEndpoint });
+
+    await expect(pending).rejects.toThrow(/TTS request failed: 429 Too Many Requests/i);
+  });
+});
@@ -0,0 +1,88 @@
+import type { OutboundAttachment } from '../channels/types.js';
+
+/** Audio encodings that can be requested from the TTS endpoint. */
+export type TtsOutputFormat = 'mp3' | 'wav' | 'opus';
+
+/** Options consumed by synthesizeSpeechAttachment; every field is optional. */
+export interface TtsSynthesisConfig {
+  /** URL the synthesis request is POSTed to. When unset, synthesis is skipped and null is returned. */
+  endpoint?: string;
+  /** When set, sent as an `Authorization: Bearer <apiKey>` header. */
+  apiKey?: string;
+  /** Model name sent in the request body; falls back to 'gpt-4o-mini-tts'. */
+  model?: string;
+  /** Voice name sent in the request body; falls back to 'alloy'. */
+  voice?: string;
+  /** Sent as `response_format` in the request body; falls back to 'mp3'. */
+  format?: TtsOutputFormat;
+}
+
+/**
+ * Maps a TTS output format to the MIME type used on the attachment.
+ *
+ * @param format - Requested audio format.
+ * @returns 'audio/wav' for wav, 'audio/ogg' for opus, otherwise 'audio/mpeg'.
+ */
+function outputFormatToMimeType(format: TtsOutputFormat): string {
+  if (format === 'wav') {
+    return 'audio/wav';
+  }
+  if (format === 'opus') {
+    return 'audio/ogg';
+  }
+  // 'mp3' and any unexpected runtime value share the MPEG fallback.
+  return 'audio/mpeg';
+}
+
+/**
+ * Maps a TTS output format to the file extension (without the dot) used in
+ * the attachment filename.
+ *
+ * @param format - Requested audio format.
+ * @returns 'wav' for wav, 'ogg' for opus, otherwise 'mp3'.
+ */
+function outputFormatToExtension(format: TtsOutputFormat): string {
+  if (format === 'wav') {
+    return 'wav';
+  }
+  // 'mp3' and any unexpected runtime value fall through to 'mp3'.
+  return format === 'opus' ? 'ogg' : 'mp3';
+}
+
+/**
+ * Synthesize speech via an OpenAI-compatible /v1/audio/speech endpoint and
+ * wrap the returned audio as an outbound attachment.
+ *
+ * @param text - Text to speak; surrounding whitespace is trimmed before sending.
+ * @param config - Endpoint, credentials and voice settings. Missing model,
+ *   voice and format fall back to 'gpt-4o-mini-tts', 'alloy' and 'mp3'.
+ * @returns The base64-encoded audio attachment, or null when the trimmed text
+ *   is empty or no endpoint is configured (no request is made in either case).
+ * @throws Error when the endpoint responds with a non-OK status; the message
+ *   carries the status line plus up to 200 characters of the response body.
+ */
+export async function synthesizeSpeechAttachment(
+  text: string,
+  config: TtsSynthesisConfig,
+): Promise<OutboundAttachment | null> {
+  const input = text.trim();
+  // Nothing to say, or nowhere to send it: skip synthesis.
+  if (input.length === 0 || !config.endpoint) {
+    return null;
+  }
+
+  const responseFormat = config.format ?? 'mp3';
+  const payload = JSON.stringify({
+    model: config.model ?? 'gpt-4o-mini-tts',
+    voice: config.voice ?? 'alloy',
+    input,
+    response_format: responseFormat,
+  });
+
+  // The Authorization header is only attached when an API key is configured.
+  const requestHeaders: Record<string, string> = config.apiKey
+    ? { 'Content-Type': 'application/json', Authorization: `Bearer ${config.apiKey}` }
+    : { 'Content-Type': 'application/json' };
+
+  const response = await fetch(config.endpoint, {
+    method: 'POST',
+    headers: requestHeaders,
+    body: payload,
+  });
+
+  if (!response.ok) {
+    // Reading the error body is best-effort; a failed read leaves the detail empty.
+    const detail = await response.text().catch(() => '');
+    const suffix = detail ? ` - ${detail.slice(0, 200)}` : '';
+    throw new Error(`TTS request failed: ${response.status} ${response.statusText}${suffix}`);
+  }
+
+  const base64Audio = Buffer.from(await response.arrayBuffer()).toString('base64');
+
+  return {
+    mimeType: outputFormatToMimeType(responseFormat),
+    data: base64Audio,
+    filename: `flynn-reply-${Date.now()}.${outputFormatToExtension(responseFormat)}`,
+  };
+}