feat: implement tier-a4 tts voice output replies
This commit is contained in:
@@ -10,6 +10,7 @@ export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js
|
||||
export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
|
||||
export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
|
||||
export { supportsAudioInput } from './capabilities.js';
|
||||
export { synthesizeSpeechAttachment, type TtsSynthesisConfig, type TtsOutputFormat } from './tts.js';
|
||||
export {
|
||||
isSupportedImage,
|
||||
isSupportedAudio,
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { synthesizeSpeechAttachment } from './tts.js';
|
||||
|
||||
// Behavioural spec for synthesizeSpeechAttachment: guard clauses, the happy
// path against a stubbed fetch, and error propagation on a non-OK response.
describe('synthesizeSpeechAttachment', () => {
  // Shared endpoint used by every case that needs a configured TTS target.
  const speechEndpoint = 'https://example.com/v1/audio/speech';

  // Undo any fetch spy installed by a test so cases stay independent.
  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('returns null when text is empty', async () => {
    // Whitespace-only input trims down to nothing, so no attachment is produced.
    await expect(
      synthesizeSpeechAttachment(' ', { endpoint: speechEndpoint }),
    ).resolves.toBeNull();
  });

  it('returns null when endpoint is missing', async () => {
    await expect(synthesizeSpeechAttachment('hello', {})).resolves.toBeNull();
  });

  it('returns an outbound audio attachment on success', async () => {
    // Four raw bytes (1, 2, 3, 4) encode to the base64 string 'AQIDBA=='.
    const audioBytes = Uint8Array.from([1, 2, 3, 4]);
    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
      ok: true,
      status: 200,
      statusText: 'OK',
      arrayBuffer: async () => audioBytes.buffer,
    } as Response);

    const attachment = await synthesizeSpeechAttachment('Hello from Flynn', {
      endpoint: speechEndpoint,
      apiKey: 'sk-test',
      model: 'gpt-4o-mini-tts',
      voice: 'alloy',
      format: 'mp3',
    });

    // The request must be a JSON POST carrying the bearer token.
    const expectedHeaders = expect.objectContaining({
      'Content-Type': 'application/json',
      Authorization: 'Bearer sk-test',
    });
    expect(fetchSpy).toHaveBeenCalledWith(
      speechEndpoint,
      expect.objectContaining({ method: 'POST', headers: expectedHeaders }),
    );

    expect(attachment).toMatchObject({
      mimeType: 'audio/mpeg',
      data: 'AQIDBA==',
    });
    // The filename embeds a timestamp, so only its shape is asserted.
    expect(attachment?.filename).toMatch(/^flynn-reply-\d+\.mp3$/);
  });

  it('throws when the tts endpoint returns an error', async () => {
    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
      ok: false,
      status: 429,
      statusText: 'Too Many Requests',
      text: async () => 'rate limit',
    } as Response);

    const pending = synthesizeSpeechAttachment('Hello', { endpoint: speechEndpoint });

    await expect(pending).rejects.toThrow(/TTS request failed: 429 Too Many Requests/i);
  });
});
|
||||
@@ -0,0 +1,88 @@
|
||||
import type { OutboundAttachment } from '../channels/types.js';
|
||||
|
||||
/** Audio formats that can be requested from the TTS endpoint. */
export type TtsOutputFormat =
  | 'mp3'
  | 'wav'
  | 'opus';
|
||||
|
||||
/**
 * Options accepted by `synthesizeSpeechAttachment`.
 *
 * Every field is optional; the synthesizer substitutes defaults for the
 * model, voice and format, and produces no audio when `endpoint` is absent.
 */
export interface TtsSynthesisConfig {
  /** URL the synthesis request is POSTed to. Without it, synthesis yields `null`. */
  endpoint?: string;

  /** When set, sent as an `Authorization: Bearer <apiKey>` header. */
  apiKey?: string;

  /** Model identifier placed in the request body; defaults to `gpt-4o-mini-tts`. */
  model?: string;

  /** Voice name placed in the request body; defaults to `alloy`. */
  voice?: string;

  /** Requested audio format; defaults to `mp3`. */
  format?: TtsOutputFormat;
}
|
||||
|
||||
/**
 * Resolve the MIME type advertised for audio in the given output format.
 *
 * @param format - Requested TTS output format.
 * @returns `audio/wav` for `wav`, `audio/ogg` for `opus`, and `audio/mpeg`
 *   for `mp3` or any value not recognised at runtime.
 */
function outputFormatToMimeType(format: TtsOutputFormat): string {
  if (format === 'wav') {
    return 'audio/wav';
  }
  if (format === 'opus') {
    return 'audio/ogg';
  }
  // 'mp3' and anything unexpected share the MPEG fallback.
  return 'audio/mpeg';
}
|
||||
|
||||
/**
 * Resolve the filename extension (without a leading dot) for the given
 * output format.
 *
 * @param format - Requested TTS output format.
 * @returns `wav` for `wav`, `ogg` for `opus`, and `mp3` for `mp3` or any
 *   value not recognised at runtime.
 */
function outputFormatToExtension(format: TtsOutputFormat): string {
  // Only two formats deviate from the mp3 default, so a chained conditional suffices.
  return format === 'wav' ? 'wav' : format === 'opus' ? 'ogg' : 'mp3';
}
|
||||
|
||||
/**
 * Convert reply text into an audio attachment by POSTing it to an
 * OpenAI-compatible `/v1/audio/speech` endpoint.
 *
 * @param text - Text to speak; surrounding whitespace is trimmed before sending.
 * @param config - Endpoint, credentials and voice settings. Unset values
 *   default to model `gpt-4o-mini-tts`, voice `alloy` and format `mp3`.
 * @returns The audio as a base64-encoded attachment with a timestamped
 *   filename, or `null` when the text is blank or no endpoint is configured.
 * @throws Error when the endpoint answers with a non-OK status. The message
 *   carries the status line plus up to 200 characters of the response body.
 */
export async function synthesizeSpeechAttachment(
  text: string,
  config: TtsSynthesisConfig,
): Promise<OutboundAttachment | null> {
  // Nothing to say: skip the network round trip entirely.
  const input = text.trim();
  if (!input) {
    return null;
  }

  // TTS is treated as disabled when no endpoint is configured.
  const url = config.endpoint;
  if (!url) {
    return null;
  }

  const responseFormat = config.format ?? 'mp3';
  const payload = JSON.stringify({
    model: config.model ?? 'gpt-4o-mini-tts',
    voice: config.voice ?? 'alloy',
    input,
    response_format: responseFormat,
  });

  // The Authorization header is only attached when a key is configured.
  const requestHeaders: Record<string, string> = config.apiKey
    ? { 'Content-Type': 'application/json', Authorization: `Bearer ${config.apiKey}` }
    : { 'Content-Type': 'application/json' };

  const reply = await fetch(url, {
    method: 'POST',
    headers: requestHeaders,
    body: payload,
  });

  if (!reply.ok) {
    // Reading the error body is best-effort; a failed read must not mask the status.
    const detail = await reply.text().catch(() => '');
    const suffix = detail ? ` - ${detail.slice(0, 200)}` : '';
    throw new Error(`TTS request failed: ${reply.status} ${reply.statusText}${suffix}`);
  }

  const encodedAudio = Buffer.from(await reply.arrayBuffer()).toString('base64');

  return {
    mimeType: outputFormatToMimeType(responseFormat),
    data: encodedAudio,
    filename: `flynn-reply-${Date.now()}.${outputFormatToExtension(responseFormat)}`,
  };
}
|
||||
Reference in New Issue
Block a user