From a71aa5992db90eaf0b3274b0b63cbe98c15bb1e0 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Wed, 18 Feb 2026 10:22:28 -0800 Subject: [PATCH] feat: implement tier-a4 tts voice output replies --- README.md | 28 +++++++ config/default.yaml | 15 ++++ docs/plans/state.json | 22 +++++- src/config/index.ts | 2 +- src/config/schema.test.ts | 43 +++++++++++ src/config/schema.ts | 20 +++++ src/daemon/routing.test.ts | 150 +++++++++++++++++++++++++++++++++++++ src/daemon/routing.ts | 50 ++++++++++++- src/models/index.ts | 1 + src/models/tts.test.ts | 67 +++++++++++++++++ src/models/tts.ts | 88 ++++++++++++++++++++++ 11 files changed, 482 insertions(+), 4 deletions(-) create mode 100644 src/models/tts.test.ts create mode 100644 src/models/tts.ts diff --git a/README.md b/README.md index 20f4d7e..2745c29 100644 --- a/README.md +++ b/README.md @@ -419,6 +419,34 @@ docker run -d \ # docker compose up -d ``` +### Text-to-Speech (TTS) Reply Audio + +Flynn can attach synthesized voice replies (OpenAI-compatible `/v1/audio/speech`) alongside text responses. 
+ +```yaml +tts: + enabled: true + enabled_channels: [telegram, whatsapp, discord] # Empty = all channels + provider: + type: openai # openai | custom + endpoint: "https://api.openai.com/v1/audio/speech" + api_key: "${OPENAI_API_KEY}" # Optional Bearer token + model: "gpt-4o-mini-tts" + voice: "alloy" + format: "mp3" # mp3 | wav | opus +``` + +| Field | Required | Description | +|-------|----------|-------------| +| `tts.enabled` | no | Enable voice reply synthesis (default: `false`) | +| `tts.enabled_channels` | no | Channels allowed to receive voice replies (`[]` means all channels) | +| `tts.provider.type` | no | `openai` or `custom` (default: `openai`) | +| `tts.provider.endpoint` | no | OpenAI-compatible `/v1/audio/speech` endpoint (`openai` defaults to OpenAI API URL) | +| `tts.provider.api_key` | no | Bearer token for authentication | +| `tts.provider.model` | no | TTS model (default: `gpt-4o-mini-tts`) | +| `tts.provider.voice` | no | Voice identifier (default: `alloy`) | +| `tts.provider.format` | no | Output format: `mp3`, `wav`, `opus` (default: `mp3`) | + ### Capture Tools Flynn includes host capture tools: diff --git a/config/default.yaml b/config/default.yaml index 48fad23..41faa42 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -476,6 +476,21 @@ hooks: # timeout_ms: 120000 # allow_manual_toggle: true +# ── Text-to-Speech (TTS) Output ────────────────────────────────────── +# Optional voice output for assistant replies. Uses an OpenAI-compatible +# /v1/audio/speech endpoint and attaches audio to channel replies. 
+# +# tts: +# enabled: false +# enabled_channels: [telegram, whatsapp, discord] # Empty = all channels +# provider: +# type: openai # openai | custom +# endpoint: "https://api.openai.com/v1/audio/speech" +# api_key: "${OPENAI_API_KEY}" # Optional Bearer token +# model: "gpt-4o-mini-tts" +# voice: "alloy" +# format: "mp3" # mp3 | wav | opus + # ── Sub-Agent Configs ──────────────────────────────────────────────── # Named agent configurations for delegation via agent.delegate tool. # Each agent gets a focused system prompt, model tier, and tool profile. diff --git a/docs/plans/state.json b/docs/plans/state.json index 37858de..5d53c47 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -5225,6 +5225,26 @@ "docs/plans/state.json" ], "test_status": "Docs-only change (no code paths affected)" + }, + "tts-voice-output-tier-a4": { + "status": "completed", + "date": "2026-02-18", + "updated": "2026-02-18", + "summary": "Implemented Tier A4 voice output: added configurable OpenAI-compatible TTS (`tts.*`) and integrated channel-gated synthesized reply attachments in daemon routing for both native and external backend responses, with unit/integration tests and docs updates.", + "files_modified": [ + "src/models/tts.ts", + "src/models/tts.test.ts", + "src/models/index.ts", + "src/config/schema.ts", + "src/config/index.ts", + "src/config/schema.test.ts", + "src/daemon/routing.ts", + "src/daemon/routing.test.ts", + "README.md", + "config/default.yaml", + "docs/plans/state.json" + ], + "test_status": "pnpm test:run src/models/tts.test.ts src/config/schema.test.ts src/daemon/routing.test.ts + pnpm typecheck passing" } }, "overall_progress": { @@ -5248,7 +5268,7 @@ "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram", "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback", "remaining_phases_completion": "Phase 1: 3/3 (100%) — context 
levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening", - "next_up": "Implement Tier A4 from the OpenClaw roadmap: TTS voice output with channel-aware audio responses" + "next_up": "Implement the reactions/event-trigger automation layer from the OpenClaw roadmap (event pattern matching -> agent action execution)" }, "soul_md_and_cron_create": { "date": "2026-02-11", diff --git a/src/config/index.ts b/src/config/index.ts index ca2c994..d369e73 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -1,3 +1,3 @@ export { loadConfig, deepMerge } from './loader.js'; export { persistConfig } from './persistence.js'; -export { configSchema, MODEL_PROVIDERS, type ModelProvider, type Config, type TelegramConfig, type ModelConfig, type CronJobConfig, type AgentsConfig, type CompactionConfig, type ToolProfile, type ToolOverrideConfig, type ToolsConfig, type SandboxConfig, type AgentConfigEntry, type RoutingConfig, type ServerConfig, type BackupConfig, type K8sConfig } from './schema.js'; +export { configSchema, MODEL_PROVIDERS, type ModelProvider, type Config, type TelegramConfig, type ModelConfig, type CronJobConfig, type AgentsConfig, type CompactionConfig, type ToolProfile, type ToolOverrideConfig, type ToolsConfig, type SandboxConfig, type AgentConfigEntry, type RoutingConfig, type ServerConfig, type BackupConfig, type K8sConfig, type TtsConfig } from './schema.js'; diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts index 67f5ddd..8cf89e7 100644 --- a/src/config/schema.test.ts +++ b/src/config/schema.test.ts @@ -660,6 +660,49 @@ describe('configSchema — audio talk mode', () => { }); }); +describe('configSchema — tts', () => { + const minimalConfig = { + telegram: { bot_token: 'test', allowed_chat_ids: [1] }, + models: { default: { provider: 'anthropic', model: 'claude-3' } }, + }; + + it('defaults tts 
fields', () => { + const result = configSchema.parse(minimalConfig); + expect(result.tts.enabled).toBe(false); + expect(result.tts.enabled_channels).toEqual([]); + expect(result.tts.provider).toBeUndefined(); + }); + + it('accepts custom tts provider settings', () => { + const result = configSchema.parse({ + ...minimalConfig, + tts: { + enabled: true, + enabled_channels: ['telegram', 'discord'], + provider: { + type: 'custom', + endpoint: 'https://example.com/v1/audio/speech', + api_key: 'sk-test', + model: 'gpt-4o-mini-tts', + voice: 'nova', + format: 'wav', + }, + }, + }); + + expect(result.tts.enabled).toBe(true); + expect(result.tts.enabled_channels).toEqual(['telegram', 'discord']); + expect(result.tts.provider).toMatchObject({ + type: 'custom', + endpoint: 'https://example.com/v1/audio/speech', + api_key: 'sk-test', + model: 'gpt-4o-mini-tts', + voice: 'nova', + format: 'wav', + }); + }); +}); + describe('configSchema — mattermost', () => { const minimalConfig = { telegram: { bot_token: 'test', allowed_chat_ids: [1] }, diff --git a/src/config/schema.ts b/src/config/schema.ts index daee92c..b4a6346 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -730,6 +730,24 @@ const audioSchema = z.object({ talk_mode: talkModeSchema, }).default({}); +const ttsOutputFormatSchema = z.enum(['mp3', 'wav', 'opus']); + +const ttsProviderSchema = z.object({ + type: z.enum(['openai', 'custom']).default('openai'), + endpoint: z.string().optional(), + api_key: z.string().optional(), + model: z.string().default('gpt-4o-mini-tts'), + voice: z.string().default('alloy'), + format: ttsOutputFormatSchema.default('mp3'), +}); + +const ttsSchema = z.object({ + enabled: z.boolean().default(false), + /** Restrict voice replies to selected channels. Empty means all channels. 
*/ + enabled_channels: z.array(z.string().min(1)).default([]), + provider: ttsProviderSchema.optional(), +}).default({}); + // ── Tool policy schemas ────────────────────────────────────────────── const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']); @@ -912,6 +930,7 @@ export const configSchema = z.object({ retry: retrySchema, web_search: webSearchSchema, audio: audioSchema, + tts: ttsSchema, prompt: promptSchema, tools: toolsSchema, sandbox: sandboxSchema, @@ -936,6 +955,7 @@ export type CompactionConfig = z.infer; export type MemoryConfig = z.infer; export type WebSearchConfig = z.infer; export type AudioConfig = z.infer; +export type TtsConfig = z.infer; export type ProcessConfig = z.infer; export type BrowserConfig = z.infer; export type K8sConfig = z.infer; diff --git a/src/daemon/routing.test.ts b/src/daemon/routing.test.ts index a2d808f..3d8922e 100644 --- a/src/daemon/routing.test.ts +++ b/src/daemon/routing.test.ts @@ -1122,6 +1122,156 @@ describe('daemon audio routing integration', () => { }); }); +describe('daemon tts routing integration', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('attaches synthesized audio reply when tts is enabled for the channel', async () => { + const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('voice-enabled response'); + const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', + arrayBuffer: async () => Uint8Array.from([7, 8, 9]).buffer, + } as Response); + + const session = { + id: 'telegram:tts-user-1', + addMessage: vi.fn(), + getHistory: vi.fn(() => []), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn(() => undefined), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const router = createMessageRouter({ + sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'], + modelRouter: { + getAvailableTiers: () => ['default'], + 
getAllLabels: () => ({ default: 'default' }), + getLabel: (tier: string) => tier, + } as unknown as MessageRouterDeps['modelRouter'], + systemPrompt: 'test prompt', + toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'], + toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'], + config: { + agents: { + primary_tier: 'default', + delegation: { + compaction: 'default', + memory_extraction: 'default', + classification: 'default', + tool_summarisation: 'default', + complex_reasoning: 'default', + }, + max_delegation_depth: 1, + max_iterations: 3, + }, + compaction: { enabled: false }, + models: { default: { provider: 'anthropic', model: 'claude' } }, + tts: { + enabled: true, + enabled_channels: ['telegram'], + provider: { + type: 'custom', + endpoint: 'https://example.com/v1/audio/speech', + api_key: 'sk-test', + model: 'gpt-4o-mini-tts', + voice: 'alloy', + format: 'mp3', + }, + }, + } as unknown as MessageRouterDeps['config'], + }); + + const reply = vi.fn(async (_message: OutboundMessage) => {}); + await router.handler({ + id: 'tts-1', + channel: 'telegram', + senderId: 'tts-user-1', + text: 'say hello', + timestamp: Date.now(), + } as MessageRouterInput, reply); + + expect(processSpy).toHaveBeenCalledTimes(1); + expect(fetchSpy).toHaveBeenCalledTimes(1); + const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined; + expect(outbound?.attachments).toBeDefined(); + expect(outbound?.attachments?.[0]).toMatchObject({ + mimeType: 'audio/mpeg', + data: 'BwgJ', + }); + }); + + it('does not synthesize tts when channel is not enabled', async () => { + vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('text-only response'); + const fetchSpy = vi.spyOn(globalThis, 'fetch'); + + const session = { + id: 'discord:tts-user-2', + addMessage: vi.fn(), + getHistory: vi.fn(() => []), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn(() => undefined), + setConfig: vi.fn(), 
+ deleteConfig: vi.fn(), + }; + + const router = createMessageRouter({ + sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'], + modelRouter: { + getAvailableTiers: () => ['default'], + getAllLabels: () => ({ default: 'default' }), + getLabel: (tier: string) => tier, + } as unknown as MessageRouterDeps['modelRouter'], + systemPrompt: 'test prompt', + toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'], + toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'], + config: { + agents: { + primary_tier: 'default', + delegation: { + compaction: 'default', + memory_extraction: 'default', + classification: 'default', + tool_summarisation: 'default', + complex_reasoning: 'default', + }, + max_delegation_depth: 1, + max_iterations: 3, + }, + compaction: { enabled: false }, + models: { default: { provider: 'anthropic', model: 'claude' } }, + tts: { + enabled: true, + enabled_channels: ['telegram'], + provider: { + type: 'custom', + endpoint: 'https://example.com/v1/audio/speech', + }, + }, + } as unknown as MessageRouterDeps['config'], + }); + + const reply = vi.fn(async (_message: OutboundMessage) => {}); + await router.handler({ + id: 'tts-2', + channel: 'discord', + senderId: 'tts-user-2', + text: 'respond as text', + timestamp: Date.now(), + } as MessageRouterInput, reply); + + expect(fetchSpy).not.toHaveBeenCalled(); + const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined; + expect(outbound?.attachments).toBeUndefined(); + }); +}); + describe('daemon auto-escalate integration', () => { afterEach(() => { vi.restoreAllMocks(); diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index d01c0f4..ad00ff0 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -1,6 +1,7 @@ import type { AudioTranscriptionConfig } from '../models/media.js'; import type { Attachment } from '../channels/types.js'; import { isSupportedAudio, 
transcribeAudio } from '../models/media.js'; +import { synthesizeSpeechAttachment } from '../models/tts.js'; import { supportsAudioInput } from '../models/capabilities.js'; import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js'; import { OutboundAttachmentCollector } from '../backends/native/attachments.js'; @@ -84,6 +85,17 @@ function parseResearchPrefix(text: string): string | undefined { } return undefined; } + +function isTtsEnabledForChannel(config: Config, channel: string): boolean { + if (!config.tts?.enabled) { + return false; + } + const enabledChannels = config.tts.enabled_channels ?? []; + if (enabledChannels.length === 0) { + return true; + } + return enabledChannels.includes(channel); +} /** * Create the unified message handler for the channel registry. * Each channel+sender pair gets its own AgentOrchestrator backed by a persistent session. @@ -116,6 +128,31 @@ export function createMessageRouter(deps: { const agents = new Map(); const talkModeUntil = new Map(); + async function maybeBuildTtsAttachment(responseText: string, channel: string) { + if (!isTtsEnabledForChannel(deps.config, channel)) { + return undefined; + } + + const provider = deps.config.tts?.provider; + const endpoint = provider?.endpoint ?? (provider?.type === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined); + if (!endpoint) { + return undefined; + } + + try { + return await synthesizeSpeechAttachment(responseText, { + endpoint, + apiKey: provider?.api_key, + model: provider?.model, + voice: provider?.voice, + format: provider?.format, + }); + } catch (error) { + console.warn(`TTS synthesis failed for channel ${channel}:`, error instanceof Error ? 
error.message : 'Unknown error'); + return undefined; + } + } + function getOrCreateAgent(channel: string, senderId: string, metadata?: Record, agentOverride?: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } { // Resolve agent config name via routing (sender → channel → default fallback) const agentConfigName = agentOverride ?? deps.agentRouter?.resolve(channel, senderId); @@ -998,7 +1035,12 @@ export function createMessageRouter(deps: { history, }); session.addMessage({ role: 'assistant', content: response }); - await reply({ text: response, replyTo: msg.id }); + const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel); + await reply({ + text: response, + replyTo: msg.id, + attachments: ttsAttachment ? [ttsAttachment] : undefined, + }); return; } catch (error) { const detail = error instanceof Error ? error.message : String(error); @@ -1031,10 +1073,14 @@ export function createMessageRouter(deps: { response = await agent.process(messageText, attachments); } const outboundAttachments = collector.drain(); + const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel); + const mergedAttachments = ttsAttachment + ? [...outboundAttachments, ttsAttachment] + : outboundAttachments; await reply({ text: response, replyTo: msg.id, - attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined, + attachments: mergedAttachments.length > 0 ? 
mergedAttachments : undefined, }); } catch (error) { console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error); diff --git a/src/models/index.ts b/src/models/index.ts index a1a9c62..3190079 100644 --- a/src/models/index.ts +++ b/src/models/index.ts @@ -10,6 +10,7 @@ export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js'; export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js'; export { supportsAudioInput } from './capabilities.js'; +export { synthesizeSpeechAttachment, type TtsSynthesisConfig, type TtsOutputFormat } from './tts.js'; export { isSupportedImage, isSupportedAudio, diff --git a/src/models/tts.test.ts b/src/models/tts.test.ts new file mode 100644 index 0000000..9d5b557 --- /dev/null +++ b/src/models/tts.test.ts @@ -0,0 +1,67 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; + +import { synthesizeSpeechAttachment } from './tts.js'; + +describe('synthesizeSpeechAttachment', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('returns null when text is empty', async () => { + const result = await synthesizeSpeechAttachment(' ', { + endpoint: 'https://example.com/v1/audio/speech', + }); + expect(result).toBeNull(); + }); + + it('returns null when endpoint is missing', async () => { + const result = await synthesizeSpeechAttachment('hello', {}); + expect(result).toBeNull(); + }); + + it('returns an outbound audio attachment on success', async () => { + const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', + arrayBuffer: async () => Uint8Array.from([1, 2, 3, 4]).buffer, + } as Response); + + const result = await synthesizeSpeechAttachment('Hello from Flynn', { + endpoint: 'https://example.com/v1/audio/speech', + apiKey: 'sk-test', + model: 'gpt-4o-mini-tts', + voice: 'alloy', + format: 'mp3', + }); + + 
expect(fetchSpy).toHaveBeenCalledWith( + 'https://example.com/v1/audio/speech', + expect.objectContaining({ + method: 'POST', + headers: expect.objectContaining({ + 'Content-Type': 'application/json', + Authorization: 'Bearer sk-test', + }), + }), + ); + expect(result).toMatchObject({ + mimeType: 'audio/mpeg', + data: 'AQIDBA==', + }); + expect(result?.filename).toMatch(/^flynn-reply-\d+\.mp3$/); + }); + + it('throws when the tts endpoint returns an error', async () => { + vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: false, + status: 429, + statusText: 'Too Many Requests', + text: async () => 'rate limit', + } as Response); + + await expect(synthesizeSpeechAttachment('Hello', { + endpoint: 'https://example.com/v1/audio/speech', + })).rejects.toThrow(/TTS request failed: 429 Too Many Requests/i); + }); +}); diff --git a/src/models/tts.ts b/src/models/tts.ts new file mode 100644 index 0000000..8d858e1 --- /dev/null +++ b/src/models/tts.ts @@ -0,0 +1,88 @@ +import type { OutboundAttachment } from '../channels/types.js'; + +export type TtsOutputFormat = 'mp3' | 'wav' | 'opus'; + +export interface TtsSynthesisConfig { + endpoint?: string; + apiKey?: string; + model?: string; + voice?: string; + format?: TtsOutputFormat; +} + +function outputFormatToMimeType(format: TtsOutputFormat): string { + switch (format) { + case 'wav': + return 'audio/wav'; + case 'opus': + return 'audio/ogg'; + case 'mp3': + default: + return 'audio/mpeg'; + } +} + +function outputFormatToExtension(format: TtsOutputFormat): string { + switch (format) { + case 'wav': + return 'wav'; + case 'opus': + return 'ogg'; + case 'mp3': + default: + return 'mp3'; + } +} + +/** Synthesize speech via an OpenAI-compatible /v1/audio/speech endpoint. 
/**
 * Synthesize speech via an OpenAI-compatible /v1/audio/speech endpoint.
 *
 * Sends a JSON POST of `{ model, voice, input, response_format }` and wraps the
 * returned audio bytes in a base64-encoded outbound attachment named
 * `flynn-reply-<epoch-ms>.<ext>`.
 *
 * @param text - Reply text to speak. Surrounding whitespace is trimmed first.
 * @param config - Endpoint plus optional bearer token, model, voice and format.
 *   Missing values fall back to `gpt-4o-mini-tts`, `alloy` and `mp3`.
 * @param timeoutMs - Upper bound, in milliseconds, for the request and the body
 *   download together (default 30000). A non-positive or non-finite value
 *   disables the limit.
 * @returns The audio attachment, or `null` when the trimmed text is empty or no
 *   endpoint is configured.
 * @throws Error when the endpoint answers with a non-2xx status, when the
 *   request exceeds `timeoutMs`, or when the response carries no audio bytes.
 */
export async function synthesizeSpeechAttachment(
  text: string,
  config: TtsSynthesisConfig,
  timeoutMs = 30000,
): Promise<OutboundAttachment | null> {
  const input = text.trim();
  if (!input || !config.endpoint) {
    return null;
  }

  const format = config.format ?? 'mp3';
  const model = config.model ?? 'gpt-4o-mini-tts';
  const voice = config.voice ?? 'alloy';

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (config.apiKey) {
    headers.Authorization = `Bearer ${config.apiKey}`;
  }

  // Without a deadline a stalled endpoint would keep the caller waiting for as
  // long as the connection stays open.
  const controller = new AbortController();
  const timer =
    Number.isFinite(timeoutMs) && timeoutMs > 0
      ? setTimeout(() => controller.abort(), timeoutMs)
      : undefined;

  let audioBytes: ArrayBuffer;
  try {
    const response = await fetch(config.endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({
        model,
        voice,
        input,
        response_format: format,
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      // Best effort: the error body is only used to enrich the message.
      const detail = await response.text().catch(() => '');
      throw new Error(
        `TTS request failed: ${response.status} ${response.statusText}${detail ? ` - ${detail.slice(0, 200)}` : ''}`,
      );
    }

    // Read the body inside the guarded section so a stalled download is also
    // covered by the deadline.
    audioBytes = await response.arrayBuffer();
  } catch (error) {
    if (controller.signal.aborted) {
      throw new Error(`TTS request timed out after ${timeoutMs}ms`);
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }

  // A successful status with an empty payload is not playable audio.
  if (audioBytes.byteLength === 0) {
    throw new Error('TTS response contained no audio data');
  }

  return {
    mimeType: outputFormatToMimeType(format),
    data: Buffer.from(audioBytes).toString('base64'),
    filename: `flynn-reply-${Date.now()}.${outputFormatToExtension(format)}`,
  };
}