feat: implement tier-a4 tts voice output replies

2026-02-18 10:22:28 -08:00
parent 3eb07875f1
commit a71aa5992d
11 changed files with 482 additions and 4 deletions
@@ -419,6 +419,34 @@ docker run -d \
 # docker compose up -d
 ```

+### Text-to-Speech (TTS) Reply Audio
+
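+Flynn can attach a synthesized voice version of a reply alongside the text response. Audio is generated through an OpenAI-compatible `/v1/audio/speech` endpoint; if synthesis fails, the text reply is still sent without audio.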
+
+```yaml
+tts:
+  enabled: true
+  enabled_channels: [telegram, whatsapp, discord]  # Empty = all channels
+  provider:
+    type: openai                                    # openai | custom
+    endpoint: "https://api.openai.com/v1/audio/speech"
+    api_key: "${OPENAI_API_KEY}"                   # Optional Bearer token
+    model: "gpt-4o-mini-tts"
+    voice: "alloy"
+    format: "mp3"                                  # mp3 | wav | opus
+```
+
+| Field | Required | Description |
+|-------|----------|-------------|
+| `tts.enabled` | no | Enable voice reply synthesis (default: `false`) |
+| `tts.enabled_channels` | no | Channels allowed to receive voice replies (`[]` means all channels) |
+| `tts.provider.type` | no | `openai` or `custom` (default: `openai`) |
+| `tts.provider.endpoint` | no | OpenAI-compatible `/v1/audio/speech` endpoint. Defaults to the OpenAI API URL when `type` is `openai`; must be set when `type` is `custom`, otherwise no audio is synthesized |
+| `tts.provider.api_key` | no | Bearer token for authentication |
+| `tts.provider.model` | no | TTS model (default: `gpt-4o-mini-tts`) |
+| `tts.provider.voice` | no | Voice identifier (default: `alloy`) |
+| `tts.provider.format` | no | Output format: `mp3`, `wav`, `opus` (default: `mp3`) |
+
 ### Capture Tools

 Flynn includes host capture tools:
@@ -476,6 +476,21 @@ hooks:
 #     timeout_ms: 120000
 #     allow_manual_toggle: true

+# ── Text-to-Speech (TTS) Output ──────────────────────────────────────
+# Optional voice output for assistant replies. Uses an OpenAI-compatible
+# /v1/audio/speech endpoint and attaches audio to channel replies.
+#
+# tts:
+#   enabled: false
+#   enabled_channels: [telegram, whatsapp, discord] # Empty = all channels
+#   provider:
+#     type: openai                                  # openai | custom
+#     endpoint: "https://api.openai.com/v1/audio/speech"
+#     api_key: "${OPENAI_API_KEY}"                  # Optional Bearer token
+#     model: "gpt-4o-mini-tts"
+#     voice: "alloy"
+#     format: "mp3"                                 # mp3 | wav | opus
+
 # ── Sub-Agent Configs ────────────────────────────────────────────────
 # Named agent configurations for delegation via agent.delegate tool.
 # Each agent gets a focused system prompt, model tier, and tool profile.
@@ -5225,6 +5225,26 @@
        "docs/plans/state.json"
      ],
      "test_status": "Docs-only change (no code paths affected)"
+    },
+    "tts-voice-output-tier-a4": {
+      "status": "completed",
+      "date": "2026-02-18",
+      "updated": "2026-02-18",
+      "summary": "Implemented Tier A4 voice output: added configurable OpenAI-compatible TTS (`tts.*`) and integrated channel-gated synthesized reply attachments in daemon routing for both native and external backend responses, with unit/integration tests and docs updates.",
+      "files_modified": [
+        "src/models/tts.ts",
+        "src/models/tts.test.ts",
+        "src/models/index.ts",
+        "src/config/schema.ts",
+        "src/config/index.ts",
+        "src/config/schema.test.ts",
+        "src/daemon/routing.ts",
+        "src/daemon/routing.test.ts",
+        "README.md",
+        "config/default.yaml",
+        "docs/plans/state.json"
+      ],
+      "test_status": "pnpm test:run src/models/tts.test.ts src/config/schema.test.ts src/daemon/routing.test.ts + pnpm typecheck passing"
    }
  },
  "overall_progress": {
@@ -5248,7 +5268,7 @@
    "gmail_auth_cli": "flynn gmail-auth command implemented with OAuth2 flow, doctor check, config routed to Telegram",
    "native_audio_support": "completed — smart routing for native audio (Gemini/OpenAI/GitHub) vs Whisper transcription fallback",
    "remaining_phases_completion": "Phase 1: 3/3 (100%) — context levels, command registry, memory structure. Phase 2: 3/3 (100%) — component registry, confidence routing, history index. Phase 3: 2/2 (100%) — adaptive memory/compaction, truthfulness/autonomy hardening",
-    "next_up": "Implement Tier A4 from the OpenClaw roadmap: TTS voice output with channel-aware audio responses"
+    "next_up": "Implement the reactions/event-trigger automation layer from the OpenClaw roadmap (event pattern matching -> agent action execution)"
  },
  "soul_md_and_cron_create": {
    "date": "2026-02-11",
@@ -1,3 +1,3 @@
 export { loadConfig, deepMerge } from './loader.js';
 export { persistConfig } from './persistence.js';
-export { configSchema, MODEL_PROVIDERS, type ModelProvider, type Config, type TelegramConfig, type ModelConfig, type CronJobConfig, type AgentsConfig, type CompactionConfig, type ToolProfile, type ToolOverrideConfig, type ToolsConfig, type SandboxConfig, type AgentConfigEntry, type RoutingConfig, type ServerConfig, type BackupConfig, type K8sConfig } from './schema.js';
+export { configSchema, MODEL_PROVIDERS, type ModelProvider, type Config, type TelegramConfig, type ModelConfig, type CronJobConfig, type AgentsConfig, type CompactionConfig, type ToolProfile, type ToolOverrideConfig, type ToolsConfig, type SandboxConfig, type AgentConfigEntry, type RoutingConfig, type ServerConfig, type BackupConfig, type K8sConfig, type TtsConfig } from './schema.js';
@@ -660,6 +660,49 @@ describe('configSchema — audio talk mode', () => {
  });
 });

+describe('configSchema — tts', () => { // Covers defaults and explicit provider settings of the `tts` config section.
+  const minimalConfig = {
+    telegram: { bot_token: 'test', allowed_chat_ids: [1] },
+    models: { default: { provider: 'anthropic', model: 'claude-3' } },
+  };
+
+  it('defaults tts fields', () => { // No `tts` key given: disabled, no channel restriction, no provider object.
+    const result = configSchema.parse(minimalConfig);
+    expect(result.tts.enabled).toBe(false);
+    expect(result.tts.enabled_channels).toEqual([]);
+    expect(result.tts.provider).toBeUndefined();
+  });
+
+  it('accepts custom tts provider settings', () => { // Every provider field set explicitly must survive parsing unchanged.
+    const result = configSchema.parse({
+      ...minimalConfig,
+      tts: {
+        enabled: true,
+        enabled_channels: ['telegram', 'discord'],
+        provider: {
+          type: 'custom',
+          endpoint: 'https://example.com/v1/audio/speech',
+          api_key: 'sk-test',
+          model: 'gpt-4o-mini-tts',
+          voice: 'nova',
+          format: 'wav',
+        },
+      },
+    });
+
+    expect(result.tts.enabled).toBe(true);
+    expect(result.tts.enabled_channels).toEqual(['telegram', 'discord']);
+    expect(result.tts.provider).toMatchObject({
+      type: 'custom',
+      endpoint: 'https://example.com/v1/audio/speech',
+      api_key: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'nova',
+      format: 'wav',
+    });
+  });
+});
+
 describe('configSchema — mattermost', () => {
  const minimalConfig = {
    telegram: { bot_token: 'test', allowed_chat_ids: [1] },
@@ -730,6 +730,24 @@ const audioSchema = z.object({
  talk_mode: talkModeSchema,
 }).default({});

+const ttsOutputFormatSchema = z.enum(['mp3', 'wav', 'opus']); // Audio formats accepted for `tts.provider.format`.
+
+const ttsProviderSchema = z.object({ // Settings for the speech-synthesis provider.
+  type: z.enum(['openai', 'custom']).default('openai'),
+  endpoint: z.string().optional(), // No schema-level default; any string is accepted.
+  api_key: z.string().optional(),
+  model: z.string().default('gpt-4o-mini-tts'),
+  voice: z.string().default('alloy'),
+  format: ttsOutputFormatSchema.default('mp3'),
+});
+
+const ttsSchema = z.object({ // Top-level `tts` section; the section itself defaults to `{}`.
+  enabled: z.boolean().default(false),
+  /** Restrict voice replies to selected channels. Empty means all channels. */
+  enabled_channels: z.array(z.string().min(1)).default([]),
+  provider: ttsProviderSchema.optional(), // Optional: when omitted it stays undefined, so the provider field defaults above are not applied.
+}).default({});
+
 // ── Tool policy schemas ──────────────────────────────────────────────

 const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
@@ -912,6 +930,7 @@ export const configSchema = z.object({
  retry: retrySchema,
  web_search: webSearchSchema,
  audio: audioSchema,
+  tts: ttsSchema,
  prompt: promptSchema,
  tools: toolsSchema,
  sandbox: sandboxSchema,
@@ -936,6 +955,7 @@ export type CompactionConfig = z.infer<typeof compactionSchema>;
 export type MemoryConfig = z.infer<typeof memorySchema>;
 export type WebSearchConfig = z.infer<typeof webSearchSchema>;
 export type AudioConfig = z.infer<typeof audioSchema>;
+export type TtsConfig = z.infer<typeof ttsSchema>;
 export type ProcessConfig = z.infer<typeof processSchema>;
 export type BrowserConfig = z.infer<typeof browserSchema>;
 export type K8sConfig = z.infer<typeof k8sSchema>;
@@ -1122,6 +1122,156 @@ describe('daemon audio routing integration', () => {
  });
 });

+describe('daemon tts routing integration', () => { // Exercises the router's per-channel TTS gate with a mocked orchestrator and fetch.
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('attaches synthesized audio reply when tts is enabled for the channel', async () => { // telegram is listed in enabled_channels, so one synthesis request is expected.
+    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('voice-enabled response');
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({ // Stub the TTS endpoint: 200 OK carrying three audio bytes.
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      arrayBuffer: async () => Uint8Array.from([7, 8, 9]).buffer,
+    } as Response);
+
+    const session = {
+      id: 'telegram:tts-user-1',
+      addMessage: vi.fn(),
+      getHistory: vi.fn(() => []),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn(() => undefined),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    const router = createMessageRouter({
+      sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
+      modelRouter: {
+        getAvailableTiers: () => ['default'],
+        getAllLabels: () => ({ default: 'default' }),
+        getLabel: (tier: string) => tier,
+      } as unknown as MessageRouterDeps['modelRouter'],
+      systemPrompt: 'test prompt',
+      toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
+      toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
+      config: {
+        agents: {
+          primary_tier: 'default',
+          delegation: {
+            compaction: 'default',
+            memory_extraction: 'default',
+            classification: 'default',
+            tool_summarisation: 'default',
+            complex_reasoning: 'default',
+          },
+          max_delegation_depth: 1,
+          max_iterations: 3,
+        },
+        compaction: { enabled: false },
+        models: { default: { provider: 'anthropic', model: 'claude' } },
+        tts: {
+          enabled: true,
+          enabled_channels: ['telegram'],
+          provider: {
+            type: 'custom',
+            endpoint: 'https://example.com/v1/audio/speech',
+            api_key: 'sk-test',
+            model: 'gpt-4o-mini-tts',
+            voice: 'alloy',
+            format: 'mp3',
+          },
+        },
+      } as unknown as MessageRouterDeps['config'],
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'tts-1',
+      channel: 'telegram',
+      senderId: 'tts-user-1',
+      text: 'say hello',
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
+    expect(processSpy).toHaveBeenCalledTimes(1);
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
+    expect(outbound?.attachments).toBeDefined();
+    expect(outbound?.attachments?.[0]).toMatchObject({
+      mimeType: 'audio/mpeg',
+      data: 'BwgJ', // base64 of the stubbed bytes [7, 8, 9]
+    });
+  });
+
+  it('does not synthesize tts when channel is not enabled', async () => { // Message arrives on discord while only telegram is allowed.
+    vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('text-only response');
+    const fetchSpy = vi.spyOn(globalThis, 'fetch');
+
+    const session = {
+      id: 'discord:tts-user-2',
+      addMessage: vi.fn(),
+      getHistory: vi.fn(() => []),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn(() => undefined),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    const router = createMessageRouter({
+      sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
+      modelRouter: {
+        getAvailableTiers: () => ['default'],
+        getAllLabels: () => ({ default: 'default' }),
+        getLabel: (tier: string) => tier,
+      } as unknown as MessageRouterDeps['modelRouter'],
+      systemPrompt: 'test prompt',
+      toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
+      toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
+      config: {
+        agents: {
+          primary_tier: 'default',
+          delegation: {
+            compaction: 'default',
+            memory_extraction: 'default',
+            classification: 'default',
+            tool_summarisation: 'default',
+            complex_reasoning: 'default',
+          },
+          max_delegation_depth: 1,
+          max_iterations: 3,
+        },
+        compaction: { enabled: false },
+        models: { default: { provider: 'anthropic', model: 'claude' } },
+        tts: {
+          enabled: true,
+          enabled_channels: ['telegram'], // discord is deliberately absent
+          provider: {
+            type: 'custom',
+            endpoint: 'https://example.com/v1/audio/speech',
+          },
+        },
+      } as unknown as MessageRouterDeps['config'],
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'tts-2',
+      channel: 'discord',
+      senderId: 'tts-user-2',
+      text: 'respond as text',
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
+    expect(fetchSpy).not.toHaveBeenCalled(); // No synthesis request may be issued for a gated-out channel.
+    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
+    expect(outbound?.attachments).toBeUndefined();
+  });
+});
+
 describe('daemon auto-escalate integration', () => {
  afterEach(() => {
    vi.restoreAllMocks();
@@ -1,6 +1,7 @@
 import type { AudioTranscriptionConfig } from '../models/media.js';
 import type { Attachment } from '../channels/types.js';
 import { isSupportedAudio, transcribeAudio } from '../models/media.js';
+import { synthesizeSpeechAttachment } from '../models/tts.js';
 import { supportsAudioInput } from '../models/capabilities.js';
 import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
 import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
@@ -84,6 +85,17 @@ function parseResearchPrefix(text: string): string | undefined {
  }
  return undefined;
 }
+
+/**
+ * Decide whether voice replies may be produced for a channel.
+ * TTS has to be switched on; an empty (or absent) `enabled_channels` list
+ * places no restriction, otherwise the channel must be listed.
+ */
+function isTtsEnabledForChannel(config: Config, channel: string): boolean {
+  const tts = config.tts;
+  if (!tts?.enabled) {
+    return false;
+  }
+  const allowed = tts.enabled_channels ?? [];
+  return allowed.length === 0 || allowed.includes(channel);
+}
 /**
 * Create the unified message handler for the channel registry.
 * Each channel+sender pair gets its own AgentOrchestrator backed by a persistent session.
@@ -116,6 +128,31 @@ export function createMessageRouter(deps: {
  const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
  const talkModeUntil = new Map<string, number>();

+  /**
+   * Build a voice attachment for a reply, if TTS applies to the channel.
+   *
+   * The provider type falls back to `openai` when it is not set (including when the
+   * whole `tts.provider` section is omitted), so enabling TTS alone targets the OpenAI
+   * speech endpoint. A `custom` provider without an endpoint yields no audio.
+   *
+   * @param responseText - Reply text to synthesize.
+   * @param channel - Channel the reply is going to; checked against the TTS channel gate.
+   * @returns The attachment, or a nullish value when TTS is off for the channel, no
+   *   endpoint is available, or synthesis failed. Failures are logged and never thrown,
+   *   so the text reply is unaffected.
+   */
+  async function maybeBuildTtsAttachment(responseText: string, channel: string) {
+    if (!isTtsEnabledForChannel(deps.config, channel)) {
+      return undefined;
+    }
+
+    const provider = deps.config.tts?.provider;
+    // `provider` may be absent entirely, so the documented `openai` default is applied here.
+    const providerType = provider?.type ?? 'openai';
+    const endpoint = provider?.endpoint
+      ?? (providerType === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined);
+    if (!endpoint) {
+      return undefined;
+    }
+
+    try {
+      return await synthesizeSpeechAttachment(responseText, {
+        endpoint,
+        apiKey: provider?.api_key,
+        model: provider?.model,
+        voice: provider?.voice,
+        format: provider?.format,
+      });
+    } catch (error) {
+      // Best effort: voice is an extra, so a synthesis failure must not block the text reply.
+      console.warn(`TTS synthesis failed for channel ${channel}:`, error instanceof Error ? error.message : String(error));
+      return undefined;
+    }
+  }
+
  function getOrCreateAgent(channel: string, senderId: string, metadata?: Record<string, unknown>, agentOverride?: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
    // Resolve agent config name via routing (sender → channel → default fallback)
    const agentConfigName = agentOverride ?? deps.agentRouter?.resolve(channel, senderId);
@@ -998,7 +1035,12 @@ export function createMessageRouter(deps: {
            history,
          });
          session.addMessage({ role: 'assistant', content: response });
-          await reply({ text: response, replyTo: msg.id });
+          const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
+          await reply({
+            text: response,
+            replyTo: msg.id,
+            attachments: ttsAttachment ? [ttsAttachment] : undefined,
+          });
          return;
        } catch (error) {
          const detail = error instanceof Error ? error.message : String(error);
@@ -1031,10 +1073,14 @@ export function createMessageRouter(deps: {
        response = await agent.process(messageText, attachments);
      }
      const outboundAttachments = collector.drain();
+      const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
+      const mergedAttachments = ttsAttachment
+        ? [...outboundAttachments, ttsAttachment]
+        : outboundAttachments;
      await reply({
        text: response,
        replyTo: msg.id,
-        attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
+        attachments: mergedAttachments.length > 0 ? mergedAttachments : undefined,
      });
    } catch (error) {
      console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
@@ -10,6 +10,7 @@ export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
 export { supportsAudioInput } from './capabilities.js';
+export { synthesizeSpeechAttachment, type TtsSynthesisConfig, type TtsOutputFormat } from './tts.js';
 export {
  isSupportedImage,
  isSupportedAudio,
@@ -0,0 +1,67 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { synthesizeSpeechAttachment } from './tts.js';
+
+describe('synthesizeSpeechAttachment', () => { // Unit tests for the TTS client; fetch is mocked wherever a request would be made.
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('returns null when text is empty', async () => { // Whitespace-only input counts as empty.
+    const result = await synthesizeSpeechAttachment('   ', {
+      endpoint: 'https://example.com/v1/audio/speech',
+    });
+    expect(result).toBeNull();
+  });
+
+  it('returns null when endpoint is missing', async () => {
+    const result = await synthesizeSpeechAttachment('hello', {});
+    expect(result).toBeNull();
+  });
+
+  it('returns an outbound audio attachment on success', async () => { // Checks request shape, MIME type, base64 payload and filename pattern.
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      arrayBuffer: async () => Uint8Array.from([1, 2, 3, 4]).buffer,
+    } as Response);
+
+    const result = await synthesizeSpeechAttachment('Hello from Flynn', {
+      endpoint: 'https://example.com/v1/audio/speech',
+      apiKey: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'alloy',
+      format: 'mp3',
+    });
+
+    expect(fetchSpy).toHaveBeenCalledWith(
+      'https://example.com/v1/audio/speech',
+      expect.objectContaining({
+        method: 'POST',
+        headers: expect.objectContaining({
+          'Content-Type': 'application/json',
+          Authorization: 'Bearer sk-test',
+        }),
+      }),
+    );
+    expect(result).toMatchObject({
+      mimeType: 'audio/mpeg',
+      data: 'AQIDBA==', // base64 of the stubbed bytes [1, 2, 3, 4]
+    });
+    expect(result?.filename).toMatch(/^flynn-reply-\d+\.mp3$/);
+  });
+
+  it('throws when the tts endpoint returns an error', async () => { // A non-ok response must reject with status code and status text in the message.
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: false,
+      status: 429,
+      statusText: 'Too Many Requests',
+      text: async () => 'rate limit',
+    } as Response);
+
+    await expect(synthesizeSpeechAttachment('Hello', {
+      endpoint: 'https://example.com/v1/audio/speech',
+    })).rejects.toThrow(/TTS request failed: 429 Too Many Requests/i);
+  });
+});
@@ -0,0 +1,88 @@
+import type { OutboundAttachment } from '../channels/types.js';
+
+export type TtsOutputFormat = 'mp3' | 'wav' | 'opus';
+
+export interface TtsSynthesisConfig {
+  endpoint?: string;
+  apiKey?: string;
+  model?: string;
+  voice?: string;
+  format?: TtsOutputFormat;
+}
+
+/** Map a TTS output format to the MIME type used on the attachment; anything other than wav/opus is treated as MP3. */
+function outputFormatToMimeType(format: TtsOutputFormat): string {
+  if (format === 'wav') {
+    return 'audio/wav';
+  }
+  if (format === 'opus') {
+    return 'audio/ogg';
+  }
+  return 'audio/mpeg';
+}
+
+/** Map a TTS output format to a filename extension; opus uses `ogg`, and anything other than wav/opus falls back to `mp3`. */
+function outputFormatToExtension(format: TtsOutputFormat): string {
+  if (format === 'wav') {
+    return 'wav';
+  }
+  if (format === 'opus') {
+    return 'ogg';
+  }
+  return 'mp3';
+}
+
+/** Upper bound on a single TTS request so an unresponsive endpoint cannot stall the caller indefinitely. */
+const TTS_REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Synthesize speech via an OpenAI-compatible /v1/audio/speech endpoint.
+ *
+ * @param text - Text to speak; surrounding whitespace is trimmed before sending.
+ * @param config - Endpoint, optional bearer token, and model/voice/format overrides
+ *   (defaults: `gpt-4o-mini-tts`, `alloy`, `mp3`).
+ * @returns A base64-encoded audio attachment, or `null` when the text is blank or no
+ *   endpoint is configured (no request is made in either case).
+ * @throws Error when the endpoint answers with a non-ok status or an empty body; the
+ *   returned promise also rejects if the request fails or exceeds the timeout.
+ */
+export async function synthesizeSpeechAttachment(
+  text: string,
+  config: TtsSynthesisConfig,
+): Promise<OutboundAttachment | null> {
+  const trimmed = text.trim();
+  if (!trimmed || !config.endpoint) {
+    return null;
+  }
+
+  const format = config.format ?? 'mp3';
+
+  const headers: Record<string, string> = {
+    'Content-Type': 'application/json',
+  };
+  if (config.apiKey) {
+    headers.Authorization = `Bearer ${config.apiKey}`;
+  }
+
+  const response = await fetch(config.endpoint, {
+    method: 'POST',
+    headers,
+    body: JSON.stringify({
+      model: config.model ?? 'gpt-4o-mini-tts',
+      voice: config.voice ?? 'alloy',
+      input: trimmed,
+      response_format: format,
+    }),
+    // Abort rather than wait forever on an unresponsive endpoint.
+    signal: AbortSignal.timeout(TTS_REQUEST_TIMEOUT_MS),
+  });
+
+  if (!response.ok) {
+    // The error body is optional context; cap it so a large payload does not flood the logs.
+    const detail = await response.text().catch(() => '');
+    throw new Error(
+      `TTS request failed: ${response.status} ${response.statusText}${detail ? ` - ${detail.slice(0, 200)}` : ''}`,
+    );
+  }
+
+  const audioBytes = await response.arrayBuffer();
+  if (audioBytes.byteLength === 0) {
+    // A successful status with no payload would otherwise produce an unplayable attachment.
+    throw new Error('TTS request returned an empty audio body');
+  }
+
+  return {
+    mimeType: outputFormatToMimeType(format),
+    data: Buffer.from(audioBytes).toString('base64'),
+    filename: `flynn-reply-${Date.now()}.${outputFormatToExtension(format)}`,
+  };
+}