feat: implement tier-a4 tts voice output replies

2026-02-18 10:22:28 -08:00
parent 3eb07875f1
commit a71aa5992d
11 changed files with 482 additions and 4 deletions
@@ -1,3 +1,3 @@
 export { loadConfig, deepMerge } from './loader.js';
 export { persistConfig } from './persistence.js';
-export { configSchema, MODEL_PROVIDERS, type ModelProvider, type Config, type TelegramConfig, type ModelConfig, type CronJobConfig, type AgentsConfig, type CompactionConfig, type ToolProfile, type ToolOverrideConfig, type ToolsConfig, type SandboxConfig, type AgentConfigEntry, type RoutingConfig, type ServerConfig, type BackupConfig, type K8sConfig } from './schema.js';
+export { configSchema, MODEL_PROVIDERS, type ModelProvider, type Config, type TelegramConfig, type ModelConfig, type CronJobConfig, type AgentsConfig, type CompactionConfig, type ToolProfile, type ToolOverrideConfig, type ToolsConfig, type SandboxConfig, type AgentConfigEntry, type RoutingConfig, type ServerConfig, type BackupConfig, type K8sConfig, type TtsConfig } from './schema.js';
@@ -660,6 +660,49 @@ describe('configSchema — audio talk mode', () => {
  });
 });

+// Schema-level checks for the `tts` section: defaults when it is omitted, and
+// pass-through of explicitly supplied provider settings.
+describe('configSchema — tts', () => {
+  // Base config reused by both cases; only the `tts` section varies.
+  const minimalConfig = {
+    telegram: { bot_token: 'test', allowed_chat_ids: [1] },
+    models: { default: { provider: 'anthropic', model: 'claude-3' } },
+  };
+
+  it('defaults tts fields', () => {
+    const { tts } = configSchema.parse(minimalConfig);
+
+    expect(tts.enabled).toBe(false);
+    expect(tts.enabled_channels).toEqual([]);
+    expect(tts.provider).toBeUndefined();
+  });
+
+  it('accepts custom tts provider settings', () => {
+    // One fixture serves as both the parser input and the expected output.
+    const provider = {
+      type: 'custom',
+      endpoint: 'https://example.com/v1/audio/speech',
+      api_key: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'nova',
+      format: 'wav',
+    };
+
+    const { tts } = configSchema.parse({
+      ...minimalConfig,
+      tts: {
+        enabled: true,
+        enabled_channels: ['telegram', 'discord'],
+        provider,
+      },
+    });
+
+    expect(tts.enabled).toBe(true);
+    expect(tts.enabled_channels).toEqual(['telegram', 'discord']);
+    expect(tts.provider).toMatchObject(provider);
+  });
+});
+
 describe('configSchema — mattermost', () => {
  const minimalConfig = {
    telegram: { bot_token: 'test', allowed_chat_ids: [1] },
@@ -730,6 +730,24 @@ const audioSchema = z.object({
  talk_mode: talkModeSchema,
 }).default({});

+/** Audio encodings that can be selected for synthesized replies. */
+const ttsOutputFormatSchema = z.enum(['mp3', 'wav', 'opus']);
+
+/**
+ * Speech-synthesis provider settings.
+ * `type`, `model`, `voice` and `format` fall back to the defaults declared here,
+ * but only when a provider object is present in the input at all.
+ */
+const ttsProviderSchema = z.object({
+  type: z.enum(['openai', 'custom']).default('openai'),
+  // No default: stays undefined unless the user supplies a URL.
+  endpoint: z.string().optional(),
+  // No default: stays undefined unless the user supplies a key.
+  api_key: z.string().optional(),
+  model: z.string().default('gpt-4o-mini-tts'),
+  voice: z.string().default('alloy'),
+  format: ttsOutputFormatSchema.default('mp3'),
+});
+
+/**
+ * Top-level `tts` config section. Omitting the section yields
+ * `{ enabled: false, enabled_channels: [] }` with `provider` left undefined.
+ */
+const ttsSchema = z.object({
+  enabled: z.boolean().default(false),
+  /** Restrict voice replies to selected channels. Empty means all channels. */
+  enabled_channels: z.array(z.string().min(1)).default([]),
+  // Optional without a default, so the per-field provider defaults above are
+  // not materialised when `provider` is left out.
+  provider: ttsProviderSchema.optional(),
+}).default({});
+
 // ── Tool policy schemas ──────────────────────────────────────────────

 const toolProfileEnum = z.enum(['minimal', 'messaging', 'coding', 'full']);
@@ -912,6 +930,7 @@ export const configSchema = z.object({
  retry: retrySchema,
  web_search: webSearchSchema,
  audio: audioSchema,
+  tts: ttsSchema,
  prompt: promptSchema,
  tools: toolsSchema,
  sandbox: sandboxSchema,
@@ -936,6 +955,7 @@ export type CompactionConfig = z.infer<typeof compactionSchema>;
 export type MemoryConfig = z.infer<typeof memorySchema>;
 export type WebSearchConfig = z.infer<typeof webSearchSchema>;
 export type AudioConfig = z.infer<typeof audioSchema>;
+export type TtsConfig = z.infer<typeof ttsSchema>;
 export type ProcessConfig = z.infer<typeof processSchema>;
 export type BrowserConfig = z.infer<typeof browserSchema>;
 export type K8sConfig = z.infer<typeof k8sSchema>;
@@ -1122,6 +1122,156 @@ describe('daemon audio routing integration', () => {
  });
 });

+describe('daemon tts routing integration', () => {
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  /**
+   * Create a message router backed by stubbed dependencies, with TTS switched on and
+   * restricted to the `telegram` channel. The session id and the TTS provider block
+   * are the only inputs that differ between the cases below.
+   */
+  function createTtsRouter(sessionId: string, ttsProvider: Record<string, string>) {
+    const session = {
+      id: sessionId,
+      addMessage: vi.fn(),
+      getHistory: vi.fn(() => []),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn(() => undefined),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+
+    return createMessageRouter({
+      sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
+      modelRouter: {
+        getAvailableTiers: () => ['default'],
+        getAllLabels: () => ({ default: 'default' }),
+        getLabel: (tier: string) => tier,
+      } as unknown as MessageRouterDeps['modelRouter'],
+      systemPrompt: 'test prompt',
+      toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
+      toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
+      config: {
+        agents: {
+          primary_tier: 'default',
+          delegation: {
+            compaction: 'default',
+            memory_extraction: 'default',
+            classification: 'default',
+            tool_summarisation: 'default',
+            complex_reasoning: 'default',
+          },
+          max_delegation_depth: 1,
+          max_iterations: 3,
+        },
+        compaction: { enabled: false },
+        models: { default: { provider: 'anthropic', model: 'claude' } },
+        tts: {
+          enabled: true,
+          enabled_channels: ['telegram'],
+          provider: ttsProvider,
+        },
+      } as unknown as MessageRouterDeps['config'],
+    });
+  }
+
+  it('attaches synthesized audio reply when tts is enabled for the channel', async () => {
+    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('voice-enabled response');
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      arrayBuffer: async () => Uint8Array.from([7, 8, 9]).buffer,
+    } as Response);
+
+    const router = createTtsRouter('telegram:tts-user-1', {
+      type: 'custom',
+      endpoint: 'https://example.com/v1/audio/speech',
+      api_key: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'alloy',
+      format: 'mp3',
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'tts-1',
+      channel: 'telegram',
+      senderId: 'tts-user-1',
+      text: 'say hello',
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
+    expect(processSpy).toHaveBeenCalledTimes(1);
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
+    expect(outbound?.attachments).toBeDefined();
+    // 'BwgJ' is the base64 encoding of the stubbed bytes [7, 8, 9].
+    expect(outbound?.attachments?.[0]).toMatchObject({
+      mimeType: 'audio/mpeg',
+      data: 'BwgJ',
+    });
+  });
+
+  it('does not synthesize tts when channel is not enabled', async () => {
+    vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('text-only response');
+    // Spy without a stub: the assertion below is that fetch is never reached.
+    const fetchSpy = vi.spyOn(globalThis, 'fetch');
+
+    const router = createTtsRouter('discord:tts-user-2', {
+      type: 'custom',
+      endpoint: 'https://example.com/v1/audio/speech',
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'tts-2',
+      channel: 'discord',
+      senderId: 'tts-user-2',
+      text: 'respond as text',
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
+    expect(fetchSpy).not.toHaveBeenCalled();
+    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
+    expect(outbound?.attachments).toBeUndefined();
+  });
+});
+
 describe('daemon auto-escalate integration', () => {
  afterEach(() => {
    vi.restoreAllMocks();
@@ -1,6 +1,7 @@
 import type { AudioTranscriptionConfig } from '../models/media.js';
 import type { Attachment } from '../channels/types.js';
 import { isSupportedAudio, transcribeAudio } from '../models/media.js';
+import { synthesizeSpeechAttachment } from '../models/tts.js';
 import { supportsAudioInput } from '../models/capabilities.js';
 import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
 import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
@@ -84,6 +85,17 @@ function parseResearchPrefix(text: string): string | undefined {
  }
  return undefined;
 }
+
+/**
+ * Decide whether voice replies should be produced for `channel`.
+ * TTS has to be switched on; a missing or empty `enabled_channels` list places no
+ * restriction, otherwise the channel must appear in the list.
+ */
+function isTtsEnabledForChannel(config: Config, channel: string): boolean {
+  const tts = config.tts;
+  if (!tts?.enabled) return false;
+
+  const allowedChannels = tts.enabled_channels ?? [];
+  return allowedChannels.length === 0 || allowedChannels.includes(channel);
+}
 /**
 * Create the unified message handler for the channel registry.
 * Each channel+sender pair gets its own AgentOrchestrator backed by a persistent session.
@@ -116,6 +128,31 @@ export function createMessageRouter(deps: {
  const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
  const talkModeUntil = new Map<string, number>();

+  // Flips to true after the missing-endpoint warning is printed, so the
+  // misconfiguration is reported once rather than on every reply.
+  let warnedMissingTtsEndpoint = false;
+
+  /**
+   * Best-effort conversion of an assistant reply into a voice attachment.
+   *
+   * Resolves to the synthesized attachment when TTS is enabled for `channel` and the
+   * request succeeds. Resolves to `undefined` when TTS is off for the channel, no
+   * endpoint can be resolved, the synthesizer has nothing to return, or synthesis
+   * throws. Failures are logged with console.warn and never rethrown, so the caller
+   * can still send the text reply.
+   */
+  async function maybeBuildTtsAttachment(responseText: string, channel: string) {
+    if (!isTtsEnabledForChannel(deps.config, channel)) {
+      return undefined;
+    }
+
+    const provider = deps.config.tts?.provider;
+    // Only an explicit 'openai' provider gets the public OpenAI URL as a fallback.
+    const endpoint = provider?.endpoint ?? (provider?.type === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined);
+    if (!endpoint) {
+      // Reached when `tts.provider` is omitted, or is not 'openai' and has no endpoint.
+      // Surface it once: otherwise enabling TTS appears to do nothing, with no hint why.
+      if (!warnedMissingTtsEndpoint) {
+        warnedMissingTtsEndpoint = true;
+        console.warn('TTS is enabled but no provider endpoint is configured; voice replies are skipped.');
+      }
+      return undefined;
+    }
+
+    try {
+      const attachment = await synthesizeSpeechAttachment(responseText, {
+        endpoint,
+        apiKey: provider?.api_key,
+        model: provider?.model,
+        voice: provider?.voice,
+        format: provider?.format,
+      });
+      // The synthesizer signals "nothing to attach" with null; normalise it so this
+      // helper has a single "no audio" value.
+      return attachment ?? undefined;
+    } catch (error) {
+      // Keep whatever was thrown visible in the log, even when it is not an Error.
+      console.warn(`TTS synthesis failed for channel ${channel}:`, error instanceof Error ? error.message : String(error));
+      return undefined;
+    }
+  }
+
  function getOrCreateAgent(channel: string, senderId: string, metadata?: Record<string, unknown>, agentOverride?: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
    // Resolve agent config name via routing (sender → channel → default fallback)
    const agentConfigName = agentOverride ?? deps.agentRouter?.resolve(channel, senderId);
@@ -998,7 +1035,12 @@ export function createMessageRouter(deps: {
            history,
          });
          session.addMessage({ role: 'assistant', content: response });
-          await reply({ text: response, replyTo: msg.id });
+          const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
+          await reply({
+            text: response,
+            replyTo: msg.id,
+            attachments: ttsAttachment ? [ttsAttachment] : undefined,
+          });
          return;
        } catch (error) {
          const detail = error instanceof Error ? error.message : String(error);
@@ -1031,10 +1073,14 @@ export function createMessageRouter(deps: {
        response = await agent.process(messageText, attachments);
      }
      const outboundAttachments = collector.drain();
+      const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
+      const mergedAttachments = ttsAttachment
+        ? [...outboundAttachments, ttsAttachment]
+        : outboundAttachments;
      await reply({
        text: response,
        replyTo: msg.id,
-        attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
+        attachments: mergedAttachments.length > 0 ? mergedAttachments : undefined,
      });
    } catch (error) {
      console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
@@ -10,6 +10,7 @@ export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js
 export { withRetry, isRetryable, DEFAULT_RETRY_CONFIG, type RetryConfig } from './retry.js';
 export { estimateCost, MODEL_COSTS_PER_MILLION } from './costs.js';
 export { supportsAudioInput } from './capabilities.js';
+export { synthesizeSpeechAttachment, type TtsSynthesisConfig, type TtsOutputFormat } from './tts.js';
 export {
  isSupportedImage,
  isSupportedAudio,
@@ -0,0 +1,67 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { synthesizeSpeechAttachment } from './tts.js';
+
+// Unit tests for synthesizeSpeechAttachment; global fetch is stubbed in every case that reaches it.
+describe('synthesizeSpeechAttachment', () => {
+  const ENDPOINT = 'https://example.com/v1/audio/speech';
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('returns null when text is empty', async () => {
+    await expect(synthesizeSpeechAttachment('   ', { endpoint: ENDPOINT })).resolves.toBeNull();
+  });
+
+  it('returns null when endpoint is missing', async () => {
+    await expect(synthesizeSpeechAttachment('hello', {})).resolves.toBeNull();
+  });
+
+  it('returns an outbound audio attachment on success', async () => {
+    const okResponse = {
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      arrayBuffer: async () => Uint8Array.from([1, 2, 3, 4]).buffer,
+    } as Response;
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue(okResponse);
+
+    const attachment = await synthesizeSpeechAttachment('Hello from Flynn', {
+      endpoint: ENDPOINT,
+      apiKey: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'alloy',
+      format: 'mp3',
+    });
+
+    const expectedHeaders = expect.objectContaining({
+      'Content-Type': 'application/json',
+      Authorization: 'Bearer sk-test',
+    });
+    expect(fetchSpy).toHaveBeenCalledWith(
+      'https://example.com/v1/audio/speech',
+      expect.objectContaining({ method: 'POST', headers: expectedHeaders }),
+    );
+    // 'AQIDBA==' is the base64 encoding of the stubbed bytes [1, 2, 3, 4].
+    expect(attachment).toMatchObject({
+      mimeType: 'audio/mpeg',
+      data: 'AQIDBA==',
+    });
+    expect(attachment?.filename).toMatch(/^flynn-reply-\d+\.mp3$/);
+  });
+
+  it('throws when the tts endpoint returns an error', async () => {
+    const rateLimited = {
+      ok: false,
+      status: 429,
+      statusText: 'Too Many Requests',
+      text: async () => 'rate limit',
+    } as Response;
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue(rateLimited);
+
+    const pending = synthesizeSpeechAttachment('Hello', { endpoint: ENDPOINT });
+
+    await expect(pending).rejects.toThrow(/TTS request failed: 429 Too Many Requests/i);
+  });
+});
@@ -0,0 +1,88 @@
+import type { OutboundAttachment } from '../channels/types.js';
+
+/** Audio encodings that can be requested from the speech endpoint. */
+export type TtsOutputFormat = 'mp3' | 'wav' | 'opus';
+
+/** Options accepted by synthesizeSpeechAttachment; every field is optional. */
+export interface TtsSynthesisConfig {
+  /** URL the synthesis request is POSTed to. When absent no request is made and the result is null. */
+  endpoint?: string;
+  /** Sent as `Authorization: Bearer <apiKey>` when set; the header is omitted otherwise. */
+  apiKey?: string;
+  /** Value of the `model` field in the request body; 'gpt-4o-mini-tts' when omitted. */
+  model?: string;
+  /** Value of the `voice` field in the request body; 'alloy' when omitted. */
+  voice?: string;
+  /** Requested `response_format`; 'mp3' when omitted. Also selects the attachment's MIME type and file extension. */
+  format?: TtsOutputFormat;
+}
+
+/** MIME type reported on the attachment for a given output format. */
+function outputFormatToMimeType(format: TtsOutputFormat): string {
+  if (format === 'wav') {
+    return 'audio/wav';
+  }
+  if (format === 'opus') {
+    return 'audio/ogg';
+  }
+  // 'mp3', plus any value outside the union that reaches here at runtime.
+  return 'audio/mpeg';
+}
+
+/** File extension (without the dot) used in the attachment filename for a given output format. */
+function outputFormatToExtension(format: TtsOutputFormat): string {
+  if (format === 'wav') {
+    return 'wav';
+  }
+  if (format === 'opus') {
+    return 'ogg';
+  }
+  // 'mp3', plus any value outside the union that reaches here at runtime.
+  return 'mp3';
+}
+
+/** Upper bound, in milliseconds, for one synthesis request. */
+const TTS_REQUEST_TIMEOUT_MS = 60000;
+
+/**
+ * Synthesize speech via an OpenAI-compatible /v1/audio/speech endpoint.
+ *
+ * @param text - Text to speak; surrounding whitespace is trimmed before it is sent.
+ * @param config - Endpoint, credentials and voice options. Missing `model`, `voice`
+ *   and `format` fall back to 'gpt-4o-mini-tts', 'alloy' and 'mp3'.
+ * @returns A base64-encoded audio attachment, or null when the text is blank or no
+ *   endpoint is configured (no request is made in either case).
+ * @throws Error when the endpoint answers with a non-OK status or an empty body.
+ *   Rejections from fetch itself (network failure, timeout) propagate unchanged.
+ */
+export async function synthesizeSpeechAttachment(
+  text: string,
+  config: TtsSynthesisConfig,
+): Promise<OutboundAttachment | null> {
+  const trimmed = text.trim();
+  // Nothing to speak, or nowhere to send it: not an error, just no attachment.
+  if (!trimmed || !config.endpoint) {
+    return null;
+  }
+
+  const format = config.format ?? 'mp3';
+  const model = config.model ?? 'gpt-4o-mini-tts';
+  const voice = config.voice ?? 'alloy';
+
+  const headers: Record<string, string> = {
+    'Content-Type': 'application/json',
+  };
+  if (config.apiKey) {
+    headers.Authorization = `Bearer ${config.apiKey}`;
+  }
+
+  const response = await fetch(config.endpoint, {
+    method: 'POST',
+    headers,
+    body: JSON.stringify({
+      model,
+      voice,
+      input: trimmed,
+      response_format: format,
+    }),
+    // Bound the wait explicitly instead of relying on the runtime's fetch defaults.
+    signal: AbortSignal.timeout(TTS_REQUEST_TIMEOUT_MS),
+  });
+
+  if (!response.ok) {
+    // The body is best-effort diagnostics only; cap it so the message stays short.
+    const detail = await response.text().catch(() => '');
+    throw new Error(
+      `TTS request failed: ${response.status} ${response.statusText}${detail ? ` - ${detail.slice(0, 200)}` : ''}`,
+    );
+  }
+
+  const audioBytes = await response.arrayBuffer();
+  if (audioBytes.byteLength === 0) {
+    // An OK status with no payload would otherwise become a zero-byte attachment.
+    throw new Error('TTS request succeeded but returned no audio data');
+  }
+  const data = Buffer.from(audioBytes).toString('base64');
+  const extension = outputFormatToExtension(format);
+
+  return {
+    mimeType: outputFormatToMimeType(format),
+    data,
+    // The millisecond timestamp keeps successive reply filenames distinct.
+    filename: `flynn-reply-${Date.now()}.${extension}`,
+  };
+}