feat: implement tier-a4 tts voice output replies

2026-02-18 10:22:28 -08:00
parent 3eb07875f1
commit a71aa5992d
11 changed files with 482 additions and 4 deletions
@@ -1122,6 +1122,156 @@ describe('daemon audio routing integration', () => {
  });
 });

+/**
+ * Integration coverage for text-to-speech replies produced by the message router.
+ * `AgentOrchestrator.prototype.process` and `globalThis.fetch` are replaced with spies
+ * in every case, and all spies are restored after each test.
+ */
+describe('daemon tts routing integration', () => {
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  /** Builds a session stub with the given id whose methods are all spies. */
+  function createSessionStub(id: string) {
+    return {
+      id,
+      addMessage: vi.fn(),
+      getHistory: vi.fn(() => []),
+      clear: vi.fn(),
+      replaceHistory: vi.fn(),
+      getConfig: vi.fn(() => undefined),
+      setConfig: vi.fn(),
+      deleteConfig: vi.fn(),
+    };
+  }
+
+  /**
+   * Builds a router whose config enables TTS for the `telegram` channel only,
+   * using the supplied provider settings. Everything else is the minimal stub
+   * configuration shared by the tests in this suite.
+   */
+  function createTtsRouter(sessionId: string, provider: Record<string, unknown>) {
+    const session = createSessionStub(sessionId);
+    return createMessageRouter({
+      sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
+      modelRouter: {
+        getAvailableTiers: () => ['default'],
+        getAllLabels: () => ({ default: 'default' }),
+        getLabel: (tier: string) => tier,
+      } as unknown as MessageRouterDeps['modelRouter'],
+      systemPrompt: 'test prompt',
+      toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
+      toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
+      config: {
+        agents: {
+          primary_tier: 'default',
+          delegation: {
+            compaction: 'default',
+            memory_extraction: 'default',
+            classification: 'default',
+            tool_summarisation: 'default',
+            complex_reasoning: 'default',
+          },
+          max_delegation_depth: 1,
+          max_iterations: 3,
+        },
+        compaction: { enabled: false },
+        models: { default: { provider: 'anthropic', model: 'claude' } },
+        tts: {
+          enabled: true,
+          enabled_channels: ['telegram'],
+          provider,
+        },
+      } as unknown as MessageRouterDeps['config'],
+    });
+  }
+
+  it('attaches synthesized audio reply when tts is enabled for the channel', async () => {
+    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('voice-enabled response');
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      status: 200,
+      statusText: 'OK',
+      arrayBuffer: async () => Uint8Array.from([7, 8, 9]).buffer,
+    } as Response);
+
+    const router = createTtsRouter('telegram:tts-user-1', {
+      type: 'custom',
+      endpoint: 'https://example.com/v1/audio/speech',
+      api_key: 'sk-test',
+      model: 'gpt-4o-mini-tts',
+      voice: 'alloy',
+      format: 'mp3',
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'tts-1',
+      channel: 'telegram',
+      senderId: 'tts-user-1',
+      text: 'say hello',
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
+    expect(processSpy).toHaveBeenCalledTimes(1);
+    expect(fetchSpy).toHaveBeenCalledTimes(1);
+    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
+    expect(outbound?.attachments).toBeDefined();
+    // 'BwgJ' is the base64 encoding of the mocked audio bytes [7, 8, 9].
+    expect(outbound?.attachments?.[0]).toMatchObject({
+      mimeType: 'audio/mpeg',
+      data: 'BwgJ',
+    });
+  });
+
+  it('does not synthesize tts when channel is not enabled', async () => {
+    vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('text-only response');
+    // Stub fetch so a regression cannot reach the real network from the test run.
+    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('fetch must not be called'));
+
+    const router = createTtsRouter('discord:tts-user-2', {
+      type: 'custom',
+      endpoint: 'https://example.com/v1/audio/speech',
+    });
+
+    const reply = vi.fn(async (_message: OutboundMessage) => {});
+    await router.handler({
+      id: 'tts-2',
+      channel: 'discord',
+      senderId: 'tts-user-2',
+      text: 'respond as text',
+      timestamp: Date.now(),
+    } as MessageRouterInput, reply);
+
+    expect(fetchSpy).not.toHaveBeenCalled();
+    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
+    // Require an actual reply; otherwise the attachment check below passes vacuously.
+    expect(outbound).toBeDefined();
+    expect(outbound?.text).toBe('text-only response');
+    expect(outbound?.attachments).toBeUndefined();
+  });
+});
+
 describe('daemon auto-escalate integration', () => {
  afterEach(() => {
    vi.restoreAllMocks();
@@ -1,6 +1,7 @@
 import type { AudioTranscriptionConfig } from '../models/media.js';
 import type { Attachment } from '../channels/types.js';
 import { isSupportedAudio, transcribeAudio } from '../models/media.js';
+import { synthesizeSpeechAttachment } from '../models/tts.js';
 import { supportsAudioInput } from '../models/capabilities.js';
 import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
 import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
@@ -84,6 +85,17 @@ function parseResearchPrefix(text: string): string | undefined {
  }
  return undefined;
 }
+
+/**
+ * Decide whether spoken replies should be produced for `channel`.
+ *
+ * Returns `false` when the `tts` section is absent or its `enabled` flag is falsy.
+ * A missing or empty `enabled_channels` list is treated as "every channel";
+ * otherwise the channel must appear in the list.
+ */
+function isTtsEnabledForChannel(config: Config, channel: string): boolean {
+  const tts = config.tts;
+  if (!tts?.enabled) return false;
+
+  const allowList = tts.enabled_channels ?? [];
+  return allowList.length === 0 || allowList.includes(channel);
+}
 /**
 * Create the unified message handler for the channel registry.
 * Each channel+sender pair gets its own AgentOrchestrator backed by a persistent session.
@@ -116,6 +128,31 @@ export function createMessageRouter(deps: {
  const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
  const talkModeUntil = new Map<string, number>();

+  /**
+   * Best-effort synthesis of a spoken version of `responseText` for `channel`.
+   *
+   * Resolves to `undefined` when TTS is not enabled for the channel, when there
+   * is no text to speak, or when no endpoint can be determined (only the
+   * `openai` provider type has a built-in default endpoint). Errors thrown by
+   * synthesis are logged as a warning and also yield `undefined`, so a TTS
+   * failure does not prevent the text reply from being sent.
+   */
+  async function maybeBuildTtsAttachment(responseText: string, channel: string) {
+    if (!isTtsEnabledForChannel(deps.config, channel)) {
+      return undefined;
+    }
+
+    // Nothing to speak: skip the provider round-trip entirely.
+    if (responseText.trim().length === 0) {
+      return undefined;
+    }
+
+    const provider = deps.config.tts?.provider;
+    const endpoint = provider?.endpoint ?? (provider?.type === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined);
+    if (!endpoint) {
+      return undefined;
+    }
+
+    try {
+      return await synthesizeSpeechAttachment(responseText, {
+        endpoint,
+        apiKey: provider?.api_key,
+        model: provider?.model,
+        voice: provider?.voice,
+        format: provider?.format,
+      });
+    } catch (error) {
+      // Keep the detail of non-Error throwables instead of collapsing them to a fixed string.
+      const detail = error instanceof Error ? error.message : String(error);
+      console.warn(`TTS synthesis failed for channel ${channel}:`, detail);
+      return undefined;
+    }
+  }
+
  function getOrCreateAgent(channel: string, senderId: string, metadata?: Record<string, unknown>, agentOverride?: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
    // Resolve agent config name via routing (sender → channel → default fallback)
    const agentConfigName = agentOverride ?? deps.agentRouter?.resolve(channel, senderId);
@@ -998,7 +1035,12 @@ export function createMessageRouter(deps: {
            history,
          });
          session.addMessage({ role: 'assistant', content: response });
-          await reply({ text: response, replyTo: msg.id });
+          const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
+          await reply({
+            text: response,
+            replyTo: msg.id,
+            attachments: ttsAttachment ? [ttsAttachment] : undefined,
+          });
          return;
        } catch (error) {
          const detail = error instanceof Error ? error.message : String(error);
@@ -1031,10 +1073,14 @@ export function createMessageRouter(deps: {
        response = await agent.process(messageText, attachments);
      }
      const outboundAttachments = collector.drain();
+      const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
+      const mergedAttachments = ttsAttachment
+        ? [...outboundAttachments, ttsAttachment]
+        : outboundAttachments;
      await reply({
        text: response,
        replyTo: msg.id,
-        attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
+        attachments: mergedAttachments.length > 0 ? mergedAttachments : undefined,
      });
    } catch (error) {
      console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);