feat: implement tier-a4 tts voice output replies
This commit is contained in:
@@ -1122,6 +1122,156 @@ describe('daemon audio routing integration', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('daemon tts routing integration', () => {
  afterEach(() => {
    vi.restoreAllMocks();
  });

  /**
   * Minimal session stub: every method is a bare `vi.fn()`, history is always
   * empty and no per-session config is stored.
   */
  function createSessionStub(id: string) {
    return {
      id,
      addMessage: vi.fn(),
      getHistory: vi.fn(() => []),
      clear: vi.fn(),
      replaceHistory: vi.fn(),
      getConfig: vi.fn(() => undefined),
      setConfig: vi.fn(),
      deleteConfig: vi.fn(),
    };
  }

  /**
   * Build a message router whose config has TTS switched on for the
   * `telegram` channel only. The two tests differ only in the session id and
   * in how fully the TTS provider is specified, so those are the parameters.
   */
  function createTtsRouter(sessionId: string, provider: Record<string, unknown>) {
    const session = createSessionStub(sessionId);
    return createMessageRouter({
      sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
      modelRouter: {
        getAvailableTiers: () => ['default'],
        getAllLabels: () => ({ default: 'default' }),
        getLabel: (tier: string) => tier,
      } as unknown as MessageRouterDeps['modelRouter'],
      systemPrompt: 'test prompt',
      toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
      toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
      config: {
        agents: {
          primary_tier: 'default',
          delegation: {
            compaction: 'default',
            memory_extraction: 'default',
            classification: 'default',
            tool_summarisation: 'default',
            complex_reasoning: 'default',
          },
          max_delegation_depth: 1,
          max_iterations: 3,
        },
        compaction: { enabled: false },
        models: { default: { provider: 'anthropic', model: 'claude' } },
        tts: {
          enabled: true,
          enabled_channels: ['telegram'],
          provider,
        },
      } as unknown as MessageRouterDeps['config'],
    });
  }

  it('attaches synthesized audio reply when tts is enabled for the channel', async () => {
    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('voice-enabled response');
    // The fake provider answers with three raw bytes; base64 of [7, 8, 9] is 'BwgJ'.
    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
      ok: true,
      status: 200,
      statusText: 'OK',
      arrayBuffer: async () => Uint8Array.from([7, 8, 9]).buffer,
    } as Response);

    const router = createTtsRouter('telegram:tts-user-1', {
      type: 'custom',
      endpoint: 'https://example.com/v1/audio/speech',
      api_key: 'sk-test',
      model: 'gpt-4o-mini-tts',
      voice: 'alloy',
      format: 'mp3',
    });

    const reply = vi.fn(async (_message: OutboundMessage) => {});
    await router.handler({
      id: 'tts-1',
      channel: 'telegram',
      senderId: 'tts-user-1',
      text: 'say hello',
      timestamp: Date.now(),
    } as MessageRouterInput, reply);

    expect(processSpy).toHaveBeenCalledTimes(1);
    expect(fetchSpy).toHaveBeenCalledTimes(1);
    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
    expect(outbound?.attachments).toBeDefined();
    expect(outbound?.attachments?.[0]).toMatchObject({
      mimeType: 'audio/mpeg',
      data: 'BwgJ',
    });
  });

  it('does not synthesize tts when channel is not enabled', async () => {
    vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('text-only response');
    // Reject instead of calling through, so a regression can never hit the real network.
    const fetchSpy = vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('fetch must not be called'));

    const router = createTtsRouter('discord:tts-user-2', {
      type: 'custom',
      endpoint: 'https://example.com/v1/audio/speech',
    });

    const reply = vi.fn(async (_message: OutboundMessage) => {});
    await router.handler({
      id: 'tts-2',
      channel: 'discord',
      senderId: 'tts-user-2',
      text: 'respond as text',
      timestamp: Date.now(),
    } as MessageRouterInput, reply);

    expect(fetchSpy).not.toHaveBeenCalled();
    const outbound = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
    // Guard against a vacuous pass: without a reply, `outbound?.attachments` is trivially undefined.
    expect(outbound).toBeDefined();
    expect(outbound?.attachments).toBeUndefined();
  });
});
|
||||
|
||||
describe('daemon auto-escalate integration', () => {
|
||||
afterEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
|
||||
+48
-2
@@ -1,6 +1,7 @@
|
||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||
import type { Attachment } from '../channels/types.js';
|
||||
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
||||
import { synthesizeSpeechAttachment } from '../models/tts.js';
|
||||
import { supportsAudioInput } from '../models/capabilities.js';
|
||||
import { AgentOrchestrator, type DelegationConfig } from '../backends/index.js';
|
||||
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
||||
@@ -84,6 +85,17 @@ function parseResearchPrefix(text: string): string | undefined {
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Decide whether a synthesized voice reply should be attempted for a channel.
 *
 * TTS is off unless `config.tts.enabled` is truthy. When it is on, a missing
 * or empty `enabled_channels` list places no restriction on the channel;
 * otherwise the channel must appear in that list.
 *
 * @param config - Daemon configuration; only its `tts` section is read.
 * @param channel - Channel identifier of the message being answered.
 * @returns `true` when TTS applies to `channel`, `false` otherwise.
 */
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
  const tts = config.tts;
  if (!tts?.enabled) {
    return false;
  }

  const allowedChannels = tts.enabled_channels ?? [];
  // An empty allow-list means "every channel", not "no channel".
  return allowedChannels.length === 0 || allowedChannels.includes(channel);
}
|
||||
/**
|
||||
* Create the unified message handler for the channel registry.
|
||||
* Each channel+sender pair gets its own AgentOrchestrator backed by a persistent session.
|
||||
@@ -116,6 +128,31 @@ export function createMessageRouter(deps: {
|
||||
const agents = new Map<string, { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector }>();
|
||||
const talkModeUntil = new Map<string, number>();
|
||||
|
||||
/**
 * Best-effort synthesis of a voice attachment for an outgoing reply.
 *
 * Resolves to `undefined` when TTS is not enabled for `channel`, when there
 * is no text to speak, when no endpoint can be resolved, or when synthesis
 * throws. Synthesis failures are logged as warnings and swallowed so the
 * caller can still send the text reply.
 *
 * @param responseText - Reply text to convert to speech.
 * @param channel - Channel the reply is being sent on.
 * @returns Whatever `synthesizeSpeechAttachment` resolves to, or `undefined`.
 */
async function maybeBuildTtsAttachment(responseText: string, channel: string) {
  if (!isTtsEnabledForChannel(deps.config, channel)) {
    return undefined;
  }

  // Nothing to speak: skip the provider round-trip for empty or whitespace-only replies.
  if (!responseText || responseText.trim().length === 0) {
    return undefined;
  }

  const provider = deps.config.tts?.provider;
  // An explicit endpoint wins; only the 'openai' provider type has a built-in default.
  const endpoint = provider?.endpoint
    ?? (provider?.type === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined);
  if (!endpoint) {
    return undefined;
  }

  try {
    return await synthesizeSpeechAttachment(responseText, {
      endpoint,
      apiKey: provider?.api_key,
      model: provider?.model,
      voice: provider?.voice,
      format: provider?.format,
    });
  } catch (error) {
    // Stringify non-Error throwables so the warning keeps whatever detail was thrown.
    const detail = error instanceof Error ? error.message : String(error);
    console.warn(`TTS synthesis failed for channel ${channel}:`, detail);
    return undefined;
  }
}
|
||||
|
||||
function getOrCreateAgent(channel: string, senderId: string, metadata?: Record<string, unknown>, agentOverride?: string): { orchestrator: AgentOrchestrator; collector: OutboundAttachmentCollector } {
|
||||
// Resolve agent config name via routing (sender → channel → default fallback)
|
||||
const agentConfigName = agentOverride ?? deps.agentRouter?.resolve(channel, senderId);
|
||||
@@ -998,7 +1035,12 @@ export function createMessageRouter(deps: {
|
||||
history,
|
||||
});
|
||||
session.addMessage({ role: 'assistant', content: response });
|
||||
await reply({ text: response, replyTo: msg.id });
|
||||
const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
|
||||
await reply({
|
||||
text: response,
|
||||
replyTo: msg.id,
|
||||
attachments: ttsAttachment ? [ttsAttachment] : undefined,
|
||||
});
|
||||
return;
|
||||
} catch (error) {
|
||||
const detail = error instanceof Error ? error.message : String(error);
|
||||
@@ -1031,10 +1073,14 @@ export function createMessageRouter(deps: {
|
||||
response = await agent.process(messageText, attachments);
|
||||
}
|
||||
const outboundAttachments = collector.drain();
|
||||
const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
|
||||
const mergedAttachments = ttsAttachment
|
||||
? [...outboundAttachments, ttsAttachment]
|
||||
: outboundAttachments;
|
||||
await reply({
|
||||
text: response,
|
||||
replyTo: msg.id,
|
||||
attachments: outboundAttachments.length > 0 ? outboundAttachments : undefined,
|
||||
attachments: mergedAttachments.length > 0 ? mergedAttachments : undefined,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(`Error processing message from ${msg.channel}:${msg.senderId}:`, error);
|
||||
|
||||
Reference in New Issue
Block a user