feat: harden voice reliability with tts fallback and talk controls

This commit is contained in:
William Valentin
2026-02-26 17:29:23 -08:00
parent 2a9bed8c91
commit 163b1a0139
13 changed files with 781 additions and 17 deletions
+174
View File
@@ -2393,6 +2393,100 @@ describe('daemon tts routing integration', () => {
expect(outbound?.attachments).toBeUndefined();
});
it('falls back to secondary TTS provider when primary fails', async () => {
  // The two provider endpoints, listed in the order they are configured below.
  const primaryEndpoint = 'https://tts-primary.example.com/v1/audio/speech';
  const backupEndpoint = 'https://tts-backup.example.com/v1/audio/speech';

  // The agent itself is stubbed out; only the TTS leg of the pipeline is under test.
  vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('fallback-chain response');

  // First HTTP call answers 503, second call answers 200 with three audio bytes.
  const primaryFailure = {
    ok: false,
    status: 503,
    statusText: 'Service Unavailable',
    text: async () => 'primary down',
  } as Response;
  const backupSuccess = {
    ok: true,
    status: 200,
    statusText: 'OK',
    arrayBuffer: async () => new Uint8Array([5, 6, 7]).buffer,
  } as Response;
  const fetchSpy = vi.spyOn(globalThis, 'fetch')
    .mockResolvedValueOnce(primaryFailure)
    .mockResolvedValueOnce(backupSuccess);

  // Minimal session stub: every method is a spy, history is empty, no stored config.
  const session = {
    id: 'telegram:tts-user-4',
    addMessage: vi.fn(),
    getHistory: vi.fn(() => []),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn(() => undefined),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };
  const sessionManager = { getSession: vi.fn(() => session) };

  // Single-tier model router stub.
  const modelRouter = {
    getAvailableTiers() { return ['default']; },
    getAllLabels() { return { default: 'default' }; },
    getLabel(tier: string) { return tier; },
  };

  const toolRegistry = { clone() { return this; }, register: vi.fn() };

  // TTS is enabled for telegram with two custom providers and a two-attempt fallback budget.
  const ttsConfig = {
    enabled: true,
    enabled_channels: ['telegram'],
    providers: [
      { name: 'primary', type: 'custom', endpoint: primaryEndpoint },
      { name: 'backup', type: 'custom', endpoint: backupEndpoint },
    ],
    fallback: {
      max_attempts: 2,
      failure_cooldown_ms: 60000,
    },
  };
  const agentsConfig = {
    primary_tier: 'default',
    delegation: {
      compaction: 'default',
      memory_extraction: 'default',
      classification: 'default',
      tool_summarisation: 'default',
      complex_reasoning: 'default',
    },
    max_delegation_depth: 1,
    max_iterations: 3,
  };

  const router = createMessageRouter({
    sessionManager: sessionManager as unknown as MessageRouterDeps['sessionManager'],
    modelRouter: modelRouter as unknown as MessageRouterDeps['modelRouter'],
    systemPrompt: 'test prompt',
    toolRegistry: toolRegistry as unknown as MessageRouterDeps['toolRegistry'],
    toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
    config: {
      agents: agentsConfig,
      compaction: { enabled: false },
      models: { default: { provider: 'anthropic', model: 'claude' } },
      tts: ttsConfig,
    } as unknown as MessageRouterDeps['config'],
  });

  const reply = vi.fn(async (_message: OutboundMessage) => {});
  const inbound = {
    id: 'tts-4',
    channel: 'telegram',
    senderId: 'tts-user-4',
    text: 'respond with provider fallback',
    timestamp: Date.now(),
  } as MessageRouterInput;
  await router.handler(inbound, reply);

  // Both endpoints must have been hit exactly once each, primary first.
  expect(fetchSpy).toHaveBeenCalledTimes(2);
  const [firstFetch, secondFetch] = fetchSpy.mock.calls;
  expect(firstFetch?.[0]).toBe(primaryEndpoint);
  expect(secondFetch?.[0]).toBe(backupEndpoint);

  // The reply carries the backup provider's audio; 'BQYH' is base64 for bytes 5, 6, 7.
  const [firstReply] = reply.mock.calls;
  const outbound = firstReply?.[0] as OutboundMessage | undefined;
  expect(outbound?.attachments?.[0]).toMatchObject({
    mimeType: 'audio/mpeg',
    data: 'BQYH',
  });
});
it('falls back to text-only replies when tts synthesis fails', async () => {
vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('fallback response');
vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('tts down'));
@@ -2909,4 +3003,84 @@ describe('daemon talk mode (voice wake) integration', () => {
expect(processSpy).toHaveBeenCalledOnce();
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined, undefined);
});
// Verifies that, once talk mode has been activated by the wake phrase, a plain
// "cancel" message cancels the in-flight run instead of starting a new one.
it('treats spoken cancel as /stop while talk mode is active', async () => {
// Observe cancel() calls and make the orchestrator report itself as cancellable.
const cancelSpy = vi.spyOn(AgentOrchestrator.prototype, 'cancel');
vi.spyOn(AgentOrchestrator.prototype, 'isCancellable').mockReturnValue(true);
const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process');
// Deferred handles: `resolveFirst` completes the first process() call on demand,
// `markStarted` resolves `started` as soon as that call has begun.
let resolveFirst: ((value: string) => void) | undefined;
let markStarted: (() => void) | undefined;
const started = new Promise<void>((resolve) => { markStarted = resolve; });
// The first process() call signals that it started and then stays pending
// until the test resolves it at the very end.
processSpy.mockImplementationOnce(() => {
markStarted?.();
return new Promise<string>((resolve) => { resolveFirst = resolve; });
});
// Session stub: all methods are spies, history is empty, no stored config.
const session = {
id: 'telegram:user-talk-2',
addMessage: vi.fn(),
getHistory: vi.fn(() => []),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn(() => undefined),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
// Real command registry with the built-in commands registered
// (presumably including /stop; confirm against registerBuiltinCommands).
const commandRegistry = new CommandRegistry();
registerBuiltinCommands(commandRegistry);
const router = createMessageRouter({
sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
modelRouter: {
getAvailableTiers: () => ['fast', 'default', 'complex', 'local'],
getAllLabels: () => ({ fast: 'fast', default: 'default', complex: 'complex', local: 'local' }),
getLabel: (tier: string) => tier,
} as unknown as MessageRouterDeps['modelRouter'],
systemPrompt: 'test prompt',
toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
config: {
agents: {
primary_tier: 'default',
delegation: { compaction: 'fast', memory_extraction: 'fast', classification: 'fast', tool_summarisation: 'fast', complex_reasoning: 'complex' },
max_delegation_depth: 3,
max_iterations: 10,
},
compaction: { enabled: false },
models: { default: { provider: 'anthropic', model: 'claude' } },
// Talk mode is enabled with the wake phrase 'hey flynn' and a 120 s timeout.
audio: { talk_mode: { enabled: true, wake_phrase: 'hey flynn', timeout_ms: 120000, allow_manual_toggle: true } },
} as unknown as MessageRouterDeps['config'],
commandRegistry,
});
const reply = vi.fn(async (_message: OutboundMessage) => {});
// Message 1 carries the wake phrase plus a request. It is deliberately NOT awaited
// here: the mocked process() keeps it in flight while the cancel message arrives.
const firstRun = router.handler({
id: 'm-talk-3',
channel: 'telegram',
senderId: 'user-talk-2',
text: 'hey flynn start a long task',
timestamp: Date.now(),
} as MessageRouterInput, reply);
// Wait until process() has actually been entered before sending the cancel.
await started;
// Message 2 is the bare word "cancel" from the same sender, without the wake phrase.
await router.handler({
id: 'm-talk-4',
channel: 'telegram',
senderId: 'user-talk-2',
text: 'cancel',
timestamp: Date.now(),
} as MessageRouterInput, reply);
// The active run was cancelled exactly once and "cancel" was not processed as a prompt.
expect(cancelSpy).toHaveBeenCalledTimes(1);
expect(processSpy).toHaveBeenCalledTimes(1);
// The acknowledgement is addressed to the cancel message itself.
expect(reply).toHaveBeenCalledWith(expect.objectContaining({
text: 'Cancellation requested. The active operation will stop at the next safe point.',
replyTo: 'm-talk-4',
}));
// Release the pending first run and await it so no promise outlives the test.
resolveFirst?.('operation cancelled by user.');
await firstRun;
});
});
+44 -15
View File
@@ -1,7 +1,7 @@
import type { AudioTranscriptionConfig } from '../models/media.js';
import type { Attachment } from '../channels/types.js';
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
import { synthesizeSpeechAttachment } from '../models/tts.js';
import { synthesizeSpeechWithFallback, TtsHealthTracker } from '../models/tts.js';
import { supportsAudioInput } from '../models/capabilities.js';
import { AgentOrchestrator, SubagentManager, type DelegationConfig } from '../backends/index.js';
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
@@ -397,6 +397,7 @@ export function createMessageRouter(deps: {
const talkModeUntil = new Map<string, number>();
const activeRuns = new Map<string, AgentOrchestrator>();
const reactionCooldowns = new Map<string, number>();
const ttsHealthTracker = new TtsHealthTracker();
function getBackendMode(): BackendRuntimeMode {
return deps.getBackendMode?.() ?? 'config_default';
@@ -518,24 +519,42 @@ export function createMessageRouter(deps: {
return undefined;
}
const provider = deps.config.tts?.provider;
const endpoint = provider?.endpoint ?? (provider?.type === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined);
if (!endpoint) {
const configuredProviders = deps.config.tts?.providers ?? [];
const providers = configuredProviders.length > 0
? configuredProviders
: (deps.config.tts?.provider ? [deps.config.tts.provider] : []);
if (providers.length === 0) {
return undefined;
}
try {
return await synthesizeSpeechAttachment(responseText, {
endpoint,
apiKey: provider?.api_key,
model: provider?.model,
voice: provider?.voice,
format: provider?.format,
});
} catch (error) {
console.warn(`TTS synthesis failed for channel ${channel}:`, error instanceof Error ? error.message : 'Unknown error');
return undefined;
const outcome = await synthesizeSpeechWithFallback(responseText, {
providers: providers.map((provider, index) => ({
id: provider.name?.trim() || `tts-provider-${index + 1}`,
type: provider.type,
endpoint: provider.endpoint,
apiKey: provider.api_key,
model: provider.model,
voice: provider.voice,
format: provider.format,
})),
fallback: {
maxAttempts: deps.config.tts?.fallback?.max_attempts,
failureCooldownMs: deps.config.tts?.fallback?.failure_cooldown_ms,
},
healthTracker: ttsHealthTracker,
});
// Log a single diagnostic line when at least one provider was tried and none
// produced an attachment. Nothing is logged when no provider was attempted.
if (!outcome.attachment && outcome.attemptedProviders.length > 0) {
  // Build the whole message first and trim it as a unit. Previously `.trim()` was
  // attached to the last template literal only (member access binds tighter than
  // `+`), so a missing last_error left a trailing space on the logged line.
  const warning = [
    `TTS synthesis fallback exhausted for channel ${channel}.`,
    `attempted=${outcome.attemptedProviders.join(',') || 'none'}`,
    `skipped=${outcome.skippedProviders.join(',') || 'none'}`,
    outcome.lastError ? `last_error=${outcome.lastError}` : '',
  ].join(' ').trim();
  console.warn(warning);
}
return outcome.attachment ?? undefined;
}
function getOrCreateAgent(
@@ -822,6 +841,7 @@ export function createMessageRouter(deps: {
let incomingText = msg.text;
let matchedReactionName: string | undefined;
const talkMode = deps.config.audio?.talk_mode;
let inTalkModeContext = false;
if (talkMode?.enabled && incomingText.trim().length > 0) {
const key = `${msg.channel}:${msg.senderId}`;
const now = Date.now();
@@ -858,6 +878,7 @@ export function createMessageRouter(deps: {
if (wakeMatched && wakeRegex) {
talkModeUntil.set(key, now + timeoutMs);
inTalkModeContext = true;
incomingText = incomingText.replace(wakeRegex, '').trim();
if (!incomingText) {
await reply({ text: `Listening. Talk mode active for ${Math.ceil(timeoutMs / 1000)}s.`, replyTo: msg.id });
@@ -865,11 +886,19 @@ export function createMessageRouter(deps: {
}
} else if (currentUntil > now) {
talkModeUntil.set(key, now + timeoutMs);
inTalkModeContext = true;
} else {
return;
}
}
// While talk mode is active, and the message is not already flagged as a command,
// rewrite a spoken "stop" or "cancel" to the '/stop' command text.
if (inTalkModeContext && !msg.metadata?.isCommand) {
// Comparison is on the trimmed, lower-cased text and must match exactly.
// NOTE(review): input such as "Stop." or "cancel please" will not match;
// confirm whether transcripts can carry punctuation or extra words.
const spokenCommand = incomingText.trim().toLowerCase();
if (spokenCommand === 'stop' || spokenCommand === 'cancel') {
incomingText = '/stop';
}
}
const session = deps.sessionManager.getSession(msg.channel, msg.senderId);
const queueMode = session.getConfig('queue.mode') ?? deps.config.server?.queue?.mode ?? 'collect';
const rawCommand = msg.metadata?.isCommand