feat: harden voice reliability with tts fallback and talk controls
This commit is contained in:
@@ -2393,6 +2393,100 @@ describe('daemon tts routing integration', () => {
|
||||
expect(outbound?.attachments).toBeUndefined();
|
||||
});
|
||||
|
||||
/**
 * Provider fallback chain: the first configured TTS endpoint answers with a
 * 503, the second answers with audio bytes. The test expects exactly two
 * fetch calls (primary, then backup) and an audio attachment on the reply.
 */
it('falls back to secondary TTS provider when primary fails', async () => {
  const primaryEndpoint = 'https://tts-primary.example.com/v1/audio/speech';
  const backupEndpoint = 'https://tts-backup.example.com/v1/audio/speech';

  vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('fallback-chain response');

  // Response handed to the first fetch call: the primary provider is down.
  const primaryFailure = {
    ok: false,
    status: 503,
    statusText: 'Service Unavailable',
    text: async () => 'primary down',
  } as Response;

  // Response handed to the second fetch call: three bytes of stub audio.
  const backupSuccess = {
    ok: true,
    status: 200,
    statusText: 'OK',
    arrayBuffer: async () => Uint8Array.from([5, 6, 7]).buffer,
  } as Response;

  const fetchSpy = vi.spyOn(globalThis, 'fetch')
    .mockResolvedValueOnce(primaryFailure)
    .mockResolvedValueOnce(backupSuccess);

  // Minimal session stub; every method is a spy and no config overrides exist.
  const sessionStub = {
    id: 'telegram:tts-user-4',
    addMessage: vi.fn(),
    getHistory: vi.fn(() => []),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn(() => undefined),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };

  const sessionManager = {
    getSession: vi.fn(() => sessionStub),
  } as unknown as MessageRouterDeps['sessionManager'];

  const modelRouter = {
    getAvailableTiers: () => ['default'],
    getAllLabels: () => ({ default: 'default' }),
    getLabel: (tier: string) => tier,
  } as unknown as MessageRouterDeps['modelRouter'];

  // Two custom providers in priority order, with a two-attempt fallback budget.
  const ttsConfig = {
    enabled: true,
    enabled_channels: ['telegram'],
    providers: [
      { name: 'primary', type: 'custom', endpoint: primaryEndpoint },
      { name: 'backup', type: 'custom', endpoint: backupEndpoint },
    ],
    fallback: {
      max_attempts: 2,
      failure_cooldown_ms: 60000,
    },
  };

  const config = {
    agents: {
      primary_tier: 'default',
      delegation: {
        compaction: 'default',
        memory_extraction: 'default',
        classification: 'default',
        tool_summarisation: 'default',
        complex_reasoning: 'default',
      },
      max_delegation_depth: 1,
      max_iterations: 3,
    },
    compaction: { enabled: false },
    models: { default: { provider: 'anthropic', model: 'claude' } },
    tts: ttsConfig,
  } as unknown as MessageRouterDeps['config'];

  const router = createMessageRouter({
    sessionManager,
    modelRouter,
    systemPrompt: 'test prompt',
    toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
    toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
    config,
  });

  const reply = vi.fn(async (_message: OutboundMessage) => {});
  const inbound = {
    id: 'tts-4',
    channel: 'telegram',
    senderId: 'tts-user-4',
    text: 'respond with provider fallback',
    timestamp: Date.now(),
  } as MessageRouterInput;

  await router.handler(inbound, reply);

  // Both providers were contacted, primary first.
  expect(fetchSpy).toHaveBeenCalledTimes(2);
  const [primaryCall, backupCall] = fetchSpy.mock.calls;
  expect(primaryCall?.[0]).toBe(primaryEndpoint);
  expect(backupCall?.[0]).toBe(backupEndpoint);

  // 'BQYH' is the base64 encoding of the bytes [5, 6, 7].
  const firstReply = reply.mock.calls[0]?.[0] as OutboundMessage | undefined;
  expect(firstReply?.attachments?.[0]).toMatchObject({
    mimeType: 'audio/mpeg',
    data: 'BQYH',
  });
});
|
||||
|
||||
it('falls back to text-only replies when tts synthesis fails', async () => {
|
||||
vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('fallback response');
|
||||
vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('tts down'));
|
||||
@@ -2909,4 +3003,84 @@ describe('daemon talk mode (voice wake) integration', () => {
|
||||
expect(processSpy).toHaveBeenCalledOnce();
|
||||
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined, undefined);
|
||||
});
|
||||
|
||||
/**
 * While a wake-phrase message is still being processed, a follow-up message
 * whose whole text is "cancel" must be handled as the /stop command: the
 * orchestrator's cancel() is invoked once, process() is not called a second
 * time, and the user receives the cancellation acknowledgement.
 */
it('treats spoken cancel as /stop while talk mode is active', async () => {
  const cancelSpy = vi.spyOn(AgentOrchestrator.prototype, 'cancel');
  vi.spyOn(AgentOrchestrator.prototype, 'isCancellable').mockReturnValue(true);
  const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process');

  // The first process() call is parked on a promise the test controls, so the
  // run is still "active" when the spoken cancel arrives.
  let resolveFirst: ((value: string) => void) | undefined;
  let markStarted: (() => void) | undefined;
  const started = new Promise<void>((resolve) => { markStarted = resolve; });
  processSpy.mockImplementationOnce(() => {
    markStarted?.();
    return new Promise<string>((resolve) => { resolveFirst = resolve; });
  });

  const session = {
    id: 'telegram:user-talk-2',
    addMessage: vi.fn(),
    getHistory: vi.fn(() => []),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn(() => undefined),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };

  const commandRegistry = new CommandRegistry();
  registerBuiltinCommands(commandRegistry);

  const router = createMessageRouter({
    sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
    modelRouter: {
      getAvailableTiers: () => ['fast', 'default', 'complex', 'local'],
      getAllLabels: () => ({ fast: 'fast', default: 'default', complex: 'complex', local: 'local' }),
      getLabel: (tier: string) => tier,
    } as unknown as MessageRouterDeps['modelRouter'],
    systemPrompt: 'test prompt',
    toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
    toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
    config: {
      agents: {
        primary_tier: 'default',
        delegation: { compaction: 'fast', memory_extraction: 'fast', classification: 'fast', tool_summarisation: 'fast', complex_reasoning: 'complex' },
        max_delegation_depth: 3,
        max_iterations: 10,
      },
      compaction: { enabled: false },
      models: { default: { provider: 'anthropic', model: 'claude' } },
      audio: { talk_mode: { enabled: true, wake_phrase: 'hey flynn', timeout_ms: 120000, allow_manual_toggle: true } },
    } as unknown as MessageRouterDeps['config'],
    commandRegistry,
  });

  const reply = vi.fn(async (_message: OutboundMessage) => {});

  // Deliberately not awaited: this call stays parked inside process().
  const firstRun = router.handler({
    id: 'm-talk-3',
    channel: 'telegram',
    senderId: 'user-talk-2',
    text: 'hey flynn start a long task',
    timestamp: Date.now(),
  } as MessageRouterInput, reply);

  await started;

  try {
    await router.handler({
      id: 'm-talk-4',
      channel: 'telegram',
      senderId: 'user-talk-2',
      text: 'cancel',
      timestamp: Date.now(),
    } as MessageRouterInput, reply);

    expect(cancelSpy).toHaveBeenCalledTimes(1);
    expect(processSpy).toHaveBeenCalledTimes(1);
    expect(reply).toHaveBeenCalledWith(expect.objectContaining({
      text: 'Cancellation requested. The active operation will stop at the next safe point.',
      replyTo: 'm-talk-4',
    }));
  } finally {
    // Always release the parked run, even when an assertion above throws, so
    // the first handler call cannot be left pending after the test ends.
    resolveFirst?.('operation cancelled by user.');
    await firstRun;
  }
});
|
||||
});
|
||||
|
||||
+44
-15
@@ -1,7 +1,7 @@
|
||||
import type { AudioTranscriptionConfig } from '../models/media.js';
|
||||
import type { Attachment } from '../channels/types.js';
|
||||
import { isSupportedAudio, transcribeAudio } from '../models/media.js';
|
||||
import { synthesizeSpeechAttachment } from '../models/tts.js';
|
||||
import { synthesizeSpeechWithFallback, TtsHealthTracker } from '../models/tts.js';
|
||||
import { supportsAudioInput } from '../models/capabilities.js';
|
||||
import { AgentOrchestrator, SubagentManager, type DelegationConfig } from '../backends/index.js';
|
||||
import { OutboundAttachmentCollector } from '../backends/native/attachments.js';
|
||||
@@ -397,6 +397,7 @@ export function createMessageRouter(deps: {
|
||||
const talkModeUntil = new Map<string, number>();
|
||||
const activeRuns = new Map<string, AgentOrchestrator>();
|
||||
const reactionCooldowns = new Map<string, number>();
|
||||
const ttsHealthTracker = new TtsHealthTracker();
|
||||
|
||||
function getBackendMode(): BackendRuntimeMode {
|
||||
return deps.getBackendMode?.() ?? 'config_default';
|
||||
@@ -518,24 +519,42 @@ export function createMessageRouter(deps: {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const provider = deps.config.tts?.provider;
|
||||
const endpoint = provider?.endpoint ?? (provider?.type === 'openai' ? 'https://api.openai.com/v1/audio/speech' : undefined);
|
||||
if (!endpoint) {
|
||||
const configuredProviders = deps.config.tts?.providers ?? [];
|
||||
const providers = configuredProviders.length > 0
|
||||
? configuredProviders
|
||||
: (deps.config.tts?.provider ? [deps.config.tts.provider] : []);
|
||||
|
||||
if (providers.length === 0) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
try {
|
||||
return await synthesizeSpeechAttachment(responseText, {
|
||||
endpoint,
|
||||
apiKey: provider?.api_key,
|
||||
model: provider?.model,
|
||||
voice: provider?.voice,
|
||||
format: provider?.format,
|
||||
});
|
||||
} catch (error) {
|
||||
console.warn(`TTS synthesis failed for channel ${channel}:`, error instanceof Error ? error.message : 'Unknown error');
|
||||
return undefined;
|
||||
const outcome = await synthesizeSpeechWithFallback(responseText, {
|
||||
providers: providers.map((provider, index) => ({
|
||||
id: provider.name?.trim() || `tts-provider-${index + 1}`,
|
||||
type: provider.type,
|
||||
endpoint: provider.endpoint,
|
||||
apiKey: provider.api_key,
|
||||
model: provider.model,
|
||||
voice: provider.voice,
|
||||
format: provider.format,
|
||||
})),
|
||||
fallback: {
|
||||
maxAttempts: deps.config.tts?.fallback?.max_attempts,
|
||||
failureCooldownMs: deps.config.tts?.fallback?.failure_cooldown_ms,
|
||||
},
|
||||
healthTracker: ttsHealthTracker,
|
||||
});
|
||||
|
||||
// Log once when at least one provider was actually attempted and none of them
// produced an attachment. Nothing is logged when no provider was attempted.
if (!outcome.attachment && outcome.attemptedProviders.length > 0) {
  // Assemble the detail fields separately and join them with single spaces.
  // The previous `a + b + `${c}`.trim()` form trimmed only the final template
  // literal (member calls bind tighter than `+`), so the message ended with a
  // stray space whenever lastError was empty.
  const details = [
    `attempted=${outcome.attemptedProviders.join(',') || 'none'}`,
    `skipped=${outcome.skippedProviders.join(',') || 'none'}`,
  ];
  if (outcome.lastError) {
    details.push(`last_error=${outcome.lastError}`);
  }
  console.warn(`TTS synthesis fallback exhausted for channel ${channel}. ${details.join(' ')}`);
}
|
||||
|
||||
return outcome.attachment ?? undefined;
|
||||
}
|
||||
|
||||
function getOrCreateAgent(
|
||||
@@ -822,6 +841,7 @@ export function createMessageRouter(deps: {
|
||||
let incomingText = msg.text;
|
||||
let matchedReactionName: string | undefined;
|
||||
const talkMode = deps.config.audio?.talk_mode;
|
||||
let inTalkModeContext = false;
|
||||
if (talkMode?.enabled && incomingText.trim().length > 0) {
|
||||
const key = `${msg.channel}:${msg.senderId}`;
|
||||
const now = Date.now();
|
||||
@@ -858,6 +878,7 @@ export function createMessageRouter(deps: {
|
||||
|
||||
if (wakeMatched && wakeRegex) {
|
||||
talkModeUntil.set(key, now + timeoutMs);
|
||||
inTalkModeContext = true;
|
||||
incomingText = incomingText.replace(wakeRegex, '').trim();
|
||||
if (!incomingText) {
|
||||
await reply({ text: `Listening. Talk mode active for ${Math.ceil(timeoutMs / 1000)}s.`, replyTo: msg.id });
|
||||
@@ -865,11 +886,19 @@ export function createMessageRouter(deps: {
|
||||
}
|
||||
} else if (currentUntil > now) {
|
||||
talkModeUntil.set(key, now + timeoutMs);
|
||||
inTalkModeContext = true;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Talk-mode voice shortcut: when the message is not already flagged as a
// command and its entire text (trimmed, case-insensitive) is "stop" or
// "cancel", rewrite it to the /stop command text.
if (inTalkModeContext && !msg.metadata?.isCommand) {
  const utterance = incomingText.trim().toLowerCase();
  const isSpokenStop = ['stop', 'cancel'].includes(utterance);
  if (isSpokenStop) {
    incomingText = '/stop';
  }
}
|
||||
|
||||
const session = deps.sessionManager.getSession(msg.channel, msg.senderId);
|
||||
const queueMode = session.getConfig('queue.mode') ?? deps.config.server?.queue?.mode ?? 'collect';
|
||||
const rawCommand = msg.metadata?.isCommand
|
||||
|
||||
Reference in New Issue
Block a user