routing: fast-path voice when transcription not configured

This commit is contained in:
William Valentin
2026-02-13 18:35:04 -08:00
parent 7df0569a39
commit 944b2c916a
3 changed files with 184 additions and 9 deletions
+151
View File
@@ -380,3 +380,154 @@ describe('daemon command fast-path integration', () => {
expect(keys.some(key => key.includes(':assistant'))).toBe(true);
});
});
/**
 * Integration tests for how the message router handles voice/audio attachments.
 *
 * Two scenarios are covered:
 *  - `audio.enabled` is false: the router replies directly and the agent is never invoked.
 *  - an audio provider is configured: the transcription endpoint is called, the transcript
 *    is merged into the text handed to the agent, and the audio attachment is dropped
 *    while other attachments are kept.
 */
describe('daemon audio routing integration', () => {
  afterEach(() => {
    vi.restoreAllMocks();
  });

  /** Creates a session stub whose methods are all vitest mocks. */
  function makeSessionStub(sessionId: string) {
    return {
      id: sessionId,
      addMessage: vi.fn(),
      getHistory: vi.fn(() => []),
      clear: vi.fn(),
      replaceHistory: vi.fn(),
      getConfig: vi.fn(() => undefined),
      setConfig: vi.fn(),
      deleteConfig: vi.fn(),
    };
  }

  /**
   * Builds a message router wired to stub dependencies.
   *
   * @param sessionId id given to the single session the stub session manager returns
   * @param audio value placed under `config.audio`; the only setting that differs between tests
   */
  function makeRouter(sessionId: string, audio: unknown) {
    const sessionStub = makeSessionStub(sessionId);
    const registry = new CommandRegistry();
    registerBuiltinCommands(registry);

    // Every delegation role is pinned to the single available tier.
    const delegation = {
      compaction: 'default',
      memory_extraction: 'default',
      classification: 'default',
      tool_summarisation: 'default',
      complex_reasoning: 'default',
    };

    return createMessageRouter({
      sessionManager: { getSession: vi.fn(() => sessionStub) } as any,
      modelRouter: {
        getAvailableTiers: () => ['default'],
        getAllLabels: () => ({ default: 'default' }),
        getLabel: (tier: string) => tier,
      } as any,
      systemPrompt: 'test prompt',
      toolRegistry: { clone() { return this; }, register: vi.fn() } as any,
      toolExecutor: {} as any,
      config: {
        agents: {
          primary_tier: 'default',
          delegation,
          max_delegation_depth: 1,
          max_iterations: 3,
        },
        compaction: { enabled: false },
        // Anthropic doesn't support native audio; ensures routing hits the non-audio path.
        models: { default: { provider: 'anthropic', model: 'claude' } },
        audio,
      } as any,
      commandRegistry: registry,
    });
  }

  it('fast-path replies for voice attachments when transcription is not configured and model does not support audio', async () => {
    // Spy only (no stubbed result): the expectation is that the agent is never reached.
    const agentProcess = vi.spyOn(AgentOrchestrator.prototype, 'process');
    const router = makeRouter('telegram:user-voice-1', { enabled: false });
    const reply = vi.fn(async () => {});

    const incoming = {
      id: 'v1',
      channel: 'telegram',
      senderId: 'user-voice-1',
      text: '',
      attachments: [{ mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' }],
      timestamp: Date.now(),
    } as any;
    await router.handler(incoming, reply);

    expect(agentProcess).not.toHaveBeenCalled();
    expect(reply).toHaveBeenCalledTimes(1);

    // The first argument of the single reply call carries the user-facing text.
    const firstCallArgs = reply.mock.calls[0] as unknown as any[];
    const sent = firstCallArgs[0] as { text?: string };
    expect(String(sent.text)).toContain('audio transcription is not configured');
  });

  it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => {
    const agentProcess = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');

    // Stand in for the transcription endpoint: any fetch resolves to a fixed transcript.
    const transcriptionResponse = {
      ok: true,
      status: 200,
      statusText: 'OK',
      json: async () => ({ text: 'hello world' }),
    } as any;
    const fetchMock = vi.spyOn(globalThis, 'fetch' as any).mockResolvedValue(transcriptionResponse);

    const router = makeRouter('telegram:user-voice-2', {
      enabled: true,
      provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' },
    });
    const reply = vi.fn(async () => {});

    // One audio and one image attachment, so the filter can be checked in both directions.
    const incoming = {
      id: 'v2',
      channel: 'telegram',
      senderId: 'user-voice-2',
      text: 'caption',
      attachments: [
        { mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' },
        { mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' },
      ],
      timestamp: Date.now(),
    } as any;
    await router.handler(incoming, reply);

    expect(fetchMock).toHaveBeenCalled();
    expect(agentProcess).toHaveBeenCalledTimes(1);

    const [textArg, attachmentsArg] = agentProcess.mock.calls[0] ?? [];
    const forwardedText = String(textArg);
    expect(forwardedText).toContain('[Voice message]: hello world');
    expect(forwardedText).toContain('caption');

    const forwarded = attachmentsArg as any[] | undefined;
    const hasMimeType = (mime: string) => forwarded?.some(att => att.mimeType === mime);
    expect(hasMimeType('audio/ogg')).toBe(false);
    expect(hasMimeType('image/jpeg')).toBe(true);
  });
});
+22 -8
View File
@@ -441,14 +441,28 @@ export function createMessageRouter(deps: {
}
: undefined;
if (audioConfig?.endpoint) {
for (const att of audioAttachments) {
const transcript = await transcribeAudio(att, audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
}
} else {
// No transcription endpoint configured — inform the user gracefully
messageText = '[Voice message received but audio transcription is not configured. Please configure the audio section in config.yaml to enable voice message support.]';
if (!audioConfig?.endpoint) {
// Without transcription, we cannot safely send audio to a non-audio-capable model.
// Fast-path a deterministic, user-friendly reply instead of invoking the agent loop.
await reply({
text:
[
'I received your voice message, but I cannot transcribe it yet because audio transcription is not configured.',
'',
'To enable voice messages, set `audio.enabled: true` and configure an `audio.provider` in `config.yaml` (OpenAI/Groq/custom Whisper-compatible `/v1/audio/transcriptions`).',
'',
'Workarounds:',
'1. Paste the transcription text.',
'2. Upload the audio file somewhere and send me a direct URL.',
].join('\n'),
replyTo: msg.id,
});
return;
}
for (const att of audioAttachments) {
const transcript = await transcribeAudio(att, audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
}
// Remove audio attachments so buildUserMessage doesn't create audio content parts
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));