routing: fast-path voice when transcription not configured
This commit is contained in:
+11
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"version": "1.0",
|
"version": "1.0",
|
||||||
"updated_at": "2026-02-13",
|
"updated_at": "2026-02-14",
|
||||||
"description": "Tracks the status of all Flynn plans and implementation phases",
|
"description": "Tracks the status of all Flynn plans and implementation phases",
|
||||||
|
|
||||||
"plans": {
|
"plans": {
|
||||||
@@ -38,6 +38,16 @@
|
|||||||
],
|
],
|
||||||
"test_status": "pnpm test:run (targeted suites) + pnpm typecheck passing"
|
"test_status": "pnpm test:run (targeted suites) + pnpm typecheck passing"
|
||||||
},
|
},
|
||||||
|
"voice-message-transcription-fastpath": {
|
||||||
|
"status": "completed",
|
||||||
|
"date": "2026-02-14",
|
||||||
|
"summary": "When a non-audio-capable model receives a voice attachment but audio transcription is not configured, route replies via a deterministic fast-path config help message instead of invoking the LLM (which cannot safely consume the audio).",
|
||||||
|
"files_modified": [
|
||||||
|
"src/daemon/routing.ts",
|
||||||
|
"src/daemon/routing.test.ts"
|
||||||
|
],
|
||||||
|
"test_status": "pnpm test:run src/daemon/routing.test.ts + pnpm typecheck passing (full pnpm test:run fails in this sandbox due to EPERM listen/spawn)"
|
||||||
|
},
|
||||||
"p0-p1-implementation-plan": {
|
"p0-p1-implementation-plan": {
|
||||||
"file": "2026-02-06-p0-p1-implementation-plan.md",
|
"file": "2026-02-06-p0-p1-implementation-plan.md",
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
|
|||||||
@@ -380,3 +380,154 @@ describe('daemon command fast-path integration', () => {
|
|||||||
expect(keys.some(key => key.includes(':assistant'))).toBe(true);
|
expect(keys.some(key => key.includes(':assistant'))).toBe(true);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
 * Integration coverage for how the message router treats voice (audio)
 * attachments when the primary tier is configured with the `anthropic`
 * provider.
 *
 * Two paths are exercised:
 *  - transcription disabled: the router replies directly and never calls
 *    `AgentOrchestrator.prototype.process`;
 *  - transcription enabled: the transcript is prepended to the message text
 *    and audio attachments are removed before `process` is called.
 */
describe('daemon audio routing integration', () => {
  afterEach(() => {
    // Undo every vi.spyOn installed by the tests below
    // (AgentOrchestrator.prototype.process and globalThis.fetch).
    vi.restoreAllMocks();
  });

  /** Builds a session stub whose methods are all vitest mocks. */
  const createSessionStub = (id: string) => ({
    id,
    addMessage: vi.fn(),
    getHistory: vi.fn(() => []),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn(() => undefined),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  });

  /**
   * Builds a message router wired to stub dependencies.
   *
   * Everything except the session id and the `audio` config section is
   * identical across the tests in this suite, so only those two vary.
   *
   * @param sessionId - id given to the stub session returned by `getSession`.
   * @param audio - value used verbatim as `config.audio`.
   */
  const createAudioTestRouter = (sessionId: string, audio: Record<string, unknown>) => {
    const session = createSessionStub(sessionId);

    const commandRegistry = new CommandRegistry();
    registerBuiltinCommands(commandRegistry);

    return createMessageRouter({
      sessionManager: { getSession: vi.fn(() => session) } as any,
      modelRouter: {
        getAvailableTiers: () => ['default'],
        getAllLabels: () => ({ default: 'default' }),
        getLabel: (tier: string) => tier,
      } as any,
      systemPrompt: 'test prompt',
      toolRegistry: { clone() { return this; }, register: vi.fn() } as any,
      toolExecutor: {} as any,
      config: {
        agents: {
          primary_tier: 'default',
          delegation: {
            compaction: 'default',
            memory_extraction: 'default',
            classification: 'default',
            tool_summarisation: 'default',
            complex_reasoning: 'default',
          },
          max_delegation_depth: 1,
          max_iterations: 3,
        },
        compaction: { enabled: false },
        // Anthropic doesn't support native audio; ensures routing hits the non-audio path.
        models: { default: { provider: 'anthropic', model: 'claude' } },
        audio,
      } as any,
      commandRegistry,
    });
  };

  it('fast-path replies for voice attachments when transcription is not configured and model does not support audio', async () => {
    // Stub the implementation (as the transcription test does) so that a
    // regression fails on the assertion below instead of running the real
    // agent loop against the `{}` stubs.
    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');

    const router = createAudioTestRouter('telegram:user-voice-1', { enabled: false });

    const reply = vi.fn(async () => {});
    await router.handler({
      id: 'v1',
      channel: 'telegram',
      senderId: 'user-voice-1',
      text: '',
      attachments: [{ mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' }],
      timestamp: Date.now(),
    } as any, reply);

    expect(processSpy).not.toHaveBeenCalled();
    expect(reply).toHaveBeenCalledTimes(1);
    const msg = (reply.mock.calls[0] as unknown as any[])[0] as { text?: string };
    expect(String(msg.text)).toContain('audio transcription is not configured');
  });

  it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => {
    const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');

    // Mock transcription endpoint call.
    const fetchSpy = vi.spyOn(globalThis, 'fetch' as any).mockResolvedValue({
      ok: true,
      status: 200,
      statusText: 'OK',
      json: async () => ({ text: 'hello world' }),
    } as any);

    const router = createAudioTestRouter('telegram:user-voice-2', {
      enabled: true,
      provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' },
    });

    const reply = vi.fn(async () => {});
    await router.handler({
      id: 'v2',
      channel: 'telegram',
      senderId: 'user-voice-2',
      text: 'caption',
      attachments: [
        { mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' },
        { mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' },
      ],
      timestamp: Date.now(),
    } as any, reply);

    expect(fetchSpy).toHaveBeenCalled();
    expect(processSpy).toHaveBeenCalledTimes(1);

    // The transcript is prepended and the original caption is preserved.
    const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
    expect(String(calledText)).toContain('[Voice message]: hello world');
    expect(String(calledText)).toContain('caption');

    // Only the audio attachment is removed; the image is still forwarded.
    const atts = calledAttachments as any[] | undefined;
    expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(false);
    expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
  });
});
|
||||||
|
|||||||
+22
-8
@@ -441,14 +441,28 @@ export function createMessageRouter(deps: {
|
|||||||
}
|
}
|
||||||
: undefined;
|
: undefined;
|
||||||
|
|
||||||
if (audioConfig?.endpoint) {
|
if (!audioConfig?.endpoint) {
|
||||||
for (const att of audioAttachments) {
|
// Without transcription, we cannot safely send audio to a non-audio-capable model.
|
||||||
const transcript = await transcribeAudio(att, audioConfig);
|
// Fast-path a deterministic, user-friendly reply instead of invoking the agent loop.
|
||||||
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
await reply({
|
||||||
}
|
text:
|
||||||
} else {
|
[
|
||||||
// No transcription endpoint configured — inform the user gracefully
|
'I received your voice message, but I cannot transcribe it yet because audio transcription is not configured.',
|
||||||
messageText = '[Voice message received but audio transcription is not configured. Please configure the audio section in config.yaml to enable voice message support.]';
|
'',
|
||||||
|
'To enable voice messages, set `audio.enabled: true` and configure an `audio.provider` in `config.yaml` (OpenAI/Groq/custom Whisper-compatible `/v1/audio/transcriptions`).',
|
||||||
|
'',
|
||||||
|
'Workarounds:',
|
||||||
|
'1. Paste the transcription text.',
|
||||||
|
'2. Upload the audio file somewhere and send me a direct URL.',
|
||||||
|
].join('\n'),
|
||||||
|
replyTo: msg.id,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const att of audioAttachments) {
|
||||||
|
const transcript = await transcribeAudio(att, audioConfig);
|
||||||
|
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
||||||
}
|
}
|
||||||
// Remove audio attachments so buildUserMessage doesn't create audio content parts
|
// Remove audio attachments so buildUserMessage doesn't create audio content parts
|
||||||
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
|
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
|
||||||
|
|||||||
Reference in New Issue
Block a user