diff --git a/docs/plans/state.json b/docs/plans/state.json index 2dc39ef..01b6095 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -1,6 +1,6 @@ { "version": "1.0", - "updated_at": "2026-02-13", + "updated_at": "2026-02-14", "description": "Tracks the status of all Flynn plans and implementation phases", "plans": { @@ -38,6 +38,16 @@ ], "test_status": "pnpm test:run (targeted suites) + pnpm typecheck passing" }, + "voice-message-transcription-fastpath": { + "status": "completed", + "date": "2026-02-14", + "summary": "When a non-audio-capable model receives a voice attachment but audio transcription is not configured, route replies via a deterministic fast-path config help message instead of invoking the LLM (which cannot safely consume the audio).", + "files_modified": [ + "src/daemon/routing.ts", + "src/daemon/routing.test.ts" + ], + "test_status": "pnpm test:run src/daemon/routing.test.ts + pnpm typecheck passing (full pnpm test:run fails in this sandbox due to EPERM listen/spawn)" + }, "p0-p1-implementation-plan": { "file": "2026-02-06-p0-p1-implementation-plan.md", "status": "completed", diff --git a/src/daemon/routing.test.ts b/src/daemon/routing.test.ts index b0c5c01..6aea353 100644 --- a/src/daemon/routing.test.ts +++ b/src/daemon/routing.test.ts @@ -380,3 +380,154 @@ describe('daemon command fast-path integration', () => { expect(keys.some(key => key.includes(':assistant'))).toBe(true); }); }); + +describe('daemon audio routing integration', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('fast-path replies for voice attachments when transcription is not configured and model does not support audio', async () => { + const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process'); + + const session = { + id: 'telegram:user-voice-1', + addMessage: vi.fn(), + getHistory: vi.fn(() => []), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn(() => undefined), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const commandRegistry = new CommandRegistry(); + registerBuiltinCommands(commandRegistry); + + const router = createMessageRouter({ + sessionManager: { getSession: vi.fn(() => session) } as any, + modelRouter: { + getAvailableTiers: () => ['default'], + getAllLabels: () => ({ default: 'default' }), + getLabel: (tier: string) => tier, + } as any, + systemPrompt: 'test prompt', + toolRegistry: { clone() { return this; }, register: vi.fn() } as any, + toolExecutor: {} as any, + config: { + agents: { + primary_tier: 'default', + delegation: { + compaction: 'default', + memory_extraction: 'default', + classification: 'default', + tool_summarisation: 'default', + complex_reasoning: 'default', + }, + max_delegation_depth: 1, + max_iterations: 3, + }, + compaction: { enabled: false }, + // Anthropic doesn't support native audio; ensures routing hits the non-audio path. + models: { default: { provider: 'anthropic', model: 'claude' } }, + audio: { enabled: false }, + } as any, + commandRegistry, + }); + + const reply = vi.fn(async () => {}); + await router.handler({ + id: 'v1', + channel: 'telegram', + senderId: 'user-voice-1', + text: '', + attachments: [{ mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' }], + timestamp: Date.now(), + } as any, reply); + + expect(processSpy).not.toHaveBeenCalled(); + expect(reply).toHaveBeenCalledTimes(1); + const msg = (reply.mock.calls[0] as unknown as any[])[0] as { text?: string }; + expect(String(msg.text)).toContain('audio transcription is not configured'); + }); + + it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => { + const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok'); + + // Mock transcription endpoint call. + const fetchSpy = vi.spyOn(globalThis, 'fetch' as any).mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', + json: async () => ({ text: 'hello world' }), + } as any); + + const session = { + id: 'telegram:user-voice-2', + addMessage: vi.fn(), + getHistory: vi.fn(() => []), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn(() => undefined), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const commandRegistry = new CommandRegistry(); + registerBuiltinCommands(commandRegistry); + + const router = createMessageRouter({ + sessionManager: { getSession: vi.fn(() => session) } as any, + modelRouter: { + getAvailableTiers: () => ['default'], + getAllLabels: () => ({ default: 'default' }), + getLabel: (tier: string) => tier, + } as any, + systemPrompt: 'test prompt', + toolRegistry: { clone() { return this; }, register: vi.fn() } as any, + toolExecutor: {} as any, + config: { + agents: { + primary_tier: 'default', + delegation: { + compaction: 'default', + memory_extraction: 'default', + classification: 'default', + tool_summarisation: 'default', + complex_reasoning: 'default', + }, + max_delegation_depth: 1, + max_iterations: 3, + }, + compaction: { enabled: false }, + models: { default: { provider: 'anthropic', model: 'claude' } }, + audio: { + enabled: true, + provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' }, + }, + } as any, + commandRegistry, + }); + + const reply = vi.fn(async () => {}); + await router.handler({ + id: 'v2', + channel: 'telegram', + senderId: 'user-voice-2', + text: 'caption', + attachments: [ + { mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' }, + { mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' }, + ], + timestamp: Date.now(), + } as any, reply); + + expect(fetchSpy).toHaveBeenCalled(); + expect(processSpy).toHaveBeenCalledTimes(1); + const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? []; + expect(String(calledText)).toContain('[Voice message]: hello world'); + expect(String(calledText)).toContain('caption'); + const atts = calledAttachments as any[] | undefined; + expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(false); + expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true); + }); +}); diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index 8d541a4..4877f86 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -441,14 +441,28 @@ export function createMessageRouter(deps: { } : undefined; - if (audioConfig?.endpoint) { - for (const att of audioAttachments) { - const transcript = await transcribeAudio(att, audioConfig); - messageText = `[Voice message]: ${transcript}\n\n${messageText}`; - } - } else { - // No transcription endpoint configured — inform the user gracefully - messageText = '[Voice message received but audio transcription is not configured. Please configure the audio section in config.yaml to enable voice message support.]'; + if (!audioConfig?.endpoint) { + // Without transcription, we cannot safely send audio to a non-audio-capable model. + // Fast-path a deterministic, user-friendly reply instead of invoking the agent loop. + await reply({ + text: + [ + 'I received your voice message, but I cannot transcribe it yet because audio transcription is not configured.', + '', + 'To enable voice messages, set `audio.enabled: true` and configure an `audio.provider` in `config.yaml` (OpenAI/Groq/custom Whisper-compatible `/v1/audio/transcriptions`).', + '', + 'Workarounds:', + '1. Paste the transcription text.', + '2. Upload the audio file somewhere and send me a direct URL.', + ].join('\n'), + replyTo: msg.id, + }); + return; + } + + for (const att of audioAttachments) { + const transcript = await transcribeAudio(att, audioConfig); + messageText = `[Voice message]: ${transcript}\n\n${messageText}`; } // Remove audio attachments so buildUserMessage doesn't create audio content parts attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));