From a761813375792447a71ca094ebefadcdbfd618a0 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Sun, 22 Feb 2026 21:27:09 -0800 Subject: [PATCH] Bind audio.transcribe hydration to current message turn --- src/backends/native/agent.test.ts | 73 +++++++++++++++++++++++++++++ src/backends/native/agent.ts | 71 +++++++++++++++++++++++++++- src/backends/native/orchestrator.ts | 18 ++++--- src/daemon/routing.test.ts | 4 +- src/daemon/routing.ts | 25 +++++++++- 5 files changed, 180 insertions(+), 11 deletions(-) diff --git a/src/backends/native/agent.test.ts b/src/backends/native/agent.test.ts index 65aa8c7..bacebce 100644 --- a/src/backends/native/agent.test.ts +++ b/src/backends/native/agent.test.ts @@ -323,6 +323,79 @@ describe('NativeAgent tool loop', () => { })); }); + it('prefers per-turn audio input over persisted fallback during voice transcript turns', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const mockSession = { + id: 'telegram:user-audio', + getHistory: vi.fn().mockReturnValue([ + { role: 'user', content: '[Voice message]: old transcript' }, + ]), + addMessage: vi.fn(), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment' + ? JSON.stringify({ data: 'T0xEX0FVRElP', mimeType: 'audio/ogg' }) + : undefined)), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + session: mockSession, + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process( + 'Please transcribe this', + undefined, + { data: 'TkVXX0FVRElP', mime_type: 'audio/ogg' }, + ); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'TkVXX0FVRElP', + mime_type: 'audio/ogg', + })); + }); + it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => { let callCount = 0; let seenArgs: Record | undefined; diff --git a/src/backends/native/agent.ts b/src/backends/native/agent.ts index e78ac13..e3f13b5 100644 --- a/src/backends/native/agent.ts +++ b/src/backends/native/agent.ts @@ -77,6 +77,12 @@ interface AudioToolArgSummary { mimeType?: string; } +export interface NativeAgentTurnAudioInput { + data?: string; + url?: string; + mime_type?: string; +} + export class NativeAgent { private static readonly EMPTY_RESPONSE_FALLBACK = 'I could not generate a response for that. Please try again.'; @@ -100,6 +106,7 @@ export class NativeAgent { private _runInProgress = false; private _runAbortController?: AbortController; private modelTimeoutMs: number; + private _currentTurnAudioInput?: AudioToolInput; constructor(config: NativeAgentConfig) { this.modelClient = config.modelClient; @@ -120,9 +127,14 @@ export class NativeAgent { return this.session?.getHistory() ?? [...this.inMemoryHistory]; } - async process(userMessage: string, attachments?: Attachment[]): Promise { + async process( + userMessage: string, + attachments?: Attachment[], + turnAudioInput?: NativeAgentTurnAudioInput, + ): Promise { this._cancelRequested = false; this._runAbortController = new AbortController(); + this._currentTurnAudioInput = this.normalizeTurnAudioInput(turnAudioInput) ?? this.extractLatestAudioInputFromAttachments(attachments); if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') { this.modelClient.clearAbort(); } @@ -162,6 +174,7 @@ export class NativeAgent { this._runInProgress = false; this._cancelRequested = false; this._runAbortController = undefined; + this._currentTurnAudioInput = undefined; } } @@ -649,6 +662,12 @@ export class NativeAgent { : {}; const original = this.summarizeAudioToolArgs(args); + if (this._currentTurnAudioInput) { + this.applyAudioToolInput(args, this._currentTurnAudioInput); + this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args); + return args; + } + const latestTurnAudio = this.getLatestTurnUserAudioInput(); if (latestTurnAudio) { this.applyAudioToolInput(args, latestTurnAudio); @@ -794,6 +813,56 @@ export class NativeAgent { return null; } + private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined { + if (!turnAudioInput) { + return undefined; + } + const data = typeof turnAudioInput.data === 'string' && turnAudioInput.data.length > 0 + ? turnAudioInput.data + : undefined; + const url = typeof turnAudioInput.url === 'string' && turnAudioInput.url.length > 0 + ? turnAudioInput.url + : undefined; + const mimeType = typeof turnAudioInput.mime_type === 'string' && turnAudioInput.mime_type.length > 0 + ? turnAudioInput.mime_type + : undefined; + if (!data && !url) { + return undefined; + } + return { + ...(data ? { data } : {}), + ...(url ? { url } : {}), + ...(mimeType ? { mime_type: mimeType } : {}), + }; + } + + private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined { + if (!attachments || attachments.length === 0) { + return undefined; + } + for (let i = attachments.length - 1; i >= 0; i--) { + const attachment = attachments[i]; + if (!attachment.mimeType.startsWith('audio/')) { + continue; + } + const data = typeof attachment.data === 'string' && attachment.data.length > 0 + ? attachment.data + : undefined; + const url = typeof attachment.url === 'string' && attachment.url.length > 0 + ? attachment.url + : undefined; + if (!data && !url) { + continue; + } + return { + ...(data ? { data } : {}), + ...(url ? { url } : {}), + mime_type: attachment.mimeType, + }; + } + return undefined; + } + private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined { if (typeof rawData !== 'string') { return undefined; diff --git a/src/backends/native/orchestrator.ts b/src/backends/native/orchestrator.ts index 4fc0da1..1dd7da0 100644 --- a/src/backends/native/orchestrator.ts +++ b/src/backends/native/orchestrator.ts @@ -7,6 +7,7 @@ import type { MemoryStore } from '../../memory/store.js'; import type { ToolPolicyContext } from '../../tools/policy.js'; import type { Attachment } from '../../channels/types.js'; import { NativeAgent } from './agent.js'; +import type { NativeAgentTurnAudioInput } from './agent.js'; import type { ToolUseEvent } from './agent.js'; import type { OutboundAttachmentCollector } from './attachments.js'; import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js'; @@ -339,7 +340,11 @@ export class AgentOrchestrator { * When compaction is configured, checks whether the conversation history * exceeds the context window threshold and compacts it before processing. */ - async process(userMessage: string, attachments?: Attachment[]): Promise { + async process( + userMessage: string, + attachments?: Attachment[], + turnAudioInput?: NativeAgentTurnAudioInput, + ): Promise { this._activeRunToolStarts = 0; this._injectMemoryContext(userMessage); await this._runProactiveContextMaintenance(); @@ -352,10 +357,10 @@ export class AgentOrchestrator { let result: string; try { - result = await this._agent.process(userMessage, attachments); + result = await this._agent.process(userMessage, attachments, turnAudioInput); } catch { this._restoreHistory(before); - const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier); + const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier); if (escalated) { await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts); return escalated; @@ -383,7 +388,7 @@ export class AgentOrchestrator { if (ctx) { // Attempt: compact + hard-trim to fit the discovered context window, then retry once. await this._compactAndTrimToFit(ctx); - const retry = await this._agent.process(userMessage, attachments); + const retry = await this._agent.process(userMessage, attachments, turnAudioInput); if (!this._isToolLoopErrorMessage(retry)) { return retry; } @@ -391,7 +396,7 @@ export class AgentOrchestrator { this._restoreHistory(before); } - const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier); + const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier); if (escalated) { await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts); return escalated; @@ -419,6 +424,7 @@ export class AgentOrchestrator { private async _retryWithEscalation( userMessage: string, attachments: Attachment[] | undefined, + turnAudioInput: NativeAgentTurnAudioInput | undefined, historyBefore: Message[], originalTier: ModelTier, ): Promise { @@ -437,7 +443,7 @@ export class AgentOrchestrator { this._agent.setModelTier(targetTier); try { - const retry = await this._agent.process(userMessage, attachments); + const retry = await this._agent.process(userMessage, attachments, turnAudioInput); if (!this._isToolLoopErrorMessage(retry)) { return retry; } diff --git a/src/daemon/routing.test.ts b/src/daemon/routing.test.ts index 437c490..51501d2 100644 --- a/src/daemon/routing.test.ts +++ b/src/daemon/routing.test.ts @@ -656,7 +656,7 @@ describe('daemon command fast-path integration', () => { const keys = Array.from(router.agents.keys()); expect(keys.some(key => key.includes(':research'))).toBe(true); - expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined); + expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined, undefined); }); it('falls back to llm path when confidence is below fast threshold', async () => { @@ -1938,6 +1938,6 @@ describe('daemon talk mode (voice wake) integration', () => { timestamp: Date.now(), } as MessageRouterInput, reply); expect(processSpy).toHaveBeenCalledOnce(); - expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined); + expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined, undefined); }); }); diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index fbfc75b..0b103e2 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -215,6 +215,26 @@ function persistLatestAudioAttachment( } } +function extractLatestAudioToolInput(audioAttachments: Attachment[]): { data?: string; url?: string; mime_type?: string } | undefined { + const latest = [...audioAttachments].reverse().find((att) => ( + (typeof att.data === 'string' && att.data.length > 0) + || (typeof att.url === 'string' && att.url.length > 0) + )); + if (!latest) { + return undefined; + } + const data = typeof latest.data === 'string' && latest.data.length > 0 ? latest.data : undefined; + const url = typeof latest.url === 'string' && latest.url.length > 0 ? latest.url : undefined; + if (!data && !url) { + return undefined; + } + return { + ...(data ? { data } : {}), + ...(url ? { url } : {}), + mime_type: latest.mimeType, + }; +} + function isTtsEnabledForChannel(config: Config, channel: string): boolean { if (!config.tts?.enabled) { return false; @@ -1317,6 +1337,7 @@ export function createMessageRouter(deps: { let messageText = incomingText; let attachments = msg.attachments; const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); + const turnAudioToolInput = extractLatestAudioToolInput(audioAttachments); if (audioAttachments.length > 0) { persistLatestAudioAttachment(session, audioAttachments); } @@ -1424,7 +1445,7 @@ export function createMessageRouter(deps: { let response: string; activeRuns.set(sessionIdForRun, agent); try { - response = await agent.process(messageText, attachments); + response = await agent.process(messageText, attachments, turnAudioToolInput); } catch (error) { const currentTier = agent.getModelTier(); const canEscalate = deps.config.agents.auto_escalate && currentTier !== 'complex'; @@ -1434,7 +1455,7 @@ export function createMessageRouter(deps: { console.warn(`Auto-escalating session ${msg.channel}:${msg.senderId} from ${currentTier} to complex after processing failure.`); agent.setModelTier('complex'); - response = await agent.process(messageText, attachments); + response = await agent.process(messageText, attachments, turnAudioToolInput); } const outboundAttachments = collector.drain(); const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);