Bind audio.transcribe hydration to current message turn

This commit is contained in:
William Valentin
2026-02-22 21:27:09 -08:00
parent 89246e7da0
commit a761813375
5 changed files with 180 additions and 11 deletions
+73
View File
@@ -323,6 +323,79 @@ describe('NativeAgent tool loop', () => {
})); }));
}); });
// Regression test: audio passed explicitly for the current turn (third argument
// to `agent.process`) must be what the audio tool receives, even though the
// session also exposes an older audio attachment through `getConfig`.
it('prefers per-turn audio input over persisted fallback during voice transcript turns', async () => {
let callCount = 0;
// Captures the arguments the audio tool was actually executed with.
let seenArgs: Record<string, unknown> | undefined;
// Scripted model: the first call requests the audio tool with empty args,
// every later call ends the turn with the text 'done'.
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
// NOTE(review): the call uses 'audio_transcribe' while the tool below is
// registered as 'audio.transcribe'; presumably the agent/registry maps
// between the two spellings — confirm against NativeAgent.
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Session stub whose 'lastAudioAttachment' config holds the stale audio
// ('T0xEX0FVRElP' is base64 for "OLD_AUDIO"); any other key yields undefined.
const mockSession = {
id: 'telegram:user-audio',
getHistory: vi.fn().mockReturnValue([
{ role: 'user', content: '[Voice message]: old transcript' },
]),
addMessage: vi.fn(),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
? JSON.stringify({ data: 'T0xEX0FVRElP', mimeType: 'audio/ogg' })
: undefined)),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
// Fake tool that records its arguments and reports success.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
session: mockSession,
toolRegistry: registry,
toolExecutor: executor,
});
// No attachments are passed; the fresh audio arrives only through the
// per-turn argument ('TkVXX0FVRElP' is base64 for "NEW_AUDIO").
const response = await agent.process(
'Please transcribe this',
undefined,
{ data: 'TkVXX0FVRElP', mime_type: 'audio/ogg' },
);
expect(response).toBe('done');
// The tool must have seen the per-turn payload, not the persisted one.
expect(seenArgs).toEqual(expect.objectContaining({
data: 'TkVXX0FVRElP',
mime_type: 'audio/ogg',
}));
});
it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => { it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
let callCount = 0; let callCount = 0;
let seenArgs: Record<string, unknown> | undefined; let seenArgs: Record<string, unknown> | undefined;
+70 -1
View File
@@ -77,6 +77,12 @@ interface AudioToolArgSummary {
mimeType?: string; mimeType?: string;
} }
/**
 * Audio supplied by the caller for one `NativeAgent.process` call.
 *
 * All fields are optional. `normalizeTurnAudioInput` ignores empty strings and
 * discards the whole value when neither `data` nor `url` is usable.
 */
export interface NativeAgentTurnAudioInput {
/** Inline audio content. NOTE(review): tests pass base64 text here — confirm the expected encoding. */
data?: string;
/** Location of the audio, used instead of or alongside `data`. */
url?: string;
/** Media type of the audio, e.g. `audio/ogg`. */
mime_type?: string;
}
export class NativeAgent { export class NativeAgent {
private static readonly EMPTY_RESPONSE_FALLBACK = private static readonly EMPTY_RESPONSE_FALLBACK =
'I could not generate a response for that. Please try again.'; 'I could not generate a response for that. Please try again.';
@@ -100,6 +106,7 @@ export class NativeAgent {
private _runInProgress = false; private _runInProgress = false;
private _runAbortController?: AbortController; private _runAbortController?: AbortController;
private modelTimeoutMs: number; private modelTimeoutMs: number;
private _currentTurnAudioInput?: AudioToolInput;
constructor(config: NativeAgentConfig) { constructor(config: NativeAgentConfig) {
this.modelClient = config.modelClient; this.modelClient = config.modelClient;
@@ -120,9 +127,14 @@ export class NativeAgent {
return this.session?.getHistory() ?? [...this.inMemoryHistory]; return this.session?.getHistory() ?? [...this.inMemoryHistory];
} }
async process(userMessage: string, attachments?: Attachment[]): Promise<string> { async process(
userMessage: string,
attachments?: Attachment[],
turnAudioInput?: NativeAgentTurnAudioInput,
): Promise<string> {
this._cancelRequested = false; this._cancelRequested = false;
this._runAbortController = new AbortController(); this._runAbortController = new AbortController();
this._currentTurnAudioInput = this.normalizeTurnAudioInput(turnAudioInput) ?? this.extractLatestAudioInputFromAttachments(attachments);
if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') { if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') {
this.modelClient.clearAbort(); this.modelClient.clearAbort();
} }
@@ -162,6 +174,7 @@ export class NativeAgent {
this._runInProgress = false; this._runInProgress = false;
this._cancelRequested = false; this._cancelRequested = false;
this._runAbortController = undefined; this._runAbortController = undefined;
this._currentTurnAudioInput = undefined;
} }
} }
@@ -649,6 +662,12 @@ export class NativeAgent {
: {}; : {};
const original = this.summarizeAudioToolArgs(args); const original = this.summarizeAudioToolArgs(args);
if (this._currentTurnAudioInput) {
this.applyAudioToolInput(args, this._currentTurnAudioInput);
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
return args;
}
const latestTurnAudio = this.getLatestTurnUserAudioInput(); const latestTurnAudio = this.getLatestTurnUserAudioInput();
if (latestTurnAudio) { if (latestTurnAudio) {
this.applyAudioToolInput(args, latestTurnAudio); this.applyAudioToolInput(args, latestTurnAudio);
@@ -794,6 +813,56 @@ export class NativeAgent {
return null; return null;
} }
/**
 * Sanitises caller-provided per-turn audio.
 *
 * Each field is kept only when it is a non-empty string. The result is
 * `undefined` when no input was given or when neither `data` nor `url`
 * survives; a `mime_type` on its own is not enough to produce a result.
 *
 * @param turnAudioInput - Raw per-turn audio as passed to `process`, if any.
 * @returns An object holding only the usable fields, or `undefined`.
 */
private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined {
  if (!turnAudioInput) {
    return undefined;
  }

  // Anything that is not a non-empty string counts as absent.
  const nonEmptyString = (candidate: unknown): string | undefined => {
    if (typeof candidate !== 'string' || candidate.length === 0) {
      return undefined;
    }
    return candidate;
  };

  const payload = nonEmptyString(turnAudioInput.data);
  const location = nonEmptyString(turnAudioInput.url);
  const contentType = nonEmptyString(turnAudioInput.mime_type);

  const hasSource = payload !== undefined || location !== undefined;
  if (!hasSource) {
    return undefined;
  }

  // Absent fields are omitted entirely rather than set to undefined.
  return {
    ...(payload === undefined ? {} : { data: payload }),
    ...(location === undefined ? {} : { url: location }),
    ...(contentType === undefined ? {} : { mime_type: contentType }),
  };
}
/**
 * Picks the audio tool input from the last usable audio attachment.
 *
 * Attachments are examined from the end of the list towards the start. An
 * attachment qualifies when its `mimeType` starts with `audio/` and it has a
 * non-empty string `data` or `url`.
 *
 * @param attachments - Attachments of the current message, if any.
 * @returns The qualifying attachment's `data`/`url` plus its `mimeType` as
 *          `mime_type`, or `undefined` when nothing qualifies.
 */
private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined {
  // Anything that is not a non-empty string counts as absent.
  const nonEmptyString = (candidate: unknown): string | undefined => {
    if (typeof candidate !== 'string' || candidate.length === 0) {
      return undefined;
    }
    return candidate;
  };

  const candidates = attachments ?? [];
  let index = candidates.length;
  while (index > 0) {
    index -= 1;
    const candidate = candidates[index];
    if (candidate.mimeType.startsWith('audio/')) {
      const payload = nonEmptyString(candidate.data);
      const location = nonEmptyString(candidate.url);
      if (payload !== undefined || location !== undefined) {
        return {
          ...(payload === undefined ? {} : { data: payload }),
          ...(location === undefined ? {} : { url: location }),
          mime_type: candidate.mimeType,
        };
      }
    }
  }

  return undefined;
}
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined { private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
if (typeof rawData !== 'string') { if (typeof rawData !== 'string') {
return undefined; return undefined;
+12 -6
View File
@@ -7,6 +7,7 @@ import type { MemoryStore } from '../../memory/store.js';
import type { ToolPolicyContext } from '../../tools/policy.js'; import type { ToolPolicyContext } from '../../tools/policy.js';
import type { Attachment } from '../../channels/types.js'; import type { Attachment } from '../../channels/types.js';
import { NativeAgent } from './agent.js'; import { NativeAgent } from './agent.js';
import type { NativeAgentTurnAudioInput } from './agent.js';
import type { ToolUseEvent } from './agent.js'; import type { ToolUseEvent } from './agent.js';
import type { OutboundAttachmentCollector } from './attachments.js'; import type { OutboundAttachmentCollector } from './attachments.js';
import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js'; import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js';
@@ -339,7 +340,11 @@ export class AgentOrchestrator {
* When compaction is configured, checks whether the conversation history * When compaction is configured, checks whether the conversation history
* exceeds the context window threshold and compacts it before processing. * exceeds the context window threshold and compacts it before processing.
*/ */
async process(userMessage: string, attachments?: Attachment[]): Promise<string> { async process(
userMessage: string,
attachments?: Attachment[],
turnAudioInput?: NativeAgentTurnAudioInput,
): Promise<string> {
this._activeRunToolStarts = 0; this._activeRunToolStarts = 0;
this._injectMemoryContext(userMessage); this._injectMemoryContext(userMessage);
await this._runProactiveContextMaintenance(); await this._runProactiveContextMaintenance();
@@ -352,10 +357,10 @@ export class AgentOrchestrator {
let result: string; let result: string;
try { try {
result = await this._agent.process(userMessage, attachments); result = await this._agent.process(userMessage, attachments, turnAudioInput);
} catch { } catch {
this._restoreHistory(before); this._restoreHistory(before);
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier); const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
if (escalated) { if (escalated) {
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts); await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
return escalated; return escalated;
@@ -383,7 +388,7 @@ export class AgentOrchestrator {
if (ctx) { if (ctx) {
// Attempt: compact + hard-trim to fit the discovered context window, then retry once. // Attempt: compact + hard-trim to fit the discovered context window, then retry once.
await this._compactAndTrimToFit(ctx); await this._compactAndTrimToFit(ctx);
const retry = await this._agent.process(userMessage, attachments); const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
if (!this._isToolLoopErrorMessage(retry)) { if (!this._isToolLoopErrorMessage(retry)) {
return retry; return retry;
} }
@@ -391,7 +396,7 @@ export class AgentOrchestrator {
this._restoreHistory(before); this._restoreHistory(before);
} }
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier); const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
if (escalated) { if (escalated) {
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts); await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
return escalated; return escalated;
@@ -419,6 +424,7 @@ export class AgentOrchestrator {
private async _retryWithEscalation( private async _retryWithEscalation(
userMessage: string, userMessage: string,
attachments: Attachment[] | undefined, attachments: Attachment[] | undefined,
turnAudioInput: NativeAgentTurnAudioInput | undefined,
historyBefore: Message[], historyBefore: Message[],
originalTier: ModelTier, originalTier: ModelTier,
): Promise<string | null> { ): Promise<string | null> {
@@ -437,7 +443,7 @@ export class AgentOrchestrator {
this._agent.setModelTier(targetTier); this._agent.setModelTier(targetTier);
try { try {
const retry = await this._agent.process(userMessage, attachments); const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
if (!this._isToolLoopErrorMessage(retry)) { if (!this._isToolLoopErrorMessage(retry)) {
return retry; return retry;
} }
+2 -2
View File
@@ -656,7 +656,7 @@ describe('daemon command fast-path integration', () => {
const keys = Array.from(router.agents.keys()); const keys = Array.from(router.agents.keys());
expect(keys.some(key => key.includes(':research'))).toBe(true); expect(keys.some(key => key.includes(':research'))).toBe(true);
expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined); expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined, undefined);
}); });
it('falls back to llm path when confidence is below fast threshold', async () => { it('falls back to llm path when confidence is below fast threshold', async () => {
@@ -1938,6 +1938,6 @@ describe('daemon talk mode (voice wake) integration', () => {
timestamp: Date.now(), timestamp: Date.now(),
} as MessageRouterInput, reply); } as MessageRouterInput, reply);
expect(processSpy).toHaveBeenCalledOnce(); expect(processSpy).toHaveBeenCalledOnce();
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined); expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined, undefined);
}); });
}); });
+23 -2
View File
@@ -215,6 +215,26 @@ function persistLatestAudioAttachment(
} }
} }
/**
 * Builds the per-turn audio tool input from the most recent usable attachment.
 *
 * The list is scanned from the end towards the start, and the first attachment
 * with a non-empty string `data` or `url` wins. The input array is neither
 * copied nor modified.
 *
 * @param audioAttachments - Attachments already filtered down to audio by the caller.
 * @returns The chosen attachment's `data`/`url` (only the non-empty ones) plus
 *          its `mimeType` as `mime_type`, or `undefined` when no attachment
 *          carries usable content.
 */
function extractLatestAudioToolInput(audioAttachments: Attachment[]): { data?: string; url?: string; mime_type?: string } | undefined {
  for (let i = audioAttachments.length - 1; i >= 0; i--) {
    const att = audioAttachments[i];
    const data = typeof att.data === 'string' && att.data.length > 0 ? att.data : undefined;
    const url = typeof att.url === 'string' && att.url.length > 0 ? att.url : undefined;
    if (!data && !url) {
      // Nothing the tool could consume; keep looking at older attachments.
      continue;
    }
    return {
      ...(data ? { data } : {}),
      ...(url ? { url } : {}),
      mime_type: att.mimeType,
    };
  }
  return undefined;
}
function isTtsEnabledForChannel(config: Config, channel: string): boolean { function isTtsEnabledForChannel(config: Config, channel: string): boolean {
if (!config.tts?.enabled) { if (!config.tts?.enabled) {
return false; return false;
@@ -1317,6 +1337,7 @@ export function createMessageRouter(deps: {
let messageText = incomingText; let messageText = incomingText;
let attachments = msg.attachments; let attachments = msg.attachments;
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
const turnAudioToolInput = extractLatestAudioToolInput(audioAttachments);
if (audioAttachments.length > 0) { if (audioAttachments.length > 0) {
persistLatestAudioAttachment(session, audioAttachments); persistLatestAudioAttachment(session, audioAttachments);
} }
@@ -1424,7 +1445,7 @@ export function createMessageRouter(deps: {
let response: string; let response: string;
activeRuns.set(sessionIdForRun, agent); activeRuns.set(sessionIdForRun, agent);
try { try {
response = await agent.process(messageText, attachments); response = await agent.process(messageText, attachments, turnAudioToolInput);
} catch (error) { } catch (error) {
const currentTier = agent.getModelTier(); const currentTier = agent.getModelTier();
const canEscalate = deps.config.agents.auto_escalate && currentTier !== 'complex'; const canEscalate = deps.config.agents.auto_escalate && currentTier !== 'complex';
@@ -1434,7 +1455,7 @@ export function createMessageRouter(deps: {
console.warn(`Auto-escalating session ${msg.channel}:${msg.senderId} from ${currentTier} to complex after processing failure.`); console.warn(`Auto-escalating session ${msg.channel}:${msg.senderId} from ${currentTier} to complex after processing failure.`);
agent.setModelTier('complex'); agent.setModelTier('complex');
response = await agent.process(messageText, attachments); response = await agent.process(messageText, attachments, turnAudioToolInput);
} }
const outboundAttachments = collector.drain(); const outboundAttachments = collector.drain();
const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel); const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);