Bind audio.transcribe hydration to current message turn

This commit is contained in:
William Valentin
2026-02-22 21:27:09 -08:00
parent 89246e7da0
commit a761813375
5 changed files with 180 additions and 11 deletions
+73
View File
@@ -323,6 +323,79 @@ describe('NativeAgent tool loop', () => {
}));
});
// Regression test: when `process()` is given audio for the current turn, the
// audio.transcribe tool must receive that audio rather than the older
// `lastAudioAttachment` persisted in the session config.
// NOTE(review): the model emits the tool call as 'audio_transcribe' while the
// registered tool is 'audio.transcribe'; presumably the agent maps between the
// two names — confirm against the tool-name normalization code.
it('prefers per-turn audio input over persisted fallback during voice transcript turns', async () => {
  const OGG_MIME = 'audio/ogg';
  const PERSISTED_AUDIO_B64 = 'T0xEX0FVRElP'; // base64 of "OLD_AUDIO"
  const TURN_AUDIO_B64 = 'TkVXX0FVRElP'; // base64 of "NEW_AUDIO"

  let chatInvocations = 0;
  let receivedToolArgs: Record<string, unknown> | undefined;

  // First model reply requests the transcription tool with empty args; every
  // later reply ends the turn.
  const mockClient: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      chatInvocations += 1;
      return chatInvocations === 1
        ? {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
        }
        : {
          content: 'done',
          stopReason: 'end_turn',
          usage: { inputTokens: 15, outputTokens: 10 },
        };
    }),
  };

  // Session that already holds a stale voice transcript and stale audio.
  const mockSession = {
    id: 'telegram:user-audio',
    getHistory: vi.fn().mockReturnValue([
      { role: 'user', content: '[Voice message]: old transcript' },
    ]),
    addMessage: vi.fn(),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn((key: string) => {
      if (key !== 'lastAudioAttachment') {
        return undefined;
      }
      return JSON.stringify({ data: PERSISTED_AUDIO_B64, mimeType: OGG_MIME });
    }),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };

  // Tool stub that records the arguments it was finally executed with.
  const audioTool: Tool = {
    name: 'audio.transcribe',
    description: 'Transcribe audio',
    inputSchema: { type: 'object', properties: {} },
    execute: async (toolArgs) => {
      receivedToolArgs = toolArgs as Record<string, unknown>;
      return { success: true, output: 'transcript' };
    },
  };

  const registry = new ToolRegistry();
  registry.register(audioTool);
  const executor = new ToolExecutor(
    registry,
    new HookEngine({ confirm: [], log: [], silent: [] }),
  );

  const agent = new NativeAgent({
    modelClient: mockClient,
    systemPrompt: 'You are helpful.',
    session: mockSession,
    toolRegistry: registry,
    toolExecutor: executor,
  });

  const reply = await agent.process(
    'Please transcribe this',
    undefined,
    { data: TURN_AUDIO_B64, mime_type: OGG_MIME },
  );

  expect(reply).toBe('done');
  // The tool must have been hydrated with the per-turn payload, not the persisted one.
  expect(receivedToolArgs).toEqual(expect.objectContaining({
    data: TURN_AUDIO_B64,
    mime_type: OGG_MIME,
  }));
});
it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
+70 -1
View File
@@ -77,6 +77,12 @@ interface AudioToolArgSummary {
mimeType?: string;
}
/**
 * Audio input a caller can hand to `NativeAgent.process()` for one turn.
 *
 * `process()` only uses this when `data` or `url` is a non-empty string;
 * otherwise it falls back to audio found in the turn's attachments.
 * Field names are snake_case, unlike the camelCase `mimeType` used on
 * attachments.
 */
export interface NativeAgentTurnAudioInput {
/** Inline audio payload (presumably base64-encoded — confirm with the audio tool). */
data?: string;
/** Location of the audio; accepted URL schemes are not visible here. */
url?: string;
/** Media type of the audio, e.g. `audio/ogg`. */
mime_type?: string;
}
export class NativeAgent {
private static readonly EMPTY_RESPONSE_FALLBACK =
'I could not generate a response for that. Please try again.';
@@ -100,6 +106,7 @@ export class NativeAgent {
private _runInProgress = false;
private _runAbortController?: AbortController;
private modelTimeoutMs: number;
private _currentTurnAudioInput?: AudioToolInput;
constructor(config: NativeAgentConfig) {
this.modelClient = config.modelClient;
@@ -120,9 +127,14 @@ export class NativeAgent {
return this.session?.getHistory() ?? [...this.inMemoryHistory];
}
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
async process(
userMessage: string,
attachments?: Attachment[],
turnAudioInput?: NativeAgentTurnAudioInput,
): Promise<string> {
this._cancelRequested = false;
this._runAbortController = new AbortController();
this._currentTurnAudioInput = this.normalizeTurnAudioInput(turnAudioInput) ?? this.extractLatestAudioInputFromAttachments(attachments);
if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') {
this.modelClient.clearAbort();
}
@@ -162,6 +174,7 @@ export class NativeAgent {
this._runInProgress = false;
this._cancelRequested = false;
this._runAbortController = undefined;
this._currentTurnAudioInput = undefined;
}
}
@@ -649,6 +662,12 @@ export class NativeAgent {
: {};
const original = this.summarizeAudioToolArgs(args);
if (this._currentTurnAudioInput) {
this.applyAudioToolInput(args, this._currentTurnAudioInput);
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
return args;
}
const latestTurnAudio = this.getLatestTurnUserAudioInput();
if (latestTurnAudio) {
this.applyAudioToolInput(args, latestTurnAudio);
@@ -794,6 +813,56 @@ export class NativeAgent {
return null;
}
/**
 * Converts caller-supplied per-turn audio into the shape used for audio tool
 * arguments, dropping every field that is not a non-empty string.
 *
 * @param turnAudioInput - Audio passed to `process()` for the current turn, if any.
 * @returns An object holding only the usable `data`, `url` and `mime_type`
 *   fields, or `undefined` when no input was given or it has neither a usable
 *   `data` nor a usable `url`.
 */
private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined {
  if (!turnAudioInput) {
    return undefined;
  }

  // A field counts only when it is a string with at least one character.
  const nonEmpty = (value: unknown): string | undefined =>
    (typeof value === 'string' && value.length > 0 ? value : undefined);

  const audioData = nonEmpty(turnAudioInput.data);
  const audioUrl = nonEmpty(turnAudioInput.url);
  const audioMimeType = nonEmpty(turnAudioInput.mime_type);

  // A MIME type alone does not identify any audio source.
  const hasSource = audioData !== undefined || audioUrl !== undefined;
  if (!hasSource) {
    return undefined;
  }

  return {
    ...(audioData !== undefined ? { data: audioData } : {}),
    ...(audioUrl !== undefined ? { url: audioUrl } : {}),
    ...(audioMimeType !== undefined ? { mime_type: audioMimeType } : {}),
  };
}
/**
 * Finds the most recent usable audio attachment and converts it into the
 * shape used for audio tool arguments.
 *
 * The list is scanned from the end so that the last audio attachment wins.
 * An attachment is usable when its MIME type starts with `audio/` (compared
 * case-insensitively, since media type names are case-insensitive) and it
 * carries a non-empty string `data` or `url`. Malformed entries (missing
 * slot, non-string MIME type) are skipped instead of throwing, so one bad
 * attachment cannot abort the whole turn.
 *
 * @param attachments - Attachments received with the current message, if any.
 * @returns The audio input built from the latest usable attachment, with
 *   `mime_type` set to the attachment's MIME type exactly as received, or
 *   `undefined` when there is none.
 */
private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined {
  if (!attachments || attachments.length === 0) {
    return undefined;
  }
  for (let i = attachments.length - 1; i >= 0; i--) {
    const attachment = attachments[i];
    // Guard against holes in the array and attachments lacking a MIME type.
    if (!attachment || typeof attachment.mimeType !== 'string') {
      continue;
    }
    if (!attachment.mimeType.toLowerCase().startsWith('audio/')) {
      continue;
    }
    const data = typeof attachment.data === 'string' && attachment.data.length > 0
      ? attachment.data
      : undefined;
    const url = typeof attachment.url === 'string' && attachment.url.length > 0
      ? attachment.url
      : undefined;
    if (!data && !url) {
      // Audio attachment without a payload or location: keep looking at older ones.
      continue;
    }
    return {
      ...(data ? { data } : {}),
      ...(url ? { url } : {}),
      mime_type: attachment.mimeType,
    };
  }
  return undefined;
}
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
if (typeof rawData !== 'string') {
return undefined;
+12 -6
View File
@@ -7,6 +7,7 @@ import type { MemoryStore } from '../../memory/store.js';
import type { ToolPolicyContext } from '../../tools/policy.js';
import type { Attachment } from '../../channels/types.js';
import { NativeAgent } from './agent.js';
import type { NativeAgentTurnAudioInput } from './agent.js';
import type { ToolUseEvent } from './agent.js';
import type { OutboundAttachmentCollector } from './attachments.js';
import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js';
@@ -339,7 +340,11 @@ export class AgentOrchestrator {
* When compaction is configured, checks whether the conversation history
* exceeds the context window threshold and compacts it before processing.
*/
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
async process(
userMessage: string,
attachments?: Attachment[],
turnAudioInput?: NativeAgentTurnAudioInput,
): Promise<string> {
this._activeRunToolStarts = 0;
this._injectMemoryContext(userMessage);
await this._runProactiveContextMaintenance();
@@ -352,10 +357,10 @@ export class AgentOrchestrator {
let result: string;
try {
result = await this._agent.process(userMessage, attachments);
result = await this._agent.process(userMessage, attachments, turnAudioInput);
} catch {
this._restoreHistory(before);
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier);
const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
if (escalated) {
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
return escalated;
@@ -383,7 +388,7 @@ export class AgentOrchestrator {
if (ctx) {
// Attempt: compact + hard-trim to fit the discovered context window, then retry once.
await this._compactAndTrimToFit(ctx);
const retry = await this._agent.process(userMessage, attachments);
const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
if (!this._isToolLoopErrorMessage(retry)) {
return retry;
}
@@ -391,7 +396,7 @@ export class AgentOrchestrator {
this._restoreHistory(before);
}
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier);
const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
if (escalated) {
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
return escalated;
@@ -419,6 +424,7 @@ export class AgentOrchestrator {
private async _retryWithEscalation(
userMessage: string,
attachments: Attachment[] | undefined,
turnAudioInput: NativeAgentTurnAudioInput | undefined,
historyBefore: Message[],
originalTier: ModelTier,
): Promise<string | null> {
@@ -437,7 +443,7 @@ export class AgentOrchestrator {
this._agent.setModelTier(targetTier);
try {
const retry = await this._agent.process(userMessage, attachments);
const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
if (!this._isToolLoopErrorMessage(retry)) {
return retry;
}