Bind audio.transcribe hydration to current message turn

This commit is contained in:
William Valentin
2026-02-22 21:27:09 -08:00
parent 89246e7da0
commit a761813375
5 changed files with 180 additions and 11 deletions
+70 -1
View File
@@ -77,6 +77,12 @@ interface AudioToolArgSummary {
mimeType?: string;
}
/**
 * Audio payload a caller may hand to `NativeAgent.process` for the current turn.
 *
 * Every field is optional. When the agent normalizes this value, fields holding
 * an empty string are dropped, and an input that ends up with neither `data`
 * nor `url` is discarded, in which case `process` falls back to audio found in
 * the turn's attachments.
 */
export interface NativeAgentTurnAudioInput {
/** Inline audio content as a string. NOTE(review): presumably encoded (e.g. base64) — confirm against callers. */
data?: string;
/** String locating the audio instead of (or in addition to) inline `data`. */
url?: string;
/** MIME type describing the audio; kept only when `data` or `url` is present. */
mime_type?: string;
}
export class NativeAgent {
private static readonly EMPTY_RESPONSE_FALLBACK =
'I could not generate a response for that. Please try again.';
@@ -100,6 +106,7 @@ export class NativeAgent {
private _runInProgress = false;
private _runAbortController?: AbortController;
private modelTimeoutMs: number;
private _currentTurnAudioInput?: AudioToolInput;
constructor(config: NativeAgentConfig) {
this.modelClient = config.modelClient;
@@ -120,9 +127,14 @@ export class NativeAgent {
return this.session?.getHistory() ?? [...this.inMemoryHistory];
}
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
async process(
userMessage: string,
attachments?: Attachment[],
turnAudioInput?: NativeAgentTurnAudioInput,
): Promise<string> {
this._cancelRequested = false;
this._runAbortController = new AbortController();
this._currentTurnAudioInput = this.normalizeTurnAudioInput(turnAudioInput) ?? this.extractLatestAudioInputFromAttachments(attachments);
if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') {
this.modelClient.clearAbort();
}
@@ -162,6 +174,7 @@ export class NativeAgent {
this._runInProgress = false;
this._cancelRequested = false;
this._runAbortController = undefined;
this._currentTurnAudioInput = undefined;
}
}
@@ -649,6 +662,12 @@ export class NativeAgent {
: {};
const original = this.summarizeAudioToolArgs(args);
if (this._currentTurnAudioInput) {
this.applyAudioToolInput(args, this._currentTurnAudioInput);
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
return args;
}
const latestTurnAudio = this.getLatestTurnUserAudioInput();
if (latestTurnAudio) {
this.applyAudioToolInput(args, latestTurnAudio);
@@ -794,6 +813,56 @@ export class NativeAgent {
return null;
}
/**
 * Reduces caller-supplied turn audio to the fields that actually carry a value.
 *
 * A field counts as present only when it is a non-empty string. The input is
 * usable only if it has `data` or `url`; `mime_type` alone is not enough.
 *
 * @param turnAudioInput - Audio input passed for the current turn, if any.
 * @returns An object holding just the present `data`, `url` and `mime_type`
 *   fields, or `undefined` when the input is missing or has neither `data`
 *   nor `url`.
 */
private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined {
  // Anything that is not a non-empty string is treated as "not provided".
  const nonEmpty = (candidate: unknown): string | undefined =>
    typeof candidate === 'string' && candidate.length > 0 ? candidate : undefined;

  const data = nonEmpty(turnAudioInput?.data);
  const url = nonEmpty(turnAudioInput?.url);
  const mimeType = nonEmpty(turnAudioInput?.mime_type);

  // Without a payload or a location there is nothing to transcribe.
  const hasSource = data !== undefined || url !== undefined;
  if (!hasSource) {
    return undefined;
  }

  // Absent fields are omitted entirely rather than set to undefined.
  return {
    ...(data === undefined ? {} : { data }),
    ...(url === undefined ? {} : { url }),
    ...(mimeType === undefined ? {} : { mime_type: mimeType }),
  };
}
/**
 * Finds the audio input for this turn among the message attachments.
 *
 * Attachments are examined from last to first. The first one whose MIME type
 * starts with `audio/` and that has a non-empty string `data` or `url` wins;
 * audio attachments with neither are passed over.
 *
 * @param attachments - Attachments supplied with the current message, if any.
 * @returns The winning attachment's non-empty `data`/`url` together with its
 *   MIME type as `mime_type`, or `undefined` when no attachment qualifies.
 */
private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined {
  // Anything that is not a non-empty string is treated as "not provided".
  const nonEmpty = (candidate: unknown): string | undefined =>
    typeof candidate === 'string' && candidate.length > 0 ? candidate : undefined;

  // Reverse a copy so the caller's array is untouched while we scan newest-first.
  const newestFirst = [...(attachments ?? [])].reverse();

  for (const candidate of newestFirst) {
    const isAudio = candidate.mimeType.startsWith('audio/');
    if (isAudio) {
      const data = nonEmpty(candidate.data);
      const url = nonEmpty(candidate.url);

      // An audio attachment with no usable source is skipped, not returned.
      if (data !== undefined || url !== undefined) {
        return {
          ...(data === undefined ? {} : { data }),
          ...(url === undefined ? {} : { url }),
          mime_type: candidate.mimeType,
        };
      }
    }
  }

  return undefined;
}
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
if (typeof rawData !== 'string') {
return undefined;