Harden audio transcription arg hydration and add rewrite audit event

This commit is contained in:
William Valentin
2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
+295 -1
View File
@@ -9,6 +9,7 @@ import type { Attachment } from '../../channels/types.js';
import type { OutboundAttachmentCollector } from './attachments.js';
import { buildUserMessage } from '../../models/media.js';
import { getElevationWindow } from '../../security/elevation.js';
import { auditLogger } from '../../audit/index.js';
export interface ToolUseEvent {
type: 'start' | 'end';
@@ -62,6 +63,20 @@ interface ExtractedTextToolCall {
end: number;
}
/** Session config key under which the most recent audio attachment is read back as a JSON string. */
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';
/**
 * Audio source handed to the `audio.transcribe` tool.
 *
 * When applied to tool arguments, `data` takes precedence over `url`
 * (only one of the two is kept).
 */
interface AudioToolInput {
  /** Inline audio payload (presumably base64 text — confirm against the tool's schema). */
  data?: string;
  /** Remote location of the audio. */
  url?: string;
  /** MIME type of the audio; snake_case to match the tool argument name. */
  mime_type?: string;
}
/**
 * Payload-free snapshot of `audio.transcribe` arguments, used for audit
 * logging: it records only whether data/url were present and which MIME type
 * was declared, never the audio content itself.
 */
interface AudioToolArgSummary {
  /** True when `data` was a non-empty string. */
  hasData: boolean;
  /** True when `url` was a non-empty string. */
  hasUrl: boolean;
  /** The declared MIME type, when it was a non-empty string. */
  mimeType?: string;
}
export class NativeAgent {
private static readonly EMPTY_RESPONSE_FALLBACK =
'I could not generate a response for that. Please try again.';
@@ -363,7 +378,8 @@ export class NativeAgent {
}
: undefined;
const result = await toolExecutor.execute(internalName, tc.args, perCallContext, {
const toolArgs = this.normalizeToolArgsForExecution(internalName, tc.args);
const result = await toolExecutor.execute(internalName, toolArgs, perCallContext, {
signal: this._runAbortController?.signal,
});
@@ -620,6 +636,284 @@ export class NativeAgent {
return error instanceof Error && error.name === 'AbortError';
}
/**
 * Prepares model-supplied tool arguments for execution.
 *
 * Only `audio.transcribe` arguments are rewritten; arguments for every other
 * tool are returned as-is (same reference).
 *
 * @param toolName - Internal name of the tool about to run.
 * @param rawArgs - Arguments exactly as produced by the model.
 * @returns The arguments to hand to the tool executor.
 */
private normalizeToolArgsForExecution(toolName: string, rawArgs: unknown): unknown {
  return toolName === 'audio.transcribe'
    ? this.hydrateAudioTranscribeArgs(rawArgs)
    : rawArgs;
}
/**
 * Builds trustworthy `audio.transcribe` arguments from whatever the model sent.
 *
 * Works on a shallow copy, so `rawArgs` is never mutated; a non-object
 * `rawArgs` is treated as an empty argument set. Sources are tried in order:
 *
 * 1. Audio attached to the latest user message always replaces the model's
 *    data/url.
 * 2. If the latest user message carries the `[Voice message]:` marker, the
 *    audio persisted on the session is used.
 * 3. Otherwise the model's own `data` (validated) or `url` (validated) is
 *    kept, with `data` preferred; a missing `mime_type` on `data` is
 *    backfilled from the latest known audio.
 * 4. If the model supplied nothing usable, the latest known audio (history,
 *    then persisted) is substituted; when none exists the arguments are
 *    returned with `data` and `url` removed.
 *
 * Steps 1, 2 and 4 emit a rewrite audit event.
 *
 * @param rawArgs - Arguments exactly as produced by the model.
 * @returns A new argument object for the tool executor.
 */
private hydrateAudioTranscribeArgs(rawArgs: unknown): unknown {
  let hydrated: Record<string, unknown>;
  if (rawArgs && typeof rawArgs === 'object') {
    hydrated = { ...(rawArgs as Record<string, unknown>) };
  } else {
    hydrated = {};
  }

  // Snapshot what the model sent before anything below mutates the copy.
  const before = this.summarizeAudioToolArgs(hydrated);

  // 1. Audio on the current user turn wins unconditionally.
  const currentTurnAudio = this.getLatestTurnUserAudioInput();
  if (currentTurnAudio) {
    this.applyAudioToolInput(hydrated, currentTurnAudio);
    this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', before, hydrated);
    return hydrated;
  }

  // 2. Voice-transcript turn: use the audio persisted on the session.
  if (this.isCurrentTurnVoiceTranscriptFallback()) {
    const storedForVoiceTurn = this.getPersistedAudioInput();
    if (storedForVoiceTurn) {
      this.applyAudioToolInput(hydrated, storedForVoiceTurn);
      this.logAudioArgsRewrite('voice_turn_fallback', 'persisted', before, hydrated);
      return hydrated;
    }
  }

  // 3. Keep the model's own input when it validates; data beats url.
  const cleanData = this.normalizeAudioTranscribeDataArg(hydrated.data, hydrated.mime_type);
  const cleanUrl = this.normalizeAudioTranscribeUrlArg(hydrated.url);

  if (cleanData) {
    hydrated.data = cleanData;
    delete hydrated.url;

    const declaredMime = hydrated.mime_type;
    if (typeof declaredMime !== 'string' || declaredMime.length === 0) {
      // Inline data without a MIME type: borrow it from the latest known audio.
      const mimeDonor = this.getLatestUserAudioInput();
      if (mimeDonor?.mime_type) {
        hydrated.mime_type = mimeDonor.mime_type;
      }
    }
    return hydrated;
  }

  if (cleanUrl) {
    hydrated.url = cleanUrl;
    delete hydrated.data;
    return hydrated;
  }

  delete hydrated.data;
  delete hydrated.url;

  // 4. Nothing usable from the model: substitute the latest known audio.
  const replacement = this.getLatestUserAudioInput();
  if (!replacement) {
    return hydrated;
  }

  // Label the source as persisted when the replacement matches the persisted entry.
  const stored = this.getPersistedAudioInput();
  const matchesStored = stored?.data === replacement.data
    && stored?.mime_type === replacement.mime_type;
  const source: 'history' | 'persisted' = matchesStored ? 'persisted' : 'history';

  this.applyAudioToolInput(hydrated, replacement);

  const reason = before.hasData || before.hasUrl ? 'invalid_model_args' : 'missing_model_args';
  this.logAudioArgsRewrite(reason, source, before, hydrated);
  return hydrated;
}
/**
 * Captures which audio fields are present in a tool argument object without
 * copying the audio payload itself.
 *
 * @param args - Tool arguments to inspect (not modified).
 * @returns Presence flags for `data` / `url` and the declared MIME type, each
 * counting only non-empty strings.
 */
private summarizeAudioToolArgs(args: Record<string, unknown>): AudioToolArgSummary {
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;

  const { data, url, mime_type: declaredMime } = args;

  return {
    hasData: isNonEmptyString(data),
    hasUrl: isNonEmptyString(url),
    mimeType: isNonEmptyString(declaredMime) ? declaredMime : undefined,
  };
}
/**
 * Overwrites the audio fields of `args` in place with the given audio source.
 *
 * Exactly one of `data` / `url` survives: `data` when the source has it,
 * otherwise `url`, otherwise neither. `mime_type` is replaced only when the
 * source provides one; an existing value is kept otherwise.
 *
 * @param args - Tool arguments to mutate.
 * @param audio - Audio source to apply.
 */
private applyAudioToolInput(args: Record<string, unknown>, audio: AudioToolInput): void {
  const { data, url, mime_type: mimeType } = audio;

  // Inline data has priority over a URL.
  if (data) {
    args.data = data;
  } else {
    delete args.data;
  }

  // A URL is kept only when there is no inline data.
  if (!data && url) {
    args.url = url;
  } else {
    delete args.url;
  }

  if (mimeType) {
    args.mime_type = mimeType;
  }
}
/**
 * Emits a `toolArgsRewritten` audit event describing how the
 * `audio.transcribe` arguments were replaced.
 *
 * Only presence flags and MIME types are reported; audio content is never
 * included. Does nothing when no audit logger is available.
 *
 * @param reason - Why the model's arguments were rewritten.
 * @param source - Where the replacement audio came from.
 * @param original - Summary of the arguments as the model supplied them.
 * @param normalizedArgs - The arguments after rewriting.
 */
private logAudioArgsRewrite(
  reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args',
  source: 'latest_turn' | 'history' | 'persisted',
  original: AudioToolArgSummary,
  normalizedArgs: Record<string, unknown>,
): void {
  const rewrittenMime = normalizedArgs.mime_type;
  let finalMime: string | undefined;
  if (typeof rewrittenMime === 'string' && rewrittenMime.length > 0) {
    finalMime = rewrittenMime;
  }

  if (auditLogger == null) {
    return;
  }

  auditLogger.toolArgsRewritten({
    tool_name: 'audio.transcribe',
    session_id: this.session?.id,
    source,
    reason,
    original_has_data: original.hasData,
    original_has_url: original.hasUrl,
    original_mime_type: original.mimeType,
    final_mime_type: finalMime,
  });
}
/**
 * Reports whether the most recent user message carries the
 * `[Voice message]:` marker.
 *
 * Only the newest user message is examined. String content is searched
 * directly; array content matches when any text part contains the marker.
 *
 * @returns `true` when the marker is found; `false` when it is absent, the
 * content is neither a string nor an array, or there is no user message.
 */
private isCurrentTurnVoiceTranscriptFallback(): boolean {
  let idx = this.history.length;
  while (idx-- > 0) {
    const entry = this.history[idx];
    if (entry.role === 'user') {
      // The newest user message decides the answer; older ones are never consulted.
      const body = entry.content;
      if (typeof body === 'string') {
        return body.includes('[Voice message]:');
      }
      if (!Array.isArray(body)) {
        return false;
      }
      return body.some((segment) => (
        segment.type === 'text'
        && typeof segment.text === 'string'
        && segment.text.includes('[Voice message]:')
      ));
    }
  }
  return false;
}
/**
 * Returns the audio attached to the most recent user message, if any.
 *
 * Only the newest user message is inspected. Earlier messages and the
 * persisted session attachment are not consulted.
 *
 * @returns The first audio part of that message with non-empty `data` and
 * `media_type`, or `null` when the message has non-array content, has no
 * usable audio part, or no user message exists.
 */
private getLatestTurnUserAudioInput(): AudioToolInput | null {
  // Walk back to the newest user message.
  let cursor = this.history.length - 1;
  while (cursor >= 0 && this.history[cursor].role !== 'user') {
    cursor--;
  }
  if (cursor < 0) {
    return null;
  }

  const content = this.history[cursor].content;
  if (!Array.isArray(content)) {
    return null;
  }

  for (const segment of content) {
    if (segment.type === 'audio') {
      const src = segment.source;
      if (typeof src.data === 'string' && src.data.length > 0 && typeof src.media_type === 'string' && src.media_type.length > 0) {
        return { data: src.data, mime_type: src.media_type };
      }
    }
  }
  return null;
}
/**
 * Validates a model-supplied `data` argument as base64 audio.
 *
 * All whitespace is stripped first. The remainder must be non-empty, use only
 * the standard base64 alphabet (including `=`), decode to at least one byte,
 * and match the signature expected for the declared MIME type.
 *
 * @param rawData - The `data` argument as received.
 * @param rawMimeType - The `mime_type` argument as received; ignored unless
 * it is a string.
 * @returns The whitespace-free base64 string, or `undefined` when the value
 * is unusable.
 */
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
  if (typeof rawData !== 'string') {
    return undefined;
  }

  const stripped = rawData.replace(/\s+/g, '');
  const looksLikeBase64 = stripped.length > 0 && /^[A-Za-z0-9+/=]+$/.test(stripped);
  if (!looksLikeBase64) {
    return undefined;
  }

  try {
    const bytes = Buffer.from(stripped, 'base64');
    const declaredMime = typeof rawMimeType === 'string' ? rawMimeType : undefined;
    const usable = bytes.length > 0 && this.matchesAudioSignature(bytes, declaredMime);
    return usable ? stripped : undefined;
  } catch {
    return undefined;
  }
}
/**
 * Checks that decoded audio bytes begin with the container signature implied
 * by the declared MIME type.
 *
 * The MIME type is compared on its bare, lower-cased `type/subtype`: media
 * types are case-insensitive and may carry parameters, so `Audio/OGG` and
 * `audio/ogg; codecs=opus` are validated exactly like `audio/ogg` rather than
 * being treated as unknown types and accepted unchecked.
 *
 * @param buffer - Decoded audio bytes.
 * @param mimeType - Declared MIME type, if any.
 * @returns `true` when the signature matches, when no MIME type is given, or
 * when the type has no signature rule here; `false` on a mismatch.
 */
private matchesAudioSignature(buffer: Buffer, mimeType?: string): boolean {
  // Drop any ";param=value" suffix and normalise case before matching.
  const essence = mimeType?.split(';', 1)[0]?.trim().toLowerCase();
  if (!essence) {
    return true;
  }

  // True when `value` (ASCII) sits at `offset`; false if the buffer is too short.
  const hasAsciiAt = (offset: number, value: string): boolean =>
    buffer.length >= offset + value.length
    && buffer.subarray(offset, offset + value.length).toString('ascii') === value;

  switch (essence) {
    case 'audio/ogg':
      return hasAsciiAt(0, 'OggS');
    case 'audio/wav':
      return hasAsciiAt(0, 'RIFF') && hasAsciiAt(8, 'WAVE');
    case 'audio/webm':
      // EBML header magic.
      return buffer.length >= 4
        && buffer[0] === 0x1A
        && buffer[1] === 0x45
        && buffer[2] === 0xDF
        && buffer[3] === 0xA3;
    case 'audio/mpeg':
    case 'audio/mp3':
      // Either an ID3 tag or an MPEG frame sync (eleven set bits).
      return hasAsciiAt(0, 'ID3')
        || (buffer.length >= 2 && buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0);
    case 'audio/mp4':
    case 'audio/x-m4a':
      return hasAsciiAt(4, 'ftyp');
    default:
      // No signature rule for this type: do not reject.
      return true;
  }
}
/**
 * Validates a model-supplied `url` argument.
 *
 * @param rawUrl - The `url` argument as received.
 * @returns The trimmed URL when it is a string starting with `http://` or
 * `https://` (case-insensitive); otherwise `undefined`.
 */
private normalizeAudioTranscribeUrlArg(rawUrl: unknown): string | undefined {
  // Non-strings collapse to '', which the scheme test below rejects.
  const candidate = typeof rawUrl === 'string' ? rawUrl.trim() : '';
  return /^https?:\/\//i.test(candidate) ? candidate : undefined;
}
/**
 * Finds the most recent audio the user has sent.
 *
 * Scans user messages newest-first and returns the first audio part with
 * non-empty `data` and `media_type`. When history contains none, falls back
 * to the audio persisted on the session.
 *
 * @returns The latest audio input, or `null` when neither history nor the
 * session has one.
 */
private getLatestUserAudioInput(): AudioToolInput | null {
  let idx = this.history.length;
  while (idx-- > 0) {
    const entry = this.history[idx];
    if (entry.role === 'user' && Array.isArray(entry.content)) {
      for (const segment of entry.content) {
        if (segment.type === 'audio') {
          const src = segment.source;
          if (typeof src.data === 'string' && src.data.length > 0 && typeof src.media_type === 'string' && src.media_type.length > 0) {
            return { data: src.data, mime_type: src.media_type };
          }
        }
      }
    }
  }
  return this.getPersistedAudioInput();
}
/**
 * Reads the last audio attachment stored in the session config.
 *
 * The stored value is parsed as JSON with optional `data`, `url` and
 * `mimeType` fields; only non-empty strings are accepted, and `mimeType` is
 * exposed as `mime_type`.
 *
 * @returns The stored audio input, or `null` when there is no session or
 * stored value, the JSON is malformed, or it has neither `data` nor `url`.
 */
private getPersistedAudioInput(): AudioToolInput | null {
  const raw = this.session?.getConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY);
  if (!raw) {
    return null;
  }

  const nonEmptyString = (value: unknown): string | undefined =>
    (typeof value === 'string' && value.length > 0 ? value : undefined);

  try {
    const stored = JSON.parse(raw) as { data?: unknown; url?: unknown; mimeType?: unknown };
    // Field reads stay inside the try: a non-object payload (e.g. `null`) throws here.
    const data = nonEmptyString(stored.data);
    const url = nonEmptyString(stored.url);
    const mimeType = nonEmptyString(stored.mimeType);

    if (!data && !url) {
      return null;
    }

    const result: AudioToolInput = {};
    if (data) {
      result.data = data;
    }
    if (url) {
      result.url = url;
    }
    if (mimeType) {
      result.mime_type = mimeType;
    }
    return result;
  } catch {
    return null;
  }
}
private extractPseudoToolUse(content: string): PseudoToolUse | null {
if (!content) {
return null;