Harden audio transcription arg hydration and add rewrite audit event

This commit is contained in:
William Valentin
2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
+63 -3
View File
@@ -164,6 +164,57 @@ function shouldForceNativeForCapabilityQuery(text: string): boolean {
);
}
/**
 * Reports whether `provider` is one of the provider identifiers treated as
 * accepting native audio content parts.
 *
 * Matching is exact and case-sensitive; any identifier not listed below
 * yields `false`.
 *
 * @param provider - Provider identifier to test.
 * @returns `true` for a listed provider, `false` otherwise.
 */
function providerAcceptsNativeAudioContentParts(provider: string): boolean {
  switch (provider) {
    case 'openai':
    case 'github':
    case 'gemini':
    case 'openrouter':
    case 'zhipuai':
    case 'xai':
    case 'minimax':
    case 'moonshot':
    case 'vercel':
      return true;
    default:
      return false;
  }
}
/** Session-config key under which the most recent audio attachment is stored. */
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';

/**
 * Saves the newest usable audio attachment into the session config as a JSON
 * string, so it can be picked up later for tool hydration.
 *
 * The list is examined from the last element to the first. The first entry
 * that carries either a non-empty `data` string or a non-empty `url` string is
 * stored; `data` takes precedence when both are present. If no entry
 * qualifies, nothing is written.
 *
 * Failures while serialising or writing are reported via `console.warn` and
 * never propagate to the caller.
 *
 * @param session - Object exposing `setConfig(key, value)`.
 * @param audioAttachments - Candidate attachments; later entries win.
 */
function persistLatestAudioAttachment(
  session: { setConfig(key: string, value: string): void },
  audioAttachments: Attachment[],
): void {
  // Walk a reversed copy so the caller's array is never mutated.
  for (const candidate of audioAttachments.slice().reverse()) {
    const inlineData =
      typeof candidate.data === 'string' && candidate.data.length > 0 ? candidate.data : undefined;
    const remoteUrl =
      typeof candidate.url === 'string' && candidate.url.length > 0 ? candidate.url : undefined;

    // `mimeType` is listed first so the serialised key order stays stable.
    let record: { data?: string; url?: string; mimeType?: string };
    if (inlineData !== undefined) {
      record = { mimeType: candidate.mimeType, data: inlineData };
    } else if (remoteUrl !== undefined) {
      record = { mimeType: candidate.mimeType, url: remoteUrl };
    } else {
      // Neither bytes nor a link: keep looking at older attachments.
      continue;
    }

    try {
      session.setConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY, JSON.stringify(record));
    } catch (error) {
      // Best effort only: a failed write must not interrupt message handling.
      console.warn(
        'Failed to persist latest audio attachment for tool hydration:',
        error instanceof Error ? error.message : String(error),
      );
    }
    // Only the newest usable attachment is ever persisted.
    return;
  }
}
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
if (!config.tts?.enabled) {
return false;
@@ -1266,6 +1317,9 @@ export function createMessageRouter(deps: {
let messageText = incomingText;
let attachments = msg.attachments;
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
if (audioAttachments.length > 0) {
persistLatestAudioAttachment(session, audioAttachments);
}
if (audioAttachments.length > 0 && !nativeAudioSupported) {
// Model doesn't support native audio — transcribe via Whisper and strip audio attachments
@@ -1300,9 +1354,15 @@ export function createMessageRouter(deps: {
const transcript = await transcribeAudio(att, audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
}
// Remove audio attachments so buildUserMessage doesn't create audio content parts
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
if (attachments.length === 0) { attachments = undefined; }
// For providers that cannot ingest native audio content parts (e.g. Anthropic),
// keep the original audio attachment available in the tool loop so
// audio.transcribe can still be hydrated from bytes if the model requests it.
// For providers that do accept native audio parts (OpenAI-compatible/Gemini),
// strip audio to avoid sending raw audio to a model tier that was marked as non-audio.
if (providerAcceptsNativeAudioContentParts(modelProvider)) {
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
if (attachments.length === 0) { attachments = undefined; }
}
}
// If native audio IS supported, we pass attachments through unchanged —
// buildUserMessage() in the agent will create native audio content parts