Harden audio transcription arg hydration and add rewrite audit event
This commit is contained in:
@@ -1351,7 +1351,7 @@ describe('daemon audio routing integration', () => {
|
||||
expect(String(msg.text)).toContain('audio transcription is not configured');
|
||||
});
|
||||
|
||||
it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => {
|
||||
it('transcribes voice attachments when transcription is configured and preserves audio for anthropic tool fallback', async () => {
|
||||
const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');
|
||||
|
||||
// Mock transcription endpoint call.
|
||||
@@ -1422,6 +1422,90 @@ describe('daemon audio routing integration', () => {
|
||||
timestamp: Date.now(),
|
||||
} as MessageRouterInput, reply);
|
||||
|
||||
expect(fetchSpy).toHaveBeenCalled();
|
||||
expect(processSpy).toHaveBeenCalledTimes(1);
|
||||
const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
|
||||
expect(String(calledText)).toContain('[Voice message]: hello world');
|
||||
expect(String(calledText)).toContain('caption');
|
||||
const atts = calledAttachments as Array<{ mimeType: string }> | undefined;
|
||||
expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(true);
|
||||
expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
|
||||
expect(session.setConfig).toHaveBeenCalledWith(
|
||||
'lastAudioAttachment',
|
||||
expect.stringContaining('"mimeType":"audio/ogg"'),
|
||||
);
|
||||
});
|
||||
|
||||
it('transcribes voice attachments when transcription is configured and strips audio for openai-compatible providers', async () => {
|
||||
const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');
|
||||
|
||||
const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
statusText: 'OK',
|
||||
json: async () => ({ text: 'hello world' }),
|
||||
} as Response);
|
||||
|
||||
const session = {
|
||||
id: 'telegram:user-voice-3',
|
||||
addMessage: vi.fn(),
|
||||
getHistory: vi.fn(() => []),
|
||||
clear: vi.fn(),
|
||||
replaceHistory: vi.fn(),
|
||||
getConfig: vi.fn(() => undefined),
|
||||
setConfig: vi.fn(),
|
||||
deleteConfig: vi.fn(),
|
||||
};
|
||||
|
||||
const commandRegistry = new CommandRegistry();
|
||||
registerBuiltinCommands(commandRegistry);
|
||||
|
||||
const router = createMessageRouter({
|
||||
sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
|
||||
modelRouter: {
|
||||
getAvailableTiers: () => ['default'],
|
||||
getAllLabels: () => ({ default: 'default' }),
|
||||
getLabel: (tier: string) => tier,
|
||||
} as unknown as MessageRouterDeps['modelRouter'],
|
||||
systemPrompt: 'test prompt',
|
||||
toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
|
||||
toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
|
||||
config: {
|
||||
agents: {
|
||||
primary_tier: 'default',
|
||||
delegation: {
|
||||
compaction: 'default',
|
||||
memory_extraction: 'default',
|
||||
classification: 'default',
|
||||
tool_summarisation: 'default',
|
||||
complex_reasoning: 'default',
|
||||
},
|
||||
max_delegation_depth: 1,
|
||||
max_iterations: 3,
|
||||
},
|
||||
compaction: { enabled: false },
|
||||
models: { default: { provider: 'openai', model: 'gpt-4.1', supports_audio: false } },
|
||||
audio: {
|
||||
enabled: true,
|
||||
provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' },
|
||||
},
|
||||
} as unknown as MessageRouterDeps['config'],
|
||||
commandRegistry,
|
||||
});
|
||||
|
||||
const reply = vi.fn(async (_message: OutboundMessage) => {});
|
||||
await router.handler({
|
||||
id: 'v3',
|
||||
channel: 'telegram',
|
||||
senderId: 'user-voice-3',
|
||||
text: 'caption',
|
||||
attachments: [
|
||||
{ mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' },
|
||||
{ mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' },
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
} as MessageRouterInput, reply);
|
||||
|
||||
expect(fetchSpy).toHaveBeenCalled();
|
||||
expect(processSpy).toHaveBeenCalledTimes(1);
|
||||
const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
|
||||
@@ -1430,6 +1514,10 @@ describe('daemon audio routing integration', () => {
|
||||
const atts = calledAttachments as Array<{ mimeType: string }> | undefined;
|
||||
expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(false);
|
||||
expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
|
||||
expect(session.setConfig).toHaveBeenCalledWith(
|
||||
'lastAudioAttachment',
|
||||
expect.stringContaining('"mimeType":"audio/ogg"'),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
+63
-3
@@ -164,6 +164,57 @@ function shouldForceNativeForCapabilityQuery(text: string): boolean {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Provider identifiers treated as able to ingest native audio content parts.
 * Matching is exact and case-sensitive.
 */
const NATIVE_AUDIO_CONTENT_PART_PROVIDERS: ReadonlySet<string> = new Set([
  'openai',
  'github',
  'gemini',
  'openrouter',
  'zhipuai',
  'xai',
  'minimax',
  'moonshot',
  'vercel',
]);

/**
 * Reports whether the given provider is one of the known providers that accept
 * native audio content parts in a user message.
 *
 * @param provider - Provider identifier as it appears in the model configuration.
 * @returns `true` only when `provider` exactly matches one of the listed identifiers;
 *   any other string (including the empty string or a differently-cased name)
 *   yields `false`.
 */
function providerAcceptsNativeAudioContentParts(provider: string): boolean {
  return NATIVE_AUDIO_CONTENT_PART_PROVIDERS.has(provider);
}
|
||||
|
||||
/** Session config key under which the most recent audio attachment is stored. */
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';

/**
 * Stores the most recent usable audio attachment in the session config as a
 * JSON string of the form `{ mimeType, data }` or `{ mimeType, url }`.
 *
 * The list is scanned from the end; the first attachment that carries a
 * non-empty `data` string or a non-empty `url` string wins. When both are
 * present on that attachment, `data` is stored and `url` is omitted. If no
 * attachment qualifies, nothing is written.
 *
 * Persistence is best-effort: a failure thrown by `session.setConfig` is
 * logged with `console.warn` and not rethrown.
 *
 * @param session - Object exposing `setConfig(key, value)`.
 * @param audioAttachments - Candidate attachments, oldest first. Not mutated.
 */
function persistLatestAudioAttachment(
  session: { setConfig(key: string, value: string): void },
  audioAttachments: Attachment[],
): void {
  const isNonEmptyString = (value: unknown): value is string => (
    typeof value === 'string' && value.length > 0
  );

  let payload: { data?: string; url?: string; mimeType?: string } | undefined;

  // Walk backwards so the latest usable attachment is found without copying
  // and reversing the array.
  for (let index = audioAttachments.length - 1; index >= 0; index -= 1) {
    const candidate = audioAttachments[index];
    if (candidate === undefined) {
      continue;
    }
    // `mimeType` is written first so the serialized key order stays
    // `mimeType` followed by `data`/`url`.
    if (isNonEmptyString(candidate.data)) {
      payload = { mimeType: candidate.mimeType, data: candidate.data };
      break;
    }
    if (isNonEmptyString(candidate.url)) {
      payload = { mimeType: candidate.mimeType, url: candidate.url };
      break;
    }
  }

  if (payload === undefined) {
    return;
  }

  try {
    session.setConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY, JSON.stringify(payload));
  } catch (error) {
    console.warn(
      'Failed to persist latest audio attachment for tool hydration:',
      error instanceof Error ? error.message : String(error),
    );
  }
}
|
||||
|
||||
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
|
||||
if (!config.tts?.enabled) {
|
||||
return false;
|
||||
@@ -1266,6 +1317,9 @@ export function createMessageRouter(deps: {
|
||||
let messageText = incomingText;
|
||||
let attachments = msg.attachments;
|
||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||
if (audioAttachments.length > 0) {
|
||||
persistLatestAudioAttachment(session, audioAttachments);
|
||||
}
|
||||
|
||||
if (audioAttachments.length > 0 && !nativeAudioSupported) {
|
||||
// Model doesn't support native audio — transcribe via Whisper; audio attachments are stripped afterwards only for providers that accept native audio content parts
|
||||
@@ -1300,9 +1354,15 @@ export function createMessageRouter(deps: {
|
||||
const transcript = await transcribeAudio(att, audioConfig);
|
||||
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
|
||||
}
|
||||
// Remove audio attachments so buildUserMessage doesn't create audio content parts
|
||||
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
|
||||
if (attachments.length === 0) { attachments = undefined; }
|
||||
// For providers that cannot ingest native audio content parts (e.g. Anthropic),
|
||||
// keep the original audio attachment available in the tool loop so
|
||||
// audio.transcribe can still be hydrated from bytes if the model requests it.
|
||||
// For providers that do accept native audio parts (OpenAI-compatible/Gemini),
|
||||
// strip audio to avoid sending raw audio to a model tier that was marked as non-audio.
|
||||
if (providerAcceptsNativeAudioContentParts(modelProvider)) {
|
||||
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
|
||||
if (attachments.length === 0) { attachments = undefined; }
|
||||
}
|
||||
}
|
||||
// If native audio IS supported, we pass attachments through unchanged —
|
||||
// buildUserMessage() in the agent will create native audio content parts
|
||||
|
||||
Reference in New Issue
Block a user