Harden audio transcription arg hydration and add rewrite audit event
This commit is contained in:
@@ -199,6 +199,458 @@ describe('NativeAgent tool loop', () => {
|
||||
expect(mockClient.chat).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
// audio.transcribe argument hydration
//
// Every case below drives the same two-step conversation: the mocked model
// first answers with a single `audio_transcribe` tool call, then with the
// plain text 'done'. The cases differ only in the arguments the model sends,
// whether the turn carries an audio attachment, and whether a session with a
// persisted `lastAudioAttachment` is attached. The shared runner captures the
// arguments that actually reach the registered `audio.transcribe` tool.
// ---------------------------------------------------------------------------

/** Inputs that vary between the audio hydration cases. */
interface AudioHydrationScenario {
  /** Arguments the mocked model supplies with its `audio_transcribe` tool call. */
  modelArgs: Record<string, unknown>;
  /** Attachments handed to `agent.process` for the turn; omitted means a text-only call. */
  attachments?: Parameters<NativeAgent['process']>[1];
  /** When set, a mock session is attached whose `lastAudioAttachment` config is this value as JSON. */
  persistedAudio?: { data: string; mimeType: string };
  /** What the mock session's `getHistory` returns (only relevant together with `persistedAudio`). */
  sessionHistory?: unknown[];
}

/** Audio stored in the mock session's `lastAudioAttachment` config entry. */
const PERSISTED_SESSION_AUDIO = { data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' };

/**
 * Runs one agent turn ('Please transcribe this') against a mocked model and a
 * stub `audio.transcribe` tool.
 *
 * @returns the agent's final response and the args the tool's `execute` received
 *          (`undefined` if the tool was never invoked).
 */
async function runAudioHydrationScenario(scenario: AudioHydrationScenario) {
  const { modelArgs, attachments, persistedAudio, sessionHistory = [] } = scenario;

  let callCount = 0;
  let seenArgs: Record<string, unknown> | undefined;

  const mockClient: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      callCount++;
      // First model call requests the tool; every later call ends the turn.
      if (callCount === 1) {
        return {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: modelArgs }],
        };
      }
      return {
        content: 'done',
        stopReason: 'end_turn',
        usage: { inputTokens: 15, outputTokens: 10 },
      };
    }),
  };

  const audioTool: Tool = {
    name: 'audio.transcribe',
    description: 'Transcribe audio',
    inputSchema: { type: 'object', properties: {} },
    execute: async (args) => {
      // Record exactly what the agent forwarded so the case can assert on it.
      seenArgs = args as Record<string, unknown>;
      return { success: true, output: 'transcript' };
    },
  };

  const registry = new ToolRegistry();
  registry.register(audioTool);
  const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
  const executor = new ToolExecutor(registry, hooks);

  // A session is only attached for cases that exercise persisted audio.
  const mockSession = persistedAudio
    ? {
      id: 'telegram:user-audio',
      getHistory: vi.fn().mockReturnValue(sessionHistory),
      addMessage: vi.fn(),
      clear: vi.fn(),
      replaceHistory: vi.fn(),
      getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
        ? JSON.stringify(persistedAudio)
        : undefined)),
      setConfig: vi.fn(),
      deleteConfig: vi.fn(),
    }
    : undefined;

  const agent = new NativeAgent({
    modelClient: mockClient,
    systemPrompt: 'You are helpful.',
    ...(mockSession ? { session: mockSession } : {}),
    toolRegistry: registry,
    toolExecutor: executor,
  });

  const response = attachments
    ? await agent.process('Please transcribe this', attachments)
    : await agent.process('Please transcribe this');

  return { response, seenArgs };
}

it('hydrates missing audio.transcribe args from latest user audio attachment', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    modelArgs: {},
    attachments: [{
      mimeType: 'audio/ogg',
      data: 'QUJDRA==',
      filename: 'voice.ogg',
    }],
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'QUJDRA==',
    mime_type: 'audio/ogg',
  }));
});

it('hydrates missing audio.transcribe args from persisted session audio attachment', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    modelArgs: {},
    persistedAudio: PERSISTED_SESSION_AUDIO,
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'U0VTU0lPTg==',
    mime_type: 'audio/ogg',
  }));
});

it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    modelArgs: { url: 'file://voice_message', mime_type: 'audio/ogg' },
    persistedAudio: PERSISTED_SESSION_AUDIO,
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'U0VTU0lPTg==',
    mime_type: 'audio/ogg',
  }));
  expect(seenArgs).not.toHaveProperty('url');
});

it('replaces text-like base64 audio.transcribe data with persisted session audio data', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    // Valid base64, but the decoded bytes are plain text rather than WAV audio.
    modelArgs: { data: 'VGhpcyBvbmUgdHdvIHRocmVl', mime_type: 'audio/wav' },
    persistedAudio: PERSISTED_SESSION_AUDIO,
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'U0VTU0lPTg==',
    mime_type: 'audio/ogg',
  }));
});

it('forces persisted audio on voice-transcript fallback turns', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    // The model's payload decodes to a RIFF/WAVE header, yet the persisted audio must still win.
    modelArgs: {
      data: 'UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAAAA',
      mime_type: 'audio/wav',
    },
    persistedAudio: PERSISTED_SESSION_AUDIO,
    sessionHistory: [
      { role: 'user', content: '[Voice message]: hello world\n\ncaption' },
    ],
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'U0VTU0lPTg==',
    mime_type: 'audio/ogg',
  }));
});

it('replaces placeholder audio.transcribe data with latest attachment bytes', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    modelArgs: { data: '[voice message data not provided]', mime_type: 'audio/ogg' },
    attachments: [{
      mimeType: 'audio/ogg',
      data: 'QUJDRA==',
      filename: 'voice.ogg',
    }],
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'QUJDRA==',
    mime_type: 'audio/ogg',
  }));
});

it('overrides model-provided base64 with latest turn audio attachment bytes', async () => {
  const { response, seenArgs } = await runAudioHydrationScenario({
    modelArgs: { data: 'd3Jvbmc=', mime_type: 'audio/ogg' },
    attachments: [{
      mimeType: 'audio/ogg',
      data: 'QUJDRA==',
      filename: 'voice.ogg',
    }],
  });

  expect(response).toBe('done');
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'QUJDRA==',
    mime_type: 'audio/ogg',
  }));
});
|
||||
|
||||
it('respects max iterations when tool calls vary', async () => {
|
||||
// Model always returns tool_use but with different args each time (no loop detection)
|
||||
let callCount = 0;
|
||||
|
||||
@@ -9,6 +9,7 @@ import type { Attachment } from '../../channels/types.js';
|
||||
import type { OutboundAttachmentCollector } from './attachments.js';
|
||||
import { buildUserMessage } from '../../models/media.js';
|
||||
import { getElevationWindow } from '../../security/elevation.js';
|
||||
import { auditLogger } from '../../audit/index.js';
|
||||
|
||||
export interface ToolUseEvent {
|
||||
type: 'start' | 'end';
|
||||
@@ -62,6 +63,20 @@ interface ExtractedTextToolCall {
|
||||
end: number;
|
||||
}
|
||||
|
||||
/** Session config key under which the most recent audio attachment is looked up. */
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';

/**
 * Audio payload in the shape of the `audio.transcribe` tool arguments.
 * Either `data` or `url` identifies the audio; `mime_type` is optional.
 */
interface AudioToolInput {
  /** Inline audio payload (handled as a base64 string by the arg normalizer). */
  data?: string;
  /** Remote location of the audio. */
  url?: string;
  /** Media type of the audio, e.g. `audio/ogg`. */
  mime_type?: string;
}

/**
 * Content-free snapshot of the model's original `audio.transcribe` arguments,
 * recorded for the rewrite audit event.
 */
interface AudioToolArgSummary {
  /** Whether `data` was a non-empty string. */
  hasData: boolean;
  /** Whether `url` was a non-empty string. */
  hasUrl: boolean;
  /** The supplied `mime_type` when it was a non-empty string. */
  mimeType?: string;
}
|
||||
|
||||
export class NativeAgent {
|
||||
private static readonly EMPTY_RESPONSE_FALLBACK =
|
||||
'I could not generate a response for that. Please try again.';
|
||||
@@ -363,7 +378,8 @@ export class NativeAgent {
|
||||
}
|
||||
: undefined;
|
||||
|
||||
const result = await toolExecutor.execute(internalName, tc.args, perCallContext, {
|
||||
const toolArgs = this.normalizeToolArgsForExecution(internalName, tc.args);
|
||||
const result = await toolExecutor.execute(internalName, toolArgs, perCallContext, {
|
||||
signal: this._runAbortController?.signal,
|
||||
});
|
||||
|
||||
@@ -620,6 +636,284 @@ export class NativeAgent {
|
||||
return error instanceof Error && error.name === 'AbortError';
|
||||
}
|
||||
|
||||
/**
 * Hook that lets tool-specific argument normalization run right before a tool executes.
 * Only `audio.transcribe` is rewritten; every other tool gets its arguments back untouched.
 *
 * @param toolName Internal (dotted) tool name.
 * @param rawArgs Arguments exactly as produced by the model.
 * @returns The arguments to execute the tool with.
 */
private normalizeToolArgsForExecution(toolName: string, rawArgs: unknown): unknown {
  return toolName === 'audio.transcribe'
    ? this.hydrateAudioTranscribeArgs(rawArgs)
    : rawArgs;
}
|
||||
|
||||
/**
 * Builds the argument object that is actually passed to the `audio.transcribe` tool.
 *
 * Audio sources are tried in this order:
 *  1. Audio attached to the most recent user message: always wins, even over model-supplied args.
 *  2. The persisted session attachment, when the most recent user message is a
 *     "[Voice message]:" transcript.
 *  3. The model's own `data` / `url`, provided they survive validation.
 *  4. Any audio found in history, falling back to the persisted session attachment.
 *
 * Whenever stored audio replaces what the model sent (cases 1, 2 and 4) an audit
 * event is emitted through `logAudioArgsRewrite`.
 *
 * @param rawArgs Arguments as emitted by the model; may be missing or not a plain object.
 * @returns A new argument object; `rawArgs` itself is never mutated.
 */
private hydrateAudioTranscribeArgs(rawArgs: unknown): unknown {
  // Shallow-copy so the model's tool call is left untouched. Arrays also report
  // typeof 'object'; spreading one would produce index-keyed junk args, so they
  // are treated like missing args.
  const args: Record<string, unknown> = (rawArgs && typeof rawArgs === 'object' && !Array.isArray(rawArgs))
    ? { ...(rawArgs as Record<string, unknown>) }
    : {};
  // Snapshot what the model sent before anything is rewritten (used for auditing).
  const original = this.summarizeAudioToolArgs(args);

  // 1. Audio attached to the latest user message overrides whatever the model sent.
  const latestTurnAudio = this.getLatestTurnUserAudioInput();
  if (latestTurnAudio) {
    this.applyAudioToolInput(args, latestTurnAudio);
    this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
    return args;
  }

  // 2. Voice-transcript turns: use the persisted attachment instead of the model's args.
  if (this.isCurrentTurnVoiceTranscriptFallback()) {
    const persistedAudio = this.getPersistedAudioInput();
    if (persistedAudio) {
      this.applyAudioToolInput(args, persistedAudio);
      this.logAudioArgsRewrite('voice_turn_fallback', 'persisted', original, args);
      return args;
    }
  }

  // 3. Validate the model's own args; keep at most one of data/url, data first.
  const normalizedData = this.normalizeAudioTranscribeDataArg(args.data, args.mime_type);
  const normalizedUrl = this.normalizeAudioTranscribeUrlArg(args.url);
  if (normalizedData) {
    args.data = normalizedData;
    delete args.url;
  } else if (normalizedUrl) {
    args.url = normalizedUrl;
    delete args.data;
  } else {
    delete args.data;
    delete args.url;
  }

  const hasData = typeof args.data === 'string' && args.data.length > 0;
  const hasUrl = typeof args.url === 'string' && args.url.length > 0;
  if (hasData || hasUrl) {
    // Inline data without a usable mime type borrows the one from the latest known audio.
    if (hasData && (typeof args.mime_type !== 'string' || args.mime_type.length === 0)) {
      const latestAudioForMime = this.getLatestUserAudioInput();
      if (latestAudioForMime?.mime_type) {
        args.mime_type = latestAudioForMime.mime_type;
      }
    }
    return args;
  }

  // 4. Nothing usable from the model: fall back to history / persisted audio.
  const latestAudio = this.getLatestUserAudioInput();
  if (!latestAudio) {
    return args;
  }
  // Label the source as 'persisted' when the chosen audio matches the persisted attachment.
  const persistedAudio = this.getPersistedAudioInput();
  const source: 'history' | 'persisted' = persistedAudio?.data === latestAudio.data
    && persistedAudio?.mime_type === latestAudio.mime_type
    ? 'persisted'
    : 'history';
  this.applyAudioToolInput(args, latestAudio);
  this.logAudioArgsRewrite(original.hasData || original.hasUrl ? 'invalid_model_args' : 'missing_model_args', source, original, args);
  return args;
}
|
||||
|
||||
/**
 * Reduces raw `audio.transcribe` arguments to a content-free summary:
 * whether `data` / `url` are non-empty strings, and the `mime_type` if it is one.
 */
private summarizeAudioToolArgs(args: Record<string, unknown>): AudioToolArgSummary {
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;

  const declaredMime = args.mime_type;
  return {
    hasData: isNonEmptyString(args.data),
    hasUrl: isNonEmptyString(args.url),
    mimeType: isNonEmptyString(declaredMime) ? declaredMime : undefined,
  };
}
|
||||
|
||||
/**
 * Overwrites the audio-related keys of `args` (in place) with the given audio input.
 * `data` takes precedence over `url`; whichever is not used is removed, and both are
 * removed when the input carries neither. `mime_type` is only replaced when the input
 * provides one.
 */
private applyAudioToolInput(args: Record<string, unknown>, audio: AudioToolInput): void {
  const { data: payload, url: location, mime_type: mediaType } = audio;

  if (payload) {
    args.data = payload;
    delete args.url;
  } else {
    delete args.data;
    if (location) {
      args.url = location;
    } else {
      delete args.url;
    }
  }

  if (mediaType) {
    args.mime_type = mediaType;
  }
}
|
||||
|
||||
/**
 * Emits a `toolArgsRewritten` audit event describing an `audio.transcribe` rewrite.
 * Only flags and mime types are reported, never the audio payload itself.
 *
 * @param reason Why the arguments were rewritten.
 * @param source Where the replacement audio came from.
 * @param original Summary of the arguments the model originally supplied.
 * @param normalizedArgs The arguments after rewriting (only `mime_type` is read).
 */
private logAudioArgsRewrite(
  reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args',
  source: 'latest_turn' | 'history' | 'persisted',
  original: AudioToolArgSummary,
  normalizedArgs: Record<string, unknown>,
): void {
  const rewrittenMime = normalizedArgs.mime_type;
  const hasRewrittenMime = typeof rewrittenMime === 'string' && rewrittenMime.length > 0;

  auditLogger?.toolArgsRewritten({
    tool_name: 'audio.transcribe',
    session_id: this.session?.id,
    source,
    reason,
    original_has_data: original.hasData,
    original_has_url: original.hasUrl,
    original_mime_type: original.mimeType,
    final_mime_type: hasRewrittenMime ? rewrittenMime : undefined,
  });
}
|
||||
|
||||
/**
 * Reports whether the most recent user message in history contains the
 * "[Voice message]:" marker, either as its string content or inside one of
 * its text parts. Only that single message is inspected.
 */
private isCurrentTurnVoiceTranscriptFallback(): boolean {
  // Walk backwards to the newest user message.
  let cursor = this.history.length - 1;
  while (cursor >= 0 && this.history[cursor].role !== 'user') {
    cursor--;
  }
  if (cursor < 0) {
    return false;
  }

  const latestUserMessage = this.history[cursor];
  if (typeof latestUserMessage.content === 'string') {
    return latestUserMessage.content.includes('[Voice message]:');
  }
  if (!Array.isArray(latestUserMessage.content)) {
    return false;
  }
  return latestUserMessage.content.some((part) => (
    part.type === 'text'
    && typeof part.text === 'string'
    && part.text.includes('[Voice message]:')
  ));
}
|
||||
|
||||
/**
 * Returns the first usable audio part of the most recent user message, or `null`.
 * Unlike `getLatestUserAudioInput`, older messages and persisted session audio are
 * never consulted: if the newest user message has no audio, the answer is `null`.
 */
private getLatestTurnUserAudioInput(): AudioToolInput | null {
  let cursor = this.history.length;
  while (cursor-- > 0) {
    const candidate = this.history[cursor];
    if (candidate.role !== 'user') {
      continue;
    }

    // This is the newest user message; whatever it holds decides the result.
    if (!Array.isArray(candidate.content)) {
      return null;
    }
    for (const part of candidate.content) {
      if (part.type === 'audio') {
        const audioSource = part.source;
        const hasPayload = typeof audioSource.data === 'string' && audioSource.data.length > 0;
        const hasMediaType = typeof audioSource.media_type === 'string' && audioSource.media_type.length > 0;
        if (hasPayload && hasMediaType) {
          return { data: audioSource.data, mime_type: audioSource.media_type };
        }
      }
    }
    return null;
  }
  return null;
}
|
||||
|
||||
/**
 * Validates a model-supplied `data` argument for `audio.transcribe`.
 *
 * The value must be a string that, once all whitespace is removed, consists only of
 * base64 alphabet characters, decodes to at least one byte, and passes
 * `matchesAudioSignature` for the declared mime type.
 *
 * @param rawData Candidate `data` argument.
 * @param rawMimeType Candidate `mime_type` argument; ignored unless it is a string.
 * @returns The whitespace-stripped base64 string, or `undefined` when the value is unusable.
 */
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
  if (typeof rawData !== 'string') {
    return undefined;
  }

  const stripped = rawData.replace(/\s+/g, '');
  const looksLikeBase64 = stripped.length > 0 && /^[A-Za-z0-9+/=]+$/.test(stripped);
  if (!looksLikeBase64) {
    return undefined;
  }

  try {
    const bytes = Buffer.from(stripped, 'base64');
    if (bytes.length === 0) {
      return undefined;
    }
    const declaredMime = typeof rawMimeType === 'string' ? rawMimeType : undefined;
    return this.matchesAudioSignature(bytes, declaredMime) ? stripped : undefined;
  } catch {
    return undefined;
  }
}
|
||||
|
||||
/**
 * Checks whether `buffer` starts with the magic bytes expected for `mimeType`.
 *
 * The mime type is compared by its essence only: parameters (everything from the
 * first `;`) are dropped and the comparison is case-insensitive, so
 * `audio/ogg; codecs=opus` and `Audio/OGG` are validated like `audio/ogg`.
 * A missing/blank mime type, or one without a known signature, is accepted as-is.
 *
 * @param buffer Decoded audio bytes.
 * @param mimeType Declared media type of the bytes, if any.
 * @returns `false` only when a known signature is expected and not found.
 */
private matchesAudioSignature(buffer: Buffer, mimeType?: string): boolean {
  const essence = mimeType?.split(';', 1)[0]?.trim().toLowerCase();
  if (!essence) {
    return true;
  }

  // Exact byte comparison. 'latin1' maps each byte to one code unit, whereas decoding
  // with 'ascii' would clear the high bit and let e.g. 0xCF pass for 'O'.
  const ascii = (offset: number, value: string): boolean => {
    if (buffer.length < offset + value.length) {
      return false;
    }
    return buffer.subarray(offset, offset + value.length).toString('latin1') === value;
  };

  switch (essence) {
    case 'audio/ogg':
      return ascii(0, 'OggS');
    case 'audio/wav':
      // RIFF container whose form type (bytes 8-11) is WAVE.
      return ascii(0, 'RIFF') && ascii(8, 'WAVE');
    case 'audio/webm':
      // EBML header magic.
      return buffer.length >= 4
        && buffer[0] === 0x1A
        && buffer[1] === 0x45
        && buffer[2] === 0xDF
        && buffer[3] === 0xA3;
    case 'audio/mpeg':
    case 'audio/mp3':
      // Either an ID3 tag or an MPEG frame sync (eleven set bits).
      return ascii(0, 'ID3')
        || (buffer.length >= 2 && buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0);
    case 'audio/mp4':
    case 'audio/x-m4a':
      // ISO BMFF: the 'ftyp' box type follows the 4-byte box size.
      return ascii(4, 'ftyp');
    default:
      return true;
  }
}
|
||||
|
||||
/**
 * Validates a model-supplied `url` argument for `audio.transcribe`.
 *
 * @param rawUrl Candidate `url` argument.
 * @returns The trimmed URL when it is a string starting with `http://` or `https://`
 *          (case-insensitive); otherwise `undefined`.
 */
private normalizeAudioTranscribeUrlArg(rawUrl: unknown): string | undefined {
  // Non-strings collapse to '', which can never match the scheme check below.
  const candidate = typeof rawUrl === 'string' ? rawUrl.trim() : '';
  return /^https?:\/\//i.test(candidate) ? candidate : undefined;
}
|
||||
|
||||
/**
 * Finds the newest usable audio part in any user message of the history, scanning
 * from the most recent message backwards. When the history holds no such audio,
 * the persisted session attachment (if any) is returned instead.
 */
private getLatestUserAudioInput(): AudioToolInput | null {
  let cursor = this.history.length;
  while (cursor-- > 0) {
    const candidate = this.history[cursor];
    const isUserPartsMessage = candidate.role === 'user' && Array.isArray(candidate.content);
    if (!isUserPartsMessage) {
      continue;
    }

    for (const part of candidate.content) {
      if (part.type === 'audio') {
        const audioSource = part.source;
        const hasPayload = typeof audioSource.data === 'string' && audioSource.data.length > 0;
        const hasMediaType = typeof audioSource.media_type === 'string' && audioSource.media_type.length > 0;
        if (hasPayload && hasMediaType) {
          return { data: audioSource.data, mime_type: audioSource.media_type };
        }
      }
    }
  }

  // Nothing in history: fall back to what the session remembered.
  return this.getPersistedAudioInput();
}
|
||||
|
||||
/**
 * Reads the audio attachment stored in the session config under
 * `LAST_AUDIO_ATTACHMENT_CONFIG_KEY` and converts it to tool-argument shape
 * (`mimeType` becomes `mime_type`).
 *
 * @returns The stored audio, or `null` when there is no session or stored value,
 *          the value cannot be parsed as JSON, or it has neither a non-empty
 *          `data` nor a non-empty `url`.
 */
private getPersistedAudioInput(): AudioToolInput | null {
  const raw = this.session?.getConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY);
  if (!raw) {
    return null;
  }

  const nonEmptyString = (value: unknown): string | undefined =>
    (typeof value === 'string' && value.length > 0 ? value : undefined);

  try {
    const stored = JSON.parse(raw) as { data?: unknown; url?: unknown; mimeType?: unknown };
    const data = nonEmptyString(stored.data);
    const url = nonEmptyString(stored.url);
    const mimeType = nonEmptyString(stored.mimeType);
    if (data === undefined && url === undefined) {
      return null;
    }

    // Only set the keys that are present so absent fields stay absent.
    const audio: AudioToolInput = {};
    if (data !== undefined) {
      audio.data = data;
    }
    if (url !== undefined) {
      audio.url = url;
    }
    if (mimeType !== undefined) {
      audio.mime_type = mimeType;
    }
    return audio;
  } catch {
    // Malformed or non-object JSON is treated the same as "nothing persisted".
    return null;
  }
}
|
||||
|
||||
private extractPseudoToolUse(content: string): PseudoToolUse | null {
|
||||
if (!content) {
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user