Harden audio transcription arg hydration and add rewrite audit event

This commit is contained in:
William Valentin
2026-02-22 18:56:22 -08:00
parent 7d0d8abec6
commit db4e52dd7e
10 changed files with 1183 additions and 16 deletions
+6
View File
@@ -8,6 +8,7 @@ import type {
ToolErrorEvent,
ToolDeniedEvent,
ToolApprovalEvent,
ToolArgsRewrittenEvent,
SkillsInstallerExecutionBlockedEvent,
SkillsInstallerCommandResultEvent,
SkillsScanEvent,
@@ -104,6 +105,11 @@ export class AuditLogger {
this.write({ level: 'debug', event_type: 'tool.approval', event: event as unknown as Record<string, unknown> });
}
/**
 * Records an audit entry for a tool call whose arguments were rewritten
 * before execution. Nothing is written unless `shouldLog('tools', 'debug')`
 * allows it.
 */
toolArgsRewritten(event: ToolArgsRewrittenEvent): void {
  if (this.shouldLog('tools', 'debug')) {
    const payload = event as unknown as Record<string, unknown>;
    this.write({ level: 'debug', event_type: 'tool.args_rewritten', event: payload });
  }
}
skillsInstallerExecutionBlocked(event: SkillsInstallerExecutionBlockedEvent): void {
if (!this.shouldLog('tools', 'warn')) {return;}
this.write({
+12 -1
View File
@@ -2,7 +2,7 @@ export type AuditLevel = 'debug' | 'info' | 'warn' | 'error';
export type AuditEventType =
// Tool execution
| 'tool.start' | 'tool.success' | 'tool.error' | 'tool.denied' | 'tool.approval'
| 'tool.start' | 'tool.success' | 'tool.error' | 'tool.denied' | 'tool.approval' | 'tool.args_rewritten'
// Security
| 'security.elevation.enabled' | 'security.elevation.disabled' | 'security.elevation.expired'
// Skills scan
@@ -110,6 +110,17 @@ export interface ToolApprovalEvent {
session_id?: string;
}
/**
 * Audit payload describing a tool call whose arguments were replaced before
 * execution. It carries only flags and MIME types, never the audio bytes or URL.
 */
export interface ToolArgsRewrittenEvent {
/** Name of the tool whose arguments were rewritten. */
tool_name: string;
/** Session the rewrite happened in, when one is available. */
session_id?: string;
/** Where the replacement input was taken from. */
source: 'latest_turn' | 'persisted' | 'history';
/** Why the model-supplied arguments were not used as-is. */
reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args';
/** Whether the original arguments had a non-empty `data` value. */
original_has_data: boolean;
/** Whether the original arguments had a non-empty `url` value. */
original_has_url: boolean;
/** MIME type the model originally supplied, if any. */
original_mime_type?: string;
/** MIME type present on the arguments after the rewrite, if any. */
final_mime_type?: string;
}
export interface SkillsInstallerExecutionBlockedEvent {
skill_name: string;
phase: 'install' | 'execute';
+452
View File
@@ -199,6 +199,458 @@ describe('NativeAgent tool loop', () => {
expect(mockClient.chat).toHaveBeenCalledTimes(2);
});
// Scenario: the model requests audio_transcribe with empty args while the
// current user turn carries an audio attachment. The tool is expected to
// receive that attachment's data and MIME type instead of the empty args.
it('hydrates missing audio.transcribe args from latest user audio attachment', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
toolRegistry: registry,
toolExecutor: executor,
});
// 'QUJDRA==' is base64 for "ABCD"; it stands in for real audio bytes.
const response = await agent.process('Please transcribe this', [{
mimeType: 'audio/ogg',
data: 'QUJDRA==',
filename: 'voice.ogg',
}]);
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'QUJDRA==',
mime_type: 'audio/ogg',
}));
});
// Scenario: no attachment on the current turn and empty model args, but the
// session config holds a 'lastAudioAttachment' entry. The tool is expected to
// receive the persisted data and MIME type.
it('hydrates missing audio.transcribe args from persisted session audio attachment', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Session stub: empty history, and a JSON-encoded audio attachment stored
// under the 'lastAudioAttachment' config key ('U0VTU0lPTg==' is "SESSION").
const mockSession = {
id: 'telegram:user-audio',
getHistory: vi.fn().mockReturnValue([]),
addMessage: vi.fn(),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' })
: undefined)),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
session: mockSession,
toolRegistry: registry,
toolExecutor: executor,
});
const response = await agent.process('Please transcribe this');
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'U0VTU0lPTg==',
mime_type: 'audio/ogg',
}));
});
// Scenario: the model supplies a file:// URL, which is not an http(s) URL.
// The tool is expected to receive the persisted session audio instead, and
// the bogus `url` must not be forwarded.
it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { url: 'file://voice_message', mime_type: 'audio/ogg' } }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Session stub: empty history, and a JSON-encoded audio attachment stored
// under the 'lastAudioAttachment' config key.
const mockSession = {
id: 'telegram:user-audio',
getHistory: vi.fn().mockReturnValue([]),
addMessage: vi.fn(),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' })
: undefined)),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
session: mockSession,
toolRegistry: registry,
toolExecutor: executor,
});
const response = await agent.process('Please transcribe this');
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'U0VTU0lPTg==',
mime_type: 'audio/ogg',
}));
// The rejected URL must be removed, not passed alongside the data.
expect(seenArgs).not.toHaveProperty('url');
});
// Scenario: the model supplies syntactically valid base64 that decodes to
// plain text ("This one two three") while claiming audio/wav. The tool is
// expected to receive the persisted session audio instead.
it('replaces text-like base64 audio.transcribe data with persisted session audio data', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: 'VGhpcyBvbmUgdHdvIHRocmVl', mime_type: 'audio/wav' } }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Session stub: empty history, and a JSON-encoded audio attachment stored
// under the 'lastAudioAttachment' config key.
const mockSession = {
id: 'telegram:user-audio',
getHistory: vi.fn().mockReturnValue([]),
addMessage: vi.fn(),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' })
: undefined)),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
session: mockSession,
toolRegistry: registry,
toolExecutor: executor,
});
const response = await agent.process('Please transcribe this');
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'U0VTU0lPTg==',
mime_type: 'audio/ogg',
}));
});
// Scenario: the session history's user message carries the '[Voice message]:'
// transcript marker, and the model supplies data that begins with a RIFF/WAVE
// header. The tool is still expected to receive the persisted session audio.
it('forces persisted audio on voice-transcript fallback turns', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{
id: 'call_1',
name: 'audio_transcribe',
args: {
data: 'UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAAAA',
mime_type: 'audio/wav',
},
}],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Session stub: history holds a transcribed voice turn, and the original
// audio is stored under the 'lastAudioAttachment' config key.
const mockSession = {
id: 'telegram:user-audio',
getHistory: vi.fn().mockReturnValue([
{ role: 'user', content: '[Voice message]: hello world\n\ncaption' },
]),
addMessage: vi.fn(),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn((key: string) => {
if (key === 'lastAudioAttachment') {
return JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' });
}
return undefined;
}),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
session: mockSession,
toolRegistry: registry,
toolExecutor: executor,
});
const response = await agent.process('Please transcribe this');
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'U0VTU0lPTg==',
mime_type: 'audio/ogg',
}));
});
// Scenario: the model passes a human-readable placeholder string as `data`
// while the current user turn carries an audio attachment. The tool is
// expected to receive the attachment's data instead of the placeholder.
it('replaces placeholder audio.transcribe data with latest attachment bytes', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: '[voice message data not provided]', mime_type: 'audio/ogg' } }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
toolRegistry: registry,
toolExecutor: executor,
});
const response = await agent.process('Please transcribe this', [{
mimeType: 'audio/ogg',
data: 'QUJDRA==',
filename: 'voice.ogg',
}]);
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'QUJDRA==',
mime_type: 'audio/ogg',
}));
});
// Scenario: the model supplies well-formed base64 ('d3Jvbmc=' is "wrong")
// while the current user turn carries an audio attachment. The attachment is
// expected to win over the model-provided data.
it('overrides model-provided base64 with latest turn audio attachment bytes', async () => {
let callCount = 0;
let seenArgs: Record<string, unknown> | undefined;
const mockClient: ModelClient = {
chat: vi.fn().mockImplementation(() => {
callCount++;
// First model call requests the tool; every later call ends the turn.
if (callCount === 1) {
return {
content: '',
stopReason: 'tool_use',
usage: { inputTokens: 10, outputTokens: 5 },
toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: 'd3Jvbmc=', mime_type: 'audio/ogg' } }],
};
}
return {
content: 'done',
stopReason: 'end_turn',
usage: { inputTokens: 15, outputTokens: 10 },
};
}),
};
// Stub tool that only captures the args it was executed with.
const audioTool: Tool = {
name: 'audio.transcribe',
description: 'Transcribe audio',
inputSchema: { type: 'object', properties: {} },
execute: async (args) => {
seenArgs = args as Record<string, unknown>;
return { success: true, output: 'transcript' };
},
};
const registry = new ToolRegistry();
registry.register(audioTool);
const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
const executor = new ToolExecutor(registry, hooks);
const agent = new NativeAgent({
modelClient: mockClient,
systemPrompt: 'You are helpful.',
toolRegistry: registry,
toolExecutor: executor,
});
const response = await agent.process('Please transcribe this', [{
mimeType: 'audio/ogg',
data: 'QUJDRA==',
filename: 'voice.ogg',
}]);
expect(response).toBe('done');
expect(seenArgs).toEqual(expect.objectContaining({
data: 'QUJDRA==',
mime_type: 'audio/ogg',
}));
});
it('respects max iterations when tool calls vary', async () => {
// Model always returns tool_use but with different args each time (no loop detection)
let callCount = 0;
+295 -1
View File
@@ -9,6 +9,7 @@ import type { Attachment } from '../../channels/types.js';
import type { OutboundAttachmentCollector } from './attachments.js';
import { buildUserMessage } from '../../models/media.js';
import { getElevationWindow } from '../../security/elevation.js';
import { auditLogger } from '../../audit/index.js';
export interface ToolUseEvent {
type: 'start' | 'end';
@@ -62,6 +63,20 @@ interface ExtractedTextToolCall {
end: number;
}
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';
/**
 * Audio input fields in the shape used by the `audio.transcribe` tool
 * arguments. Either `data` or `url` is expected to be set.
 */
interface AudioToolInput {
/** Inline audio payload; presumably base64-encoded (confirm against the tool). */
data?: string;
/** Location of the audio when it is not supplied inline. */
url?: string;
/** MIME type of the audio, e.g. 'audio/ogg'. */
mime_type?: string;
}
/**
 * Snapshot of which audio fields a set of tool arguments contained,
 * without retaining the values of `data` or `url`.
 */
interface AudioToolArgSummary {
/** True when `data` was a non-empty string. */
hasData: boolean;
/** True when `url` was a non-empty string. */
hasUrl: boolean;
/** The `mime_type` value when it was a non-empty string. */
mimeType?: string;
}
export class NativeAgent {
private static readonly EMPTY_RESPONSE_FALLBACK =
'I could not generate a response for that. Please try again.';
@@ -363,7 +378,8 @@ export class NativeAgent {
}
: undefined;
const result = await toolExecutor.execute(internalName, tc.args, perCallContext, {
const toolArgs = this.normalizeToolArgsForExecution(internalName, tc.args);
const result = await toolExecutor.execute(internalName, toolArgs, perCallContext, {
signal: this._runAbortController?.signal,
});
@@ -620,6 +636,284 @@ export class NativeAgent {
return error instanceof Error && error.name === 'AbortError';
}
/**
 * Gives a tool's arguments a chance to be rewritten before execution.
 * Only `audio.transcribe` is rewritten; every other tool gets its
 * arguments back untouched.
 */
private normalizeToolArgsForExecution(toolName: string, rawArgs: unknown): unknown {
  return toolName === 'audio.transcribe'
    ? this.hydrateAudioTranscribeArgs(rawArgs)
    : rawArgs;
}
/**
 * Builds the arguments actually passed to `audio.transcribe`.
 *
 * Resolution order:
 *  1. Audio attached to the latest user turn always replaces the model's args.
 *  2. On a voice-transcript fallback turn, persisted session audio replaces them.
 *  3. Otherwise model-supplied `data` (preferred) or `url` is kept if it validates.
 *     When `data` arrives without a `mime_type`, the MIME type of the most recent
 *     user audio is adopted only if the bytes match that format's signature.
 *  4. If nothing valid remains, the args are hydrated from the most recent user
 *     audio (history first, then persisted); with no audio at all, `data` and
 *     `url` are simply removed.
 *
 * Replacements made in steps 1, 2 and 4 are reported to the audit logger.
 *
 * @param rawArgs Arguments as emitted by the model. Anything that is not a
 *   plain object (including arrays) is treated as empty.
 * @returns A fresh args object; `rawArgs` is never mutated.
 */
private hydrateAudioTranscribeArgs(rawArgs: unknown): unknown {
  // Arrays are objects too, but spreading one would yield index keys ("0", "1", ...).
  const isPlainObject = rawArgs !== null && typeof rawArgs === 'object' && !Array.isArray(rawArgs);
  const args: Record<string, unknown> = isPlainObject
    ? { ...(rawArgs as Record<string, unknown>) }
    : {};
  const original = this.summarizeAudioToolArgs(args);

  // 1. Real bytes from the current turn beat whatever the model produced.
  const latestTurnAudio = this.getLatestTurnUserAudioInput();
  if (latestTurnAudio) {
    this.applyAudioToolInput(args, latestTurnAudio);
    this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
    return args;
  }

  // 2. The turn was delivered as a transcript; use the stored original audio.
  if (this.isCurrentTurnVoiceTranscriptFallback()) {
    const persistedAudio = this.getPersistedAudioInput();
    if (persistedAudio) {
      this.applyAudioToolInput(args, persistedAudio);
      this.logAudioArgsRewrite('voice_turn_fallback', 'persisted', original, args);
      return args;
    }
  }

  // 3. Validate what the model supplied.
  let validData = this.normalizeAudioTranscribeDataArg(args.data, args.mime_type);
  const validUrl = this.normalizeAudioTranscribeUrlArg(args.url);
  const hasMimeType = typeof args.mime_type === 'string' && args.mime_type.length > 0;
  if (validData && !hasMimeType) {
    // Without a MIME type the signature check above was skipped. Only adopt the
    // known audio's MIME type if the bytes really look like that format;
    // otherwise the data is treated as invalid and replaced below.
    const adoptedMimeType = this.getLatestUserAudioInput()?.mime_type;
    if (adoptedMimeType) {
      if (this.normalizeAudioTranscribeDataArg(validData, adoptedMimeType)) {
        args.mime_type = adoptedMimeType;
      } else {
        validData = undefined;
      }
    }
  }
  if (validData) {
    args.data = validData;
    delete args.url;
    return args;
  }
  if (validUrl) {
    args.url = validUrl;
    delete args.data;
    return args;
  }
  delete args.data;
  delete args.url;

  // 4. Nothing usable from the model: hydrate from the most recent user audio.
  const latestAudio = this.getLatestUserAudioInput();
  if (!latestAudio) {
    return args;
  }
  const persistedAudio = this.getPersistedAudioInput();
  const source: 'history' | 'persisted' = persistedAudio?.data === latestAudio.data
    && persistedAudio?.mime_type === latestAudio.mime_type
    ? 'persisted'
    : 'history';
  this.applyAudioToolInput(args, latestAudio);
  this.logAudioArgsRewrite(
    original.hasData || original.hasUrl ? 'invalid_model_args' : 'missing_model_args',
    source,
    original,
    args,
  );
  return args;
}
/**
 * Captures which audio-related fields are present on a set of tool args.
 * A field counts as present only when it is a non-empty string.
 */
private summarizeAudioToolArgs(args: Record<string, unknown>): AudioToolArgSummary {
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;
  const declaredMime = args.mime_type;
  return {
    hasData: isNonEmptyString(args.data),
    hasUrl: isNonEmptyString(args.url),
    mimeType: isNonEmptyString(declaredMime) ? declaredMime : undefined,
  };
}
/**
 * Overwrites the audio fields of `args` in place with those of `audio`.
 * `data` takes precedence over `url`, and the field that is not used is
 * removed. When `audio` has neither, both are removed. `mime_type` is
 * only overwritten when `audio` provides one.
 */
private applyAudioToolInput(args: Record<string, unknown>, audio: AudioToolInput): void {
  const { data, url, mime_type: mimeType } = audio;
  if (!data && !url) {
    delete args.data;
    delete args.url;
  } else if (data) {
    args.data = data;
    delete args.url;
  } else {
    args.url = url;
    delete args.data;
  }
  if (mimeType) {
    args.mime_type = mimeType;
  }
}
/**
 * Reports an `audio.transcribe` argument rewrite to the audit logger.
 *
 * @param reason Why the model's arguments were replaced.
 * @param source Where the replacement audio came from.
 * @param original Summary of the arguments before the rewrite.
 * @param normalizedArgs Arguments after the rewrite; only `mime_type` is read.
 */
private logAudioArgsRewrite(
  reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args',
  source: 'latest_turn' | 'history' | 'persisted',
  original: AudioToolArgSummary,
  normalizedArgs: Record<string, unknown>,
): void {
  const rewrittenMime = normalizedArgs.mime_type;
  const hasRewrittenMime = typeof rewrittenMime === 'string' && rewrittenMime.length > 0;
  auditLogger?.toolArgsRewritten({
    tool_name: 'audio.transcribe',
    session_id: this.session?.id,
    source,
    reason,
    original_has_data: original.hasData,
    original_has_url: original.hasUrl,
    original_mime_type: original.mimeType,
    final_mime_type: hasRewrittenMime ? rewrittenMime : undefined,
  });
}
/**
 * Reports whether the most recent user message in history contains the
 * '[Voice message]:' marker, either as plain string content or inside one
 * of its text parts. Only that single most recent user message is checked.
 */
private isCurrentTurnVoiceTranscriptFallback(): boolean {
  const marker = '[Voice message]:';

  // Walk back to the newest user message.
  let index = this.history.length - 1;
  while (index >= 0 && this.history[index].role !== 'user') {
    index -= 1;
  }
  if (index < 0) {
    return false;
  }

  const latestUserMessage = this.history[index];
  if (typeof latestUserMessage.content === 'string') {
    return latestUserMessage.content.includes(marker);
  }
  if (Array.isArray(latestUserMessage.content)) {
    return latestUserMessage.content.some((part) => (
      part.type === 'text'
      && typeof part.text === 'string'
      && part.text.includes(marker)
    ));
  }
  return false;
}
/**
 * Returns the first usable audio part of the most recent user message,
 * or null when that message has no such part. Older user messages are
 * never consulted.
 */
private getLatestTurnUserAudioInput(): AudioToolInput | null {
  // Walk back to the newest user message.
  let index = this.history.length - 1;
  while (index >= 0 && this.history[index].role !== 'user') {
    index -= 1;
  }
  if (index < 0) {
    return null;
  }

  const latestUserMessage = this.history[index];
  if (!Array.isArray(latestUserMessage.content)) {
    return null;
  }
  for (const part of latestUserMessage.content) {
    if (part.type === 'audio') {
      const audioSource = part.source;
      const hasBytes = typeof audioSource.data === 'string' && audioSource.data.length > 0;
      const hasMediaType = typeof audioSource.media_type === 'string' && audioSource.media_type.length > 0;
      if (hasBytes && hasMediaType) {
        return { data: audioSource.data, mime_type: audioSource.media_type };
      }
    }
  }
  return null;
}
/**
 * Validates model-supplied `data` for `audio.transcribe`.
 *
 * The value is accepted only when, after whitespace is stripped, it is
 * well-formed standard-alphabet base64 that decodes to at least one byte
 * and, if a MIME type is given, matches that format's signature.
 *
 * @param rawData Candidate base64 payload.
 * @param rawMimeType Declared MIME type; ignored unless it is a string.
 * @returns The whitespace-stripped base64 string, or undefined when rejected.
 */
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
  if (typeof rawData !== 'string') {
    return undefined;
  }
  const compact = rawData.replace(/\s+/g, '');
  // Padding may only appear at the very end, and at most twice. Node's decoder
  // is lenient about malformed input, so without this check a string with
  // trailing junk after '=' would be returned unchanged.
  if (!/^[A-Za-z0-9+/]+={0,2}$/.test(compact)) {
    return undefined;
  }
  // A base64 body whose length is 1 (mod 4) cannot encode a whole number of bytes.
  const bodyLength = compact.replace(/=+$/, '').length;
  if (bodyLength % 4 === 1) {
    return undefined;
  }
  try {
    const decoded = Buffer.from(compact, 'base64');
    if (decoded.length === 0) {
      return undefined;
    }
    const mimeType = typeof rawMimeType === 'string' ? rawMimeType : undefined;
    return this.matchesAudioSignature(decoded, mimeType) ? compact : undefined;
  } catch {
    // Best-effort validation: any decoding failure means "not usable".
    return undefined;
  }
}
/**
 * Checks whether `buffer` starts with the file signature expected for
 * `mimeType`.
 *
 * The MIME type is compared case-insensitively with parameters removed,
 * so 'audio/ogg; codecs=opus' is checked as 'audio/ogg'. A missing MIME
 * type, or one with no known signature, is accepted without inspection.
 *
 * @param buffer Decoded audio bytes.
 * @param mimeType Declared MIME type, if any.
 * @returns True when the bytes match, or when no check applies.
 */
private matchesAudioSignature(buffer: Buffer, mimeType?: string): boolean {
  // Compare as latin1 (one byte per char). 'ascii' decoding clears the high bit
  // of every byte, which would let e.g. 0xCF pass for 'O' (0x4F).
  const hasMagic = (offset: number, value: string): boolean => {
    if (buffer.length < offset + value.length) {
      return false;
    }
    return buffer.subarray(offset, offset + value.length).toString('latin1') === value;
  };

  // Keep only "type/subtype": drop parameters, surrounding spaces and case.
  const essence = mimeType?.split(';', 1)[0]?.trim().toLowerCase();
  if (!essence) {
    return true;
  }
  switch (essence) {
    case 'audio/ogg':
      return hasMagic(0, 'OggS');
    case 'audio/wav':
    case 'audio/x-wav':
    case 'audio/wave':
      return hasMagic(0, 'RIFF') && hasMagic(8, 'WAVE');
    case 'audio/webm':
      // EBML header magic.
      return buffer.length >= 4
        && buffer[0] === 0x1A
        && buffer[1] === 0x45
        && buffer[2] === 0xDF
        && buffer[3] === 0xA3;
    case 'audio/mpeg':
    case 'audio/mp3':
      // Either an ID3v2 tag or an MPEG frame sync (11 set bits).
      return hasMagic(0, 'ID3')
        || (buffer.length >= 2 && buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0);
    case 'audio/mp4':
    case 'audio/x-m4a':
      return hasMagic(4, 'ftyp');
    default:
      return true;
  }
}
/**
 * Validates a model-supplied `url` for `audio.transcribe`.
 * Only strings that start with http:// or https:// (case-insensitive,
 * after trimming) are accepted.
 *
 * @returns The trimmed URL, or undefined when it is rejected.
 */
private normalizeAudioTranscribeUrlArg(rawUrl: unknown): string | undefined {
  const candidate = typeof rawUrl === 'string' ? rawUrl.trim() : '';
  // An empty candidate can never match the scheme pattern.
  return /^https?:\/\//i.test(candidate) ? candidate : undefined;
}
/**
 * Finds the most recent usable user audio. History is searched from newest
 * to oldest across all user messages with array content; if none has a
 * usable audio part, the audio persisted on the session is returned.
 */
private getLatestUserAudioInput(): AudioToolInput | null {
  let cursor = this.history.length;
  while (cursor-- > 0) {
    const entry = this.history[cursor];
    if (entry.role === 'user' && Array.isArray(entry.content)) {
      for (const part of entry.content) {
        if (part.type === 'audio') {
          const audioSource = part.source;
          const hasBytes = typeof audioSource.data === 'string' && audioSource.data.length > 0;
          const hasMediaType = typeof audioSource.media_type === 'string' && audioSource.media_type.length > 0;
          if (hasBytes && hasMediaType) {
            return { data: audioSource.data, mime_type: audioSource.media_type };
          }
        }
      }
    }
  }
  return this.getPersistedAudioInput();
}
/**
 * Reads the audio attachment stored in the session config under
 * LAST_AUDIO_ATTACHMENT_CONFIG_KEY.
 *
 * @returns The stored audio, or null when there is no session, no stored
 *   value, the value is not valid JSON, or it has neither `data` nor `url`.
 */
private getPersistedAudioInput(): AudioToolInput | null {
  const serialized = this.session?.getConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY);
  if (!serialized) {
    return null;
  }
  const nonEmptyString = (value: unknown): string | undefined => (
    typeof value === 'string' && value.length > 0 ? value : undefined
  );
  try {
    const parsed = JSON.parse(serialized) as { data?: unknown; url?: unknown; mimeType?: unknown };
    const data = nonEmptyString(parsed.data);
    const url = nonEmptyString(parsed.url);
    const mimeType = nonEmptyString(parsed.mimeType);
    if (!data && !url) {
      return null;
    }
    // The stored record uses camelCase `mimeType`; tool args use `mime_type`.
    const restored: AudioToolInput = {};
    if (data) {
      restored.data = data;
    }
    if (url) {
      restored.url = url;
    }
    if (mimeType) {
      restored.mime_type = mimeType;
    }
    return restored;
  } catch {
    return null;
  }
}
private extractPseudoToolUse(content: string): PseudoToolUse | null {
if (!content) {
return null;
+89 -1
View File
@@ -1351,7 +1351,7 @@ describe('daemon audio routing integration', () => {
expect(String(msg.text)).toContain('audio transcription is not configured');
});
it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => {
it('transcribes voice attachments when transcription is configured and preserves audio for anthropic tool fallback', async () => {
const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');
// Mock transcription endpoint call.
@@ -1422,6 +1422,90 @@ describe('daemon audio routing integration', () => {
timestamp: Date.now(),
} as MessageRouterInput, reply);
expect(fetchSpy).toHaveBeenCalled();
expect(processSpy).toHaveBeenCalledTimes(1);
const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
expect(String(calledText)).toContain('[Voice message]: hello world');
expect(String(calledText)).toContain('caption');
const atts = calledAttachments as Array<{ mimeType: string }> | undefined;
expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(true);
expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
expect(session.setConfig).toHaveBeenCalledWith(
'lastAudioAttachment',
expect.stringContaining('"mimeType":"audio/ogg"'),
);
});
it('transcribes voice attachments when transcription is configured and strips audio for openai-compatible providers', async () => {
const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok');
const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
ok: true,
status: 200,
statusText: 'OK',
json: async () => ({ text: 'hello world' }),
} as Response);
const session = {
id: 'telegram:user-voice-3',
addMessage: vi.fn(),
getHistory: vi.fn(() => []),
clear: vi.fn(),
replaceHistory: vi.fn(),
getConfig: vi.fn(() => undefined),
setConfig: vi.fn(),
deleteConfig: vi.fn(),
};
const commandRegistry = new CommandRegistry();
registerBuiltinCommands(commandRegistry);
const router = createMessageRouter({
sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'],
modelRouter: {
getAvailableTiers: () => ['default'],
getAllLabels: () => ({ default: 'default' }),
getLabel: (tier: string) => tier,
} as unknown as MessageRouterDeps['modelRouter'],
systemPrompt: 'test prompt',
toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'],
toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'],
config: {
agents: {
primary_tier: 'default',
delegation: {
compaction: 'default',
memory_extraction: 'default',
classification: 'default',
tool_summarisation: 'default',
complex_reasoning: 'default',
},
max_delegation_depth: 1,
max_iterations: 3,
},
compaction: { enabled: false },
models: { default: { provider: 'openai', model: 'gpt-4.1', supports_audio: false } },
audio: {
enabled: true,
provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' },
},
} as unknown as MessageRouterDeps['config'],
commandRegistry,
});
const reply = vi.fn(async (_message: OutboundMessage) => {});
await router.handler({
id: 'v3',
channel: 'telegram',
senderId: 'user-voice-3',
text: 'caption',
attachments: [
{ mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' },
{ mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' },
],
timestamp: Date.now(),
} as MessageRouterInput, reply);
expect(fetchSpy).toHaveBeenCalled();
expect(processSpy).toHaveBeenCalledTimes(1);
const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? [];
@@ -1430,6 +1514,10 @@ describe('daemon audio routing integration', () => {
const atts = calledAttachments as Array<{ mimeType: string }> | undefined;
expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(false);
expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true);
expect(session.setConfig).toHaveBeenCalledWith(
'lastAudioAttachment',
expect.stringContaining('"mimeType":"audio/ogg"'),
);
});
});
+63 -3
View File
@@ -164,6 +164,57 @@ function shouldForceNativeForCapabilityQuery(text: string): boolean {
);
}
/**
 * Reports whether `provider` is one of the providers listed here as
 * accepting native audio content parts. Matching is exact and
 * case-sensitive; any other value yields false.
 */
function providerAcceptsNativeAudioContentParts(provider: string): boolean {
  switch (provider) {
    case 'openai':
    case 'github':
    case 'gemini':
    case 'openrouter':
    case 'zhipuai':
    case 'xai':
    case 'minimax':
    case 'moonshot':
    case 'vercel':
      return true;
    default:
      return false;
  }
}
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';
/**
 * Stores the last audio attachment that carries a payload in the session
 * config, as JSON under LAST_AUDIO_ATTACHMENT_CONFIG_KEY.
 *
 * Attachments with neither non-empty `data` nor non-empty `url` are ignored.
 * Inline `data` is preferred over `url`. A failing `setConfig` is logged as
 * a warning and not rethrown.
 *
 * @param session Target whose `setConfig` receives the serialized attachment.
 * @param audioAttachments Candidate attachments; the last usable one wins.
 */
function persistLatestAudioAttachment(
  session: { setConfig(key: string, value: string): void },
  audioAttachments: Attachment[],
): void {
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;

  const latest = audioAttachments
    .filter((att) => isNonEmptyString(att.data) || isNonEmptyString(att.url))
    .pop();
  if (!latest) {
    return;
  }

  // `mimeType` is written first so the serialized key order stays stable.
  const payload: { data?: string; url?: string; mimeType?: string } = {
    mimeType: latest.mimeType,
  };
  if (isNonEmptyString(latest.data)) {
    payload.data = latest.data;
  } else if (isNonEmptyString(latest.url)) {
    payload.url = latest.url;
  }

  try {
    session.setConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY, JSON.stringify(payload));
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    console.warn('Failed to persist latest audio attachment for tool hydration:', detail);
  }
}
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
if (!config.tts?.enabled) {
return false;
@@ -1266,6 +1317,9 @@ export function createMessageRouter(deps: {
let messageText = incomingText;
let attachments = msg.attachments;
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
if (audioAttachments.length > 0) {
persistLatestAudioAttachment(session, audioAttachments);
}
if (audioAttachments.length > 0 && !nativeAudioSupported) {
// Model doesn't support native audio — transcribe via Whisper and strip audio attachments
@@ -1300,9 +1354,15 @@ export function createMessageRouter(deps: {
const transcript = await transcribeAudio(att, audioConfig);
messageText = `[Voice message]: ${transcript}\n\n${messageText}`;
}
// Remove audio attachments so buildUserMessage doesn't create audio content parts
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
if (attachments.length === 0) { attachments = undefined; }
// For providers that cannot ingest native audio content parts (e.g. Anthropic),
// keep the original audio attachment available in the tool loop so
// audio.transcribe can still be hydrated from bytes if the model requests it.
// For providers that do accept native audio parts (OpenAI-compatible/Gemini),
// strip audio to avoid sending raw audio to a model tier that was marked as non-audio.
if (providerAcceptsNativeAudioContentParts(modelProvider)) {
attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a));
if (attachments.length === 0) { attachments = undefined; }
}
}
// If native audio IS supported, we pass attachments through unchanged —
// buildUserMessage() in the agent will create native audio content parts
+56 -6
View File
@@ -51,6 +51,12 @@ describe('createAudioTranscribeTool', () => {
expect(result.success).toBe(false);
expect(result.error).toMatch(/Unsupported MIME type/);
});
// A human-readable placeholder in `data` must be rejected by the tool itself,
// with an error message that mentions valid base64.
it('rejects invalid non-base64 data payloads', async () => {
const result = await tool.execute({ data: '[voice message data not provided]', mime_type: 'audio/ogg' });
expect(result.success).toBe(false);
expect(result.error).toMatch(/valid base64/i);
});
});
describe('URL validation (SSRF protection)', () => {
@@ -106,7 +112,7 @@ describe('createAudioTranscribeTool', () => {
});
mockFetch.mockResolvedValueOnce({
ok: true,
json: async () => ({ text: 'hello' }),
text: async () => JSON.stringify({ text: 'hello' }),
});
const result = await tool.execute({ url: 'https://example.com/audio.wav' });
@@ -136,7 +142,7 @@ describe('createAudioTranscribeTool', () => {
it('transcribes base64 audio data', async () => {
mockFetch.mockResolvedValueOnce({
ok: true,
json: async () => ({ text: 'Hello, world!' }),
text: async () => JSON.stringify({ text: 'Hello, world!' }),
});
const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
@@ -152,7 +158,7 @@ describe('createAudioTranscribeTool', () => {
it('sends Authorization header when apiKey is set', async () => {
mockFetch.mockResolvedValueOnce({
ok: true,
json: async () => ({ text: 'test' }),
text: async () => JSON.stringify({ text: 'test' }),
});
await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/ogg' });
@@ -164,7 +170,7 @@ describe('createAudioTranscribeTool', () => {
it('passes language and prompt parameters', async () => {
mockFetch.mockResolvedValueOnce({
ok: true,
json: async () => ({ text: 'Hola mundo' }),
text: async () => JSON.stringify({ text: 'Hola mundo' }),
});
const result = await tool.execute({
@@ -176,6 +182,28 @@ describe('createAudioTranscribeTool', () => {
expect(result.success).toBe(true);
expect(result.output).toBe('Hola mundo');
});
// The endpoint may answer with a bare transcript instead of JSON; the
// response body is then expected to be used as the tool output verbatim.
it('accepts plain-text transcription responses', async () => {
mockFetch.mockResolvedValueOnce({
ok: true,
text: async () => 'Plain transcript',
});
const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });
expect(result.success).toBe(true);
expect(result.output).toBe('Plain transcript');
});
// An empty `text` field is a successful transcription of silence: the tool
// reports success and substitutes a fixed placeholder for the empty transcript.
it('returns a no-speech placeholder for empty transcript text', async () => {
  const emptyTranscriptResponse = {
    ok: true,
    text: async () => JSON.stringify({ text: '' }),
  };
  mockFetch.mockResolvedValueOnce(emptyTranscriptResponse);

  const { success, output } = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });

  expect(success).toBe(true);
  expect(output).toBe('[No speech detected]');
});
});
describe('URL-based transcription', () => {
@@ -191,7 +219,7 @@ describe('createAudioTranscribeTool', () => {
// Second fetch: transcription API
mockFetch.mockResolvedValueOnce({
ok: true,
json: async () => ({ text: 'URL transcription result' }),
text: async () => JSON.stringify({ text: 'URL transcription result' }),
});
const result = await tool.execute({ url: 'https://cdn.example.com/audio.mp3' });
@@ -207,7 +235,7 @@ describe('createAudioTranscribeTool', () => {
});
mockFetch.mockResolvedValueOnce({
ok: true,
json: async () => ({ text: 'ogg result' }),
text: async () => JSON.stringify({ text: 'ogg result' }),
});
const result = await tool.execute({ url: 'https://cdn.example.com/voice' });
@@ -287,5 +315,27 @@ describe('createAudioTranscribeTool', () => {
expect(result.success).toBe(false);
expect(result.error).toMatch(/ECONNREFUSED/);
});
// A 2xx JSON body that carries neither a transcript nor an error description
// must fail with a message saying the text field is missing.
it('returns clear error when transcription payload has no text field', async () => {
  const bodyWithoutText = JSON.stringify({ id: 'abc123', status: 'ok' });
  mockFetch.mockResolvedValueOnce({
    ok: true,
    text: async () => bodyWithoutText,
  });

  const { success, error } = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });

  expect(success).toBe(false);
  expect(error).toMatch(/missing text field/i);
});
// Some endpoints report failures inside a 2xx body; the nested error message
// must be surfaced in the tool's error string.
it('surfaces endpoint error payloads', async () => {
  const errorBody = JSON.stringify({ error: { message: 'model not loaded' } });
  mockFetch.mockResolvedValueOnce({
    ok: true,
    text: async () => errorBody,
  });

  const { success, error } = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' });

  expect(success).toBe(false);
  expect(error).toMatch(/endpoint error: model not loaded/i);
});
});
});
+184 -4
View File
@@ -51,6 +51,22 @@ function validateUrl(url: string): { valid: boolean; error?: string } {
}
function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } {
if (args.data !== undefined && typeof args.data !== 'string') {
return { valid: false, error: 'data must be a base64 string when provided' };
}
if (args.url !== undefined && typeof args.url !== 'string') {
return { valid: false, error: 'url must be a string when provided' };
}
if (args.mime_type !== undefined && typeof args.mime_type !== 'string') {
return { valid: false, error: 'mime_type must be a string when provided' };
}
if (args.language !== undefined && typeof args.language !== 'string') {
return { valid: false, error: 'language must be a string when provided' };
}
if (args.prompt !== undefined && typeof args.prompt !== 'string') {
return { valid: false, error: 'prompt must be a string when provided' };
}
const hasData = args.data !== undefined && args.data !== '';
const hasUrl = args.url !== undefined && args.url !== '';
@@ -62,6 +78,22 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str
return { valid: false, error: 'Only one of data or url can be provided' };
}
if (hasData) {
const compact = (args.data ?? '').replace(/\s+/g, '');
const isBase64 = /^[A-Za-z0-9+/=]+$/.test(compact);
let hasDecodedBytes = false;
if (isBase64) {
try {
hasDecodedBytes = Buffer.from(compact, 'base64').length > 0;
} catch {
hasDecodedBytes = false;
}
}
if (!isBase64 || !hasDecodedBytes) {
return { valid: false, error: 'data must be valid base64-encoded audio bytes' };
}
}
if (hasData && !args.mime_type) {
return { valid: false, error: 'mime_type is required when using data' };
}
@@ -84,6 +116,131 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str
return { valid: true };
}
/**
 * Extracts the transcript string from a transcription endpoint payload.
 *
 * Shapes are probed in this order and the first string found wins:
 *  1. the payload itself, when it is a string (plain-text response);
 *  2. top-level `text`, `transcript`, `transcription`, `output`;
 *  3. `result.text` / `result.transcript`;
 *  4. `data.text` / `data.transcript`;
 *  5. `results[]`: each entry's `text`, then its `alternatives[].transcript` / `.text`;
 *  6. `segments[].text`, with non-blank segments joined by a single space.
 *
 * An empty string is a valid result for shapes 1-5, so callers can tell
 * "no speech" apart from "no transcript field".
 *
 * @param payload - Parsed JSON value or raw response body.
 * @returns The transcript, or `undefined` when no known shape matches.
 */
function extractTranscriptionText(payload: unknown): string | undefined {
  if (typeof payload === 'string') {
    return payload;
  }
  if (!payload || typeof payload !== 'object') {
    return undefined;
  }

  // Returns the first property among `keys` whose value is actually a string.
  // Unlike `a ?? b`, a present-but-non-string value (number, object, ...) does
  // not shadow a valid string stored under a later key.
  const firstString = (source: Record<string, unknown>, keys: readonly string[]): string | undefined => {
    for (const key of keys) {
      const value = source[key];
      if (typeof value === 'string') {
        return value;
      }
    }
    return undefined;
  };

  const obj = payload as Record<string, unknown>;

  const direct = firstString(obj, ['text', 'transcript', 'transcription', 'output']);
  if (direct !== undefined) {
    return direct;
  }

  // Single-object wrappers: { result: {...} } and { data: {...} }.
  for (const wrapperKey of ['result', 'data']) {
    const wrapper = obj[wrapperKey];
    if (wrapper && typeof wrapper === 'object') {
      const nested = firstString(wrapper as Record<string, unknown>, ['text', 'transcript']);
      if (nested !== undefined) {
        return nested;
      }
    }
  }

  // List of results, each optionally carrying ranked alternatives.
  if (Array.isArray(obj.results)) {
    for (const result of obj.results) {
      if (!result || typeof result !== 'object') {
        continue;
      }
      const resultObj = result as Record<string, unknown>;
      if (typeof resultObj.text === 'string') {
        return resultObj.text;
      }
      if (Array.isArray(resultObj.alternatives)) {
        for (const alternative of resultObj.alternatives) {
          if (!alternative || typeof alternative !== 'object') {
            continue;
          }
          const altTranscript = firstString(alternative as Record<string, unknown>, ['transcript', 'text']);
          if (altTranscript !== undefined) {
            return altTranscript;
          }
        }
      }
    }
  }

  // Segment list without a top-level text: stitch the non-blank segments together.
  if (Array.isArray(obj.segments)) {
    const joined = obj.segments
      .map((segment) => (segment && typeof segment === 'object'
        ? (segment as Record<string, unknown>).text
        : undefined))
      .filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
      .join(' ');
    if (joined.trim().length > 0) {
      return joined;
    }
  }

  return undefined;
}
/**
 * Extracts a human-readable error message from a transcription endpoint payload.
 *
 * Probed in order, returning the first non-blank string:
 *  1. `error` when it is a string;
 *  2. `error.message`, then `error.error`, when `error` is an object;
 *  3. top-level `detail`;
 *  4. top-level `message`.
 *
 * The matched string is returned as-is (not trimmed).
 *
 * @param payload - Parsed JSON value or raw response body.
 * @returns The error message, or `undefined` when the payload is not an object
 *          or carries no recognizable error text.
 */
function extractTranscriptionError(payload: unknown): string | undefined {
  if (!payload || typeof payload !== 'object') {
    return undefined;
  }

  const isNonBlankString = (value: unknown): value is string =>
    typeof value === 'string' && value.trim().length > 0;

  const obj = payload as Record<string, unknown>;

  if (isNonBlankString(obj.error)) {
    return obj.error;
  }

  if (obj.error && typeof obj.error === 'object') {
    const errorObj = obj.error as Record<string, unknown>;
    // Check each candidate on its own: a blank or non-string `message` must not
    // mask a usable `error` string (which `message ?? error` would do).
    for (const candidate of [errorObj.message, errorObj.error]) {
      if (isNonBlankString(candidate)) {
        return candidate;
      }
    }
  }

  if (isNonBlankString(obj.detail)) {
    return obj.detail;
  }
  if (isNonBlankString(obj.message)) {
    return obj.message;
  }

  return undefined;
}
/**
 * Produces a compact, single-line excerpt of `text` for use in error messages.
 *
 * Every whitespace run is collapsed to one space and the ends are trimmed;
 * if the result is longer than `max` characters it is cut to `max` and
 * suffixed with `...`.
 *
 * @param text - Raw text (for example a response body).
 * @param max - Maximum number of characters kept before the ellipsis.
 * @returns The normalized, possibly truncated text.
 */
function truncateForError(text: string, max = 180): string {
  const collapsed = text.replace(/\s+/g, ' ').trim();
  return collapsed.length > max
    ? collapsed.slice(0, max) + '...'
    : collapsed;
}
/**
 * Reads a response body as text, tolerating response-like objects that do not
 * implement the full `Response` interface.
 *
 * Uses `response.text()` when it exists. Otherwise, if only `json()` is
 * available, the parsed payload is re-serialized with `JSON.stringify`.
 * When neither reader exists an empty string is returned.
 *
 * @param response - The fetch response (or a partial stand-in for one).
 * @returns The body text; always a string, never `undefined`.
 */
async function readResponseBody(response: Response): Promise<string> {
  // Checked at runtime because partial response objects may lack `text`.
  const textReader = response.text as unknown;
  if (typeof textReader === 'function') {
    return await response.text();
  }

  const maybeJsonResponse = response as unknown as { json?: () => Promise<unknown> };
  if (typeof maybeJsonResponse.json === 'function') {
    const jsonPayload = await maybeJsonResponse.json();
    // JSON.stringify yields undefined (not a string) for an undefined payload,
    // despite its declared return type; fall back to '' so callers can safely
    // call string methods on the result.
    const serialized: string | undefined = JSON.stringify(jsonPayload);
    return serialized ?? '';
  }

  return '';
}
interface AudioTranscriptionConfig {
endpoint?: string;
apiKey?: string;
@@ -146,7 +303,9 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
if (args.data) {
const rawBuffer = Buffer.from(args.data, 'base64');
const audioBuffer = rawBuffer.buffer;
if (rawBuffer.length === 0) {
throw new Error('Decoded audio data is empty');
}
const extMap: Record<string, string> = {
'audio/ogg': 'ogg',
@@ -161,7 +320,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
filename = `audio.${ext}`;
const mimeType = args.mime_type ?? 'audio/wav';
audioBlob = new Blob([audioBuffer], { type: mimeType });
audioBlob = new Blob([rawBuffer], { type: mimeType });
} else if (args.url) {
const response = await fetch(args.url);
if (!response.ok) {
@@ -204,6 +363,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
const formData = new FormData();
formData.append('file', audioBlob, filename);
formData.append('model', model);
formData.append('response_format', 'json');
if (args.language) {
formData.append('language', args.language);
@@ -234,10 +394,30 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig
throw new Error(`Transcription request failed (${response.status}): ${errorText}`);
}
const json = await response.json() as { text: string };
const rawBody = await readResponseBody(response);
const trimmedBody = rawBody.trim();
let payload: unknown = rawBody;
if (trimmedBody.startsWith('{') || trimmedBody.startsWith('[')) {
try {
payload = JSON.parse(rawBody) as unknown;
} catch {
payload = rawBody;
}
}
const transcript = extractTranscriptionText(payload);
if (transcript === undefined) {
const endpointError = extractTranscriptionError(payload);
if (endpointError) {
throw new Error(`Transcription endpoint error: ${endpointError}`);
}
throw new Error(`Transcription response missing text field (body: ${truncateForError(rawBody)})`);
}
const normalizedTranscript = transcript.trim().length > 0 ? transcript : '[No speech detected]';
return {
success: true,
output: json.text,
output: normalizedTranscript,
};
} catch (error) {
return {
+18
View File
@@ -35,6 +35,13 @@ const bigOutputTool: Tool = {
execute: async () => ({ success: true, output: 'x'.repeat(100_000) }),
};
/**
 * Test fixture: a tool that reports success but whose `output` is `undefined`
 * at runtime. The cast hides this from the type checker so the fixture can
 * stand in for a tool that violates the string-output contract.
 */
const malformedOutputTool: Tool = {
  name: 'test.malformed_output',
  description: 'Returns non-string output at runtime',
  inputSchema: { type: 'object', properties: {} },
  execute: async () => {
    return { success: true, output: undefined as unknown as string };
  },
};
const fileWriteLikeTool: Tool = {
name: 'file.write',
description: 'Test file write tool',
@@ -191,6 +198,17 @@ describe('ToolExecutor', () => {
expect(result.output).toContain('[truncated]');
});
// A tool that returns `undefined` output must not make the executor throw;
// the result comes back successful with an empty-string output.
it('normalizes non-string output without throwing', async () => {
  const toolRegistry = new ToolRegistry();
  toolRegistry.register(malformedOutputTool);
  const hookEngine = new HookEngine({ confirm: [], log: [], silent: [] });
  const toolExecutor = new ToolExecutor(toolRegistry, hookEngine);

  const { success, output } = await toolExecutor.execute('test.malformed_output', {});

  expect(success).toBe(true);
  expect(output).toBe('');
});
it('clears timeout timer after fast tool completion', async () => {
vi.useFakeTimers();
try {
+8
View File
@@ -342,6 +342,14 @@ export class ToolExecutor {
const duration = Date.now() - startTime;
// Defensive normalization: tool implementations should return string output,
// but third-party/custom tools can violate this at runtime.
if (typeof result.output !== 'string') {
result.output = result.output === undefined || result.output === null
? ''
: String(result.output);
}
// Truncate output if too large
if (result.output.length > this.maxOutputBytes) {
result.output = result.output.slice(0, this.maxOutputBytes) + '\n[truncated]';