From db4e52dd7e2134dd83d5a95d8bffcaf5d8cd92fe Mon Sep 17 00:00:00 2001 From: William Valentin Date: Sun, 22 Feb 2026 18:56:22 -0800 Subject: [PATCH] Harden audio transcription arg hydration and add rewrite audit event --- src/audit/logger.ts | 6 + src/audit/types.ts | 13 +- src/backends/native/agent.test.ts | 452 +++++++++++++++++++++ src/backends/native/agent.ts | 296 +++++++++++++- src/daemon/routing.test.ts | 90 +++- src/daemon/routing.ts | 66 ++- src/tools/builtin/audio-transcribe.test.ts | 62 ++- src/tools/builtin/audio-transcribe.ts | 188 ++++++++- src/tools/executor.test.ts | 18 + src/tools/executor.ts | 8 + 10 files changed, 1183 insertions(+), 16 deletions(-) diff --git a/src/audit/logger.ts b/src/audit/logger.ts index 8ecee59..e0fd065 100644 --- a/src/audit/logger.ts +++ b/src/audit/logger.ts @@ -8,6 +8,7 @@ import type { ToolErrorEvent, ToolDeniedEvent, ToolApprovalEvent, + ToolArgsRewrittenEvent, SkillsInstallerExecutionBlockedEvent, SkillsInstallerCommandResultEvent, SkillsScanEvent, @@ -104,6 +105,11 @@ export class AuditLogger { this.write({ level: 'debug', event_type: 'tool.approval', event: event as unknown as Record }); } + toolArgsRewritten(event: ToolArgsRewrittenEvent): void { + if (!this.shouldLog('tools', 'debug')) {return;} + this.write({ level: 'debug', event_type: 'tool.args_rewritten', event: event as unknown as Record }); + } + skillsInstallerExecutionBlocked(event: SkillsInstallerExecutionBlockedEvent): void { if (!this.shouldLog('tools', 'warn')) {return;} this.write({ diff --git a/src/audit/types.ts b/src/audit/types.ts index 21d2e19..d06df43 100644 --- a/src/audit/types.ts +++ b/src/audit/types.ts @@ -2,7 +2,7 @@ export type AuditLevel = 'debug' | 'info' | 'warn' | 'error'; export type AuditEventType = // Tool execution - | 'tool.start' | 'tool.success' | 'tool.error' | 'tool.denied' | 'tool.approval' + | 'tool.start' | 'tool.success' | 'tool.error' | 'tool.denied' | 'tool.approval' | 'tool.args_rewritten' // Security | 'security.elevation.enabled' | 'security.elevation.disabled' | 'security.elevation.expired' // Skills scan @@ -110,6 +110,17 @@ export interface ToolApprovalEvent { session_id?: string; } +export interface ToolArgsRewrittenEvent { + tool_name: string; + session_id?: string; + source: 'latest_turn' | 'persisted' | 'history'; + reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args'; + original_has_data: boolean; + original_has_url: boolean; + original_mime_type?: string; + final_mime_type?: string; +} + export interface SkillsInstallerExecutionBlockedEvent { skill_name: string; phase: 'install' | 'execute'; diff --git a/src/backends/native/agent.test.ts b/src/backends/native/agent.test.ts index 64ed4c3..65aa8c7 100644 --- a/src/backends/native/agent.test.ts +++ b/src/backends/native/agent.test.ts @@ -199,6 +199,458 @@ describe('NativeAgent tool loop', () => { expect(mockClient.chat).toHaveBeenCalledTimes(2); }); + it('hydrates missing audio.transcribe args from latest user audio attachment', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this', [{ + mimeType: 'audio/ogg', + data: 'QUJDRA==', + filename: 'voice.ogg', + }]); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'QUJDRA==', + mime_type: 'audio/ogg', + })); + }); + + it('hydrates missing audio.transcribe args from persisted session audio attachment', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const mockSession = { + id: 'telegram:user-audio', + getHistory: vi.fn().mockReturnValue([]), + addMessage: vi.fn(), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment' + ? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' }) + : undefined)), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + session: mockSession, + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this'); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'U0VTU0lPTg==', + mime_type: 'audio/ogg', + })); + }); + + it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { url: 'file://voice_message', mime_type: 'audio/ogg' } }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const mockSession = { + id: 'telegram:user-audio', + getHistory: vi.fn().mockReturnValue([]), + addMessage: vi.fn(), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment' + ? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' }) + : undefined)), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + session: mockSession, + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this'); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'U0VTU0lPTg==', + mime_type: 'audio/ogg', + })); + expect(seenArgs).not.toHaveProperty('url'); + }); + + it('replaces text-like base64 audio.transcribe data with persisted session audio data', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: 'VGhpcyBvbmUgdHdvIHRocmVl', mime_type: 'audio/wav' } }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const mockSession = { + id: 'telegram:user-audio', + getHistory: vi.fn().mockReturnValue([]), + addMessage: vi.fn(), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment' + ? JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' }) + : undefined)), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + session: mockSession, + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this'); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'U0VTU0lPTg==', + mime_type: 'audio/ogg', + })); + }); + + it('forces persisted audio on voice-transcript fallback turns', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ + id: 'call_1', + name: 'audio_transcribe', + args: { + data: 'UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAAAA', + mime_type: 'audio/wav', + }, + }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const mockSession = { + id: 'telegram:user-audio', + getHistory: vi.fn().mockReturnValue([ + { role: 'user', content: '[Voice message]: hello world\n\ncaption' }, + ]), + addMessage: vi.fn(), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn((key: string) => { + if (key === 'lastAudioAttachment') { + return JSON.stringify({ data: 'U0VTU0lPTg==', mimeType: 'audio/ogg' }); + } + return undefined; + }), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + session: mockSession, + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this'); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'U0VTU0lPTg==', + mime_type: 'audio/ogg', + })); + }); + + it('replaces placeholder audio.transcribe data with latest attachment bytes', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: '[voice message data not provided]', mime_type: 'audio/ogg' } }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this', [{ + mimeType: 'audio/ogg', + data: 'QUJDRA==', + filename: 'voice.ogg', + }]); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'QUJDRA==', + mime_type: 'audio/ogg', + })); + }); + + it('overrides model-provided base64 with latest turn audio attachment bytes', async () => { + let callCount = 0; + let seenArgs: Record | undefined; + const mockClient: ModelClient = { + chat: vi.fn().mockImplementation(() => { + callCount++; + if (callCount === 1) { + return { + content: '', + stopReason: 'tool_use', + usage: { inputTokens: 10, outputTokens: 5 }, + toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: { data: 'd3Jvbmc=', mime_type: 'audio/ogg' } }], + }; + } + return { + content: 'done', + stopReason: 'end_turn', + usage: { inputTokens: 15, outputTokens: 10 }, + }; + }), + }; + + const audioTool: Tool = { + name: 'audio.transcribe', + description: 'Transcribe audio', + inputSchema: { type: 'object', properties: {} }, + execute: async (args) => { + seenArgs = args as Record; + return { success: true, output: 'transcript' }; + }, + }; + + const registry = new ToolRegistry(); + registry.register(audioTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const agent = new NativeAgent({ + modelClient: mockClient, + systemPrompt: 'You are helpful.', + toolRegistry: registry, + toolExecutor: executor, + }); + + const response = await agent.process('Please transcribe this', [{ + mimeType: 'audio/ogg', + data: 'QUJDRA==', + filename: 'voice.ogg', + }]); + + expect(response).toBe('done'); + expect(seenArgs).toEqual(expect.objectContaining({ + data: 'QUJDRA==', + mime_type: 'audio/ogg', + })); + }); + it('respects max iterations when tool calls vary', async () => { // Model always returns tool_use but with different args each time (no loop detection) let callCount = 0; diff --git a/src/backends/native/agent.ts b/src/backends/native/agent.ts index 4d258a1..e78ac13 100644 --- a/src/backends/native/agent.ts +++ b/src/backends/native/agent.ts @@ -9,6 +9,7 @@ import type { Attachment } from '../../channels/types.js'; import type { OutboundAttachmentCollector } from './attachments.js'; import { buildUserMessage } from '../../models/media.js'; import { getElevationWindow } from '../../security/elevation.js'; +import { auditLogger } from '../../audit/index.js'; export interface ToolUseEvent { type: 'start' | 'end'; @@ -62,6 +63,20 @@ interface ExtractedTextToolCall { end: number; } +const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment'; + +interface AudioToolInput { + data?: string; + url?: string; + mime_type?: string; +} + +interface AudioToolArgSummary { + hasData: boolean; + hasUrl: boolean; + mimeType?: string; +} + export class NativeAgent { private static readonly EMPTY_RESPONSE_FALLBACK = 'I could not generate a response for that. Please try again.'; @@ -363,7 +378,8 @@ export class NativeAgent { } : undefined; - const result = await toolExecutor.execute(internalName, tc.args, perCallContext, { + const toolArgs = this.normalizeToolArgsForExecution(internalName, tc.args); + const result = await toolExecutor.execute(internalName, toolArgs, perCallContext, { signal: this._runAbortController?.signal, }); @@ -620,6 +636,284 @@ export class NativeAgent { return error instanceof Error && error.name === 'AbortError'; } + private normalizeToolArgsForExecution(toolName: string, rawArgs: unknown): unknown { + if (toolName !== 'audio.transcribe') { + return rawArgs; + } + return this.hydrateAudioTranscribeArgs(rawArgs); + } + + private hydrateAudioTranscribeArgs(rawArgs: unknown): unknown { + const args = (rawArgs && typeof rawArgs === 'object') + ? { ...(rawArgs as Record) } + : {}; + const original = this.summarizeAudioToolArgs(args); + + const latestTurnAudio = this.getLatestTurnUserAudioInput(); + if (latestTurnAudio) { + this.applyAudioToolInput(args, latestTurnAudio); + this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args); + return args; + } + + if (this.isCurrentTurnVoiceTranscriptFallback()) { + const persistedAudio = this.getPersistedAudioInput(); + if (persistedAudio) { + this.applyAudioToolInput(args, persistedAudio); + this.logAudioArgsRewrite('voice_turn_fallback', 'persisted', original, args); + return args; + } + } + + const normalizedData = this.normalizeAudioTranscribeDataArg(args.data, args.mime_type); + const normalizedUrl = this.normalizeAudioTranscribeUrlArg(args.url); + if (normalizedData) { + args.data = normalizedData; + delete args.url; + } else if (normalizedUrl) { + args.url = normalizedUrl; + delete args.data; + } else { + delete args.data; + delete args.url; + } + + const hasData = typeof args.data === 'string' && args.data.length > 0; + const hasUrl = typeof args.url === 'string' && args.url.length > 0; + if (hasData || hasUrl) { + if (hasData && (typeof args.mime_type !== 'string' || args.mime_type.length === 0)) { + const latestAudioForMime = this.getLatestUserAudioInput(); + if (latestAudioForMime?.mime_type) { + args.mime_type = latestAudioForMime.mime_type; + } + } + return args; + } + + const latestAudio = this.getLatestUserAudioInput(); + if (!latestAudio) { + return args; + } + const persistedAudio = this.getPersistedAudioInput(); + const source: 'history' | 'persisted' = persistedAudio?.data === latestAudio.data + && persistedAudio?.mime_type === latestAudio.mime_type + ? 'persisted' + : 'history'; + this.applyAudioToolInput(args, latestAudio); + this.logAudioArgsRewrite(original.hasData || original.hasUrl ? 'invalid_model_args' : 'missing_model_args', source, original, args); + return args; + } + + private summarizeAudioToolArgs(args: Record): AudioToolArgSummary { + const hasData = typeof args.data === 'string' && args.data.length > 0; + const hasUrl = typeof args.url === 'string' && args.url.length > 0; + const mimeType = typeof args.mime_type === 'string' && args.mime_type.length > 0 + ? args.mime_type + : undefined; + return { hasData, hasUrl, mimeType }; + } + + private applyAudioToolInput(args: Record, audio: AudioToolInput): void { + if (audio.data) { + args.data = audio.data; + delete args.url; + } else if (audio.url) { + args.url = audio.url; + delete args.data; + } else { + delete args.data; + delete args.url; + } + if (audio.mime_type) { + args.mime_type = audio.mime_type; + } + } + + private logAudioArgsRewrite( + reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args', + source: 'latest_turn' | 'history' | 'persisted', + original: AudioToolArgSummary, + normalizedArgs: Record, + ): void { + const finalMime = typeof normalizedArgs.mime_type === 'string' && normalizedArgs.mime_type.length > 0 + ? normalizedArgs.mime_type + : undefined; + auditLogger?.toolArgsRewritten({ + tool_name: 'audio.transcribe', + session_id: this.session?.id, + source, + reason, + original_has_data: original.hasData, + original_has_url: original.hasUrl, + original_mime_type: original.mimeType, + final_mime_type: finalMime, + }); + } + + private isCurrentTurnVoiceTranscriptFallback(): boolean { + for (let i = this.history.length - 1; i >= 0; i--) { + const msg = this.history[i]; + if (msg.role !== 'user') { + continue; + } + if (typeof msg.content === 'string') { + return msg.content.includes('[Voice message]:'); + } + if (!Array.isArray(msg.content)) { + return false; + } + return msg.content.some((part) => ( + part.type === 'text' + && typeof part.text === 'string' + && part.text.includes('[Voice message]:') + )); + } + return false; + } + + private getLatestTurnUserAudioInput(): AudioToolInput | null { + for (let i = this.history.length - 1; i >= 0; i--) { + const msg = this.history[i]; + if (msg.role !== 'user') { + continue; + } + if (!Array.isArray(msg.content)) { + return null; + } + for (const part of msg.content) { + if (part.type !== 'audio') { + continue; + } + const source = part.source; + if (typeof source.data === 'string' && source.data.length > 0 && typeof source.media_type === 'string' && source.media_type.length > 0) { + return { data: source.data, mime_type: source.media_type }; + } + } + return null; + } + return null; + } + + private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined { + if (typeof rawData !== 'string') { + return undefined; + } + const compact = rawData.replace(/\s+/g, ''); + if (compact.length === 0) { + return undefined; + } + if (!/^[A-Za-z0-9+/=]+$/.test(compact)) { + return undefined; + } + try { + const decoded = Buffer.from(compact, 'base64'); + if (decoded.length === 0) { + return undefined; + } + const mimeType = typeof rawMimeType === 'string' ? rawMimeType : undefined; + if (!this.matchesAudioSignature(decoded, mimeType)) { + return undefined; + } + return compact; + } catch { + return undefined; + } + } + + private matchesAudioSignature(buffer: Buffer, mimeType?: string): boolean { + const ascii = (offset: number, value: string): boolean => { + if (buffer.length < offset + value.length) { + return false; + } + return buffer.subarray(offset, offset + value.length).toString('ascii') === value; + }; + + if (!mimeType) { + return true; + } + + switch (mimeType) { + case 'audio/ogg': + return ascii(0, 'OggS'); + case 'audio/wav': + return ascii(0, 'RIFF') && ascii(8, 'WAVE'); + case 'audio/webm': + return buffer.length >= 4 + && buffer[0] === 0x1A + && buffer[1] === 0x45 + && buffer[2] === 0xDF + && buffer[3] === 0xA3; + case 'audio/mpeg': + case 'audio/mp3': + return ascii(0, 'ID3') + || (buffer.length >= 2 && buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0); + case 'audio/mp4': + case 'audio/x-m4a': + return ascii(4, 'ftyp'); + default: + return true; + } + } + + private normalizeAudioTranscribeUrlArg(rawUrl: unknown): string | undefined { + if (typeof rawUrl !== 'string') { + return undefined; + } + const trimmed = rawUrl.trim(); + if (trimmed.length === 0) { + return undefined; + } + if (!/^https?:\/\//i.test(trimmed)) { + return undefined; + } + return trimmed; + } + + private getLatestUserAudioInput(): AudioToolInput | null { + for (let i = this.history.length - 1; i >= 0; i--) { + const msg = this.history[i]; + if (msg.role !== 'user' || !Array.isArray(msg.content)) { + continue; + } + + for (const part of msg.content) { + if (part.type !== 'audio') { + continue; + } + const source = part.source; + if (typeof source.data === 'string' && source.data.length > 0 && typeof source.media_type === 'string' && source.media_type.length > 0) { + return { data: source.data, mime_type: source.media_type }; + } + } + } + + return this.getPersistedAudioInput(); + } + + private getPersistedAudioInput(): AudioToolInput | null { + const persisted = this.session?.getConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY); + if (!persisted) { + return null; + } + + try { + const parsed = JSON.parse(persisted) as { data?: unknown; url?: unknown; mimeType?: unknown }; + const data = typeof parsed.data === 'string' && parsed.data.length > 0 ? parsed.data : undefined; + const url = typeof parsed.url === 'string' && parsed.url.length > 0 ? parsed.url : undefined; + const mimeType = typeof parsed.mimeType === 'string' && parsed.mimeType.length > 0 ? parsed.mimeType : undefined; + if (!data && !url) { + return null; + } + return { + ...(data ? { data } : {}), + ...(url ? { url } : {}), + ...(mimeType ? { mime_type: mimeType } : {}), + }; + } catch { + return null; + } + } + private extractPseudoToolUse(content: string): PseudoToolUse | null { if (!content) { return null; diff --git a/src/daemon/routing.test.ts b/src/daemon/routing.test.ts index ed93de1..437c490 100644 --- a/src/daemon/routing.test.ts +++ b/src/daemon/routing.test.ts @@ -1351,7 +1351,7 @@ describe('daemon audio routing integration', () => { expect(String(msg.text)).toContain('audio transcription is not configured'); }); - it('transcribes voice attachments when transcription is configured, then strips audio before calling agent.process', async () => { + it('transcribes voice attachments when transcription is configured and preserves audio for anthropic tool fallback', async () => { const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok'); // Mock transcription endpoint call. @@ -1422,6 +1422,90 @@ describe('daemon audio routing integration', () => { timestamp: Date.now(), } as MessageRouterInput, reply); + expect(fetchSpy).toHaveBeenCalled(); + expect(processSpy).toHaveBeenCalledTimes(1); + const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? []; + expect(String(calledText)).toContain('[Voice message]: hello world'); + expect(String(calledText)).toContain('caption'); + const atts = calledAttachments as Array<{ mimeType: string }> | undefined; + expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(true); + expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true); + expect(session.setConfig).toHaveBeenCalledWith( + 'lastAudioAttachment', + expect.stringContaining('"mimeType":"audio/ogg"'), + ); + }); + + it('transcribes voice attachments when transcription is configured and strips audio for openai-compatible providers', async () => { + const processSpy = vi.spyOn(AgentOrchestrator.prototype, 'process').mockResolvedValue('ok'); + + const fetchSpy = vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + status: 200, + statusText: 'OK', + json: async () => ({ text: 'hello world' }), + } as Response); + + const session = { + id: 'telegram:user-voice-3', + addMessage: vi.fn(), + getHistory: vi.fn(() => []), + clear: vi.fn(), + replaceHistory: vi.fn(), + getConfig: vi.fn(() => undefined), + setConfig: vi.fn(), + deleteConfig: vi.fn(), + }; + + const commandRegistry = new CommandRegistry(); + registerBuiltinCommands(commandRegistry); + + const router = createMessageRouter({ + sessionManager: { getSession: vi.fn(() => session) } as unknown as MessageRouterDeps['sessionManager'], + modelRouter: { + getAvailableTiers: () => ['default'], + getAllLabels: () => ({ default: 'default' }), + getLabel: (tier: string) => tier, + } as unknown as MessageRouterDeps['modelRouter'], + systemPrompt: 'test prompt', + toolRegistry: { clone() { return this; }, register: vi.fn() } as unknown as MessageRouterDeps['toolRegistry'], + toolExecutor: {} as unknown as MessageRouterDeps['toolExecutor'], + config: { + agents: { + primary_tier: 'default', + delegation: { + compaction: 'default', + memory_extraction: 'default', + classification: 'default', + tool_summarisation: 'default', + complex_reasoning: 'default', + }, + max_delegation_depth: 1, + max_iterations: 3, + }, + compaction: { enabled: false }, + models: { default: { provider: 'openai', model: 'gpt-4.1', supports_audio: false } }, + audio: { + enabled: true, + provider: { type: 'openai', endpoint: 'https://example.com/v1/audio/transcriptions', api_key: 'sk-test', model: 'whisper-1' }, + }, + } as unknown as MessageRouterDeps['config'], + commandRegistry, + }); + + const reply = vi.fn(async (_message: OutboundMessage) => {}); + await router.handler({ + id: 'v3', + channel: 'telegram', + senderId: 'user-voice-3', + text: 'caption', + attachments: [ + { mimeType: 'audio/ogg', data: 'ZGF0YQ==', filename: 'voice.ogg' }, + { mimeType: 'image/jpeg', data: 'aW1n', filename: 'img.jpg' }, + ], + timestamp: Date.now(), + } as MessageRouterInput, reply); + expect(fetchSpy).toHaveBeenCalled(); expect(processSpy).toHaveBeenCalledTimes(1); const [calledText, calledAttachments] = processSpy.mock.calls[0] ?? []; @@ -1430,6 +1514,10 @@ describe('daemon audio routing integration', () => { const atts = calledAttachments as Array<{ mimeType: string }> | undefined; expect(atts?.some(a => a.mimeType === 'audio/ogg')).toBe(false); expect(atts?.some(a => a.mimeType === 'image/jpeg')).toBe(true); + expect(session.setConfig).toHaveBeenCalledWith( + 'lastAudioAttachment', + expect.stringContaining('"mimeType":"audio/ogg"'), + ); }); }); diff --git a/src/daemon/routing.ts b/src/daemon/routing.ts index f7e15d6..fbfc75b 100644 --- a/src/daemon/routing.ts +++ b/src/daemon/routing.ts @@ -164,6 +164,57 @@ function shouldForceNativeForCapabilityQuery(text: string): boolean { ); } +function providerAcceptsNativeAudioContentParts(provider: string): boolean { + return ( + provider === 'openai' + || provider === 'github' + || provider === 'gemini' + || provider === 'openrouter' + || provider === 'zhipuai' + || provider === 'xai' + || provider === 'minimax' + || provider === 'moonshot' + || provider === 'vercel' + ); +} + +const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment'; + +function persistLatestAudioAttachment( + session: { setConfig(key: string, value: string): void }, + audioAttachments: Attachment[], +): void { + const latest = [...audioAttachments].reverse().find((att) => ( + (typeof att.data === 'string' && att.data.length > 0) + || (typeof att.url === 'string' && att.url.length > 0) + )); + if (!latest) { + return; + } + + const payload: { data?: string; url?: string; mimeType?: string } = { + mimeType: latest.mimeType, + }; + if (typeof latest.data === 'string' && latest.data.length > 0) { + payload.data = latest.data; + } else if (typeof latest.url === 'string' && latest.url.length > 0) { + payload.url = latest.url; + } + + if (!payload.data && !payload.url) { + return; + } + + try { + session.setConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY, JSON.stringify(payload)); + } catch (error) { + console.warn( + 'Failed to persist latest audio attachment for tool hydration:', + error instanceof Error ? error.message : String(error), + ); + } +} + function isTtsEnabledForChannel(config: Config, channel: string): boolean { if (!config.tts?.enabled) { return false; @@ -1266,6 +1317,9 @@ export function createMessageRouter(deps: { let messageText = incomingText; let attachments = msg.attachments; const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a)); + if (audioAttachments.length > 0) { + persistLatestAudioAttachment(session, audioAttachments); + } if (audioAttachments.length > 0 && !nativeAudioSupported) { // Model doesn't support native audio — transcribe via Whisper and strip audio attachments @@ -1300,9 +1354,15 @@ export function createMessageRouter(deps: { const transcript = await transcribeAudio(att, audioConfig); messageText = `[Voice message]: ${transcript}\n\n${messageText}`; } - // Remove audio attachments so buildUserMessage doesn't create audio content parts - attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a)); - if (attachments.length === 0) { attachments = undefined; } + // For providers that cannot ingest native audio content parts (e.g. Anthropic), + // keep the original audio attachment available in the tool loop so + // audio.transcribe can still be hydrated from bytes if the model requests it. + // For providers that do accept native audio parts (OpenAI-compatible/Gemini), + // strip audio to avoid sending raw audio to a model tier that was marked as non-audio. + if (providerAcceptsNativeAudioContentParts(modelProvider)) { + attachments = (msg.attachments ?? []).filter((a: Attachment) => !isSupportedAudio(a)); + if (attachments.length === 0) { attachments = undefined; } + } } // If native audio IS supported, we pass attachments through unchanged — // buildUserMessage() in the agent will create native audio content parts diff --git a/src/tools/builtin/audio-transcribe.test.ts b/src/tools/builtin/audio-transcribe.test.ts index 3a970ba..3e614ee 100644 --- a/src/tools/builtin/audio-transcribe.test.ts +++ b/src/tools/builtin/audio-transcribe.test.ts @@ -51,6 +51,12 @@ describe('createAudioTranscribeTool', () => { expect(result.success).toBe(false); expect(result.error).toMatch(/Unsupported MIME type/); }); + + it('rejects invalid non-base64 data payloads', async () => { + const result = await tool.execute({ data: '[voice message data not provided]', mime_type: 'audio/ogg' }); + expect(result.success).toBe(false); + expect(result.error).toMatch(/valid base64/i); + }); }); describe('URL validation (SSRF protection)', () => { @@ -106,7 +112,7 @@ describe('createAudioTranscribeTool', () => { }); mockFetch.mockResolvedValueOnce({ ok: true, - json: async () => ({ text: 'hello' }), + text: async () => JSON.stringify({ text: 'hello' }), }); const result = await tool.execute({ url: 'https://example.com/audio.wav' }); @@ -136,7 +142,7 @@ describe('createAudioTranscribeTool', () => { it('transcribes base64 audio data', async () => { mockFetch.mockResolvedValueOnce({ ok: true, - json: async () => ({ text: 'Hello, world!' }), + text: async () => JSON.stringify({ text: 'Hello, world!' }), }); const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); @@ -152,7 +158,7 @@ describe('createAudioTranscribeTool', () => { it('sends Authorization header when apiKey is set', async () => { mockFetch.mockResolvedValueOnce({ ok: true, - json: async () => ({ text: 'test' }), + text: async () => JSON.stringify({ text: 'test' }), }); await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/ogg' }); @@ -164,7 +170,7 @@ describe('createAudioTranscribeTool', () => { it('passes language and prompt parameters', async () => { mockFetch.mockResolvedValueOnce({ ok: true, - json: async () => ({ text: 'Hola mundo' }), + text: async () => JSON.stringify({ text: 'Hola mundo' }), }); const result = await tool.execute({ @@ -176,6 +182,28 @@ describe('createAudioTranscribeTool', () => { expect(result.success).toBe(true); expect(result.output).toBe('Hola mundo'); }); + + it('accepts plain-text transcription responses', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + text: async () => 'Plain transcript', + }); + + const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); + expect(result.success).toBe(true); + expect(result.output).toBe('Plain transcript'); + }); + + it('returns a no-speech placeholder for empty transcript text', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + text: async () => JSON.stringify({ text: '' }), + }); + + const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); + expect(result.success).toBe(true); + expect(result.output).toBe('[No speech detected]'); + }); }); describe('URL-based transcription', () => { @@ -191,7 +219,7 @@ describe('createAudioTranscribeTool', () => { // Second fetch: transcription API mockFetch.mockResolvedValueOnce({ ok: true, - json: async () => ({ text: 'URL transcription result' }), + text: async () => JSON.stringify({ text: 'URL transcription result' }), }); const result = await tool.execute({ url: 'https://cdn.example.com/audio.mp3' }); @@ -207,7 +235,7 @@ describe('createAudioTranscribeTool', () => { }); mockFetch.mockResolvedValueOnce({ ok: true, - json: async () => ({ text: 'ogg result' }), + text: async () => JSON.stringify({ text: 'ogg result' }), }); const result = await tool.execute({ url: 'https://cdn.example.com/voice' }); @@ -287,5 +315,27 @@ describe('createAudioTranscribeTool', () => { expect(result.success).toBe(false); expect(result.error).toMatch(/ECONNREFUSED/); }); + + it('returns clear error when transcription payload has no text field', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + text: async () => JSON.stringify({ id: 'abc123', status: 'ok' }), + }); + + const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); + expect(result.success).toBe(false); + expect(result.error).toMatch(/missing text field/i); + }); + + it('surfaces endpoint error payloads', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + text: async () => JSON.stringify({ error: { message: 'model not loaded' } }), + }); + + const result = await tool.execute({ data: 'AAAAAAA=', mime_type: 'audio/wav' }); + expect(result.success).toBe(false); + expect(result.error).toMatch(/endpoint error: model not loaded/i); + }); }); }); diff --git a/src/tools/builtin/audio-transcribe.ts b/src/tools/builtin/audio-transcribe.ts index 8423452..fed70a0 100644 --- a/src/tools/builtin/audio-transcribe.ts +++ b/src/tools/builtin/audio-transcribe.ts @@ -51,6 +51,22 @@ function validateUrl(url: string): { valid: boolean; error?: string } { } function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: string } { + if (args.data !== undefined && typeof args.data !== 'string') { + return { valid: false, error: 'data must be a base64 string when provided' }; + } + if (args.url !== undefined && typeof args.url !== 'string') { + return { valid: false, error: 'url must be a string when provided' }; + } + if (args.mime_type !== undefined && typeof args.mime_type !== 'string') { + return { valid: false, error: 'mime_type must be a string when provided' }; + } + if (args.language !== undefined && typeof args.language !== 'string') { + return { valid: false, error: 'language must be a string when provided' }; + } + if (args.prompt !== undefined && typeof args.prompt !== 'string') { + return { valid: false, error: 'prompt must be a string when provided' }; + } + const hasData = args.data !== undefined && args.data !== ''; const hasUrl = args.url !== undefined && args.url !== ''; @@ -62,6 +78,22 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str return { valid: false, error: 'Only one of data or url can be provided' }; } + if (hasData) { + const compact = (args.data ?? '').replace(/\s+/g, ''); + const isBase64 = /^[A-Za-z0-9+/=]+$/.test(compact); + let hasDecodedBytes = false; + if (isBase64) { + try { + hasDecodedBytes = Buffer.from(compact, 'base64').length > 0; + } catch { + hasDecodedBytes = false; + } + } + if (!isBase64 || !hasDecodedBytes) { + return { valid: false, error: 'data must be valid base64-encoded audio bytes' }; + } + } + if (hasData && !args.mime_type) { return { valid: false, error: 'mime_type is required when using data' }; } @@ -84,6 +116,131 @@ function validateInput(args: AudioTranscribeArgs): { valid: boolean; error?: str return { valid: true }; } +function extractTranscriptionText(payload: unknown): string | undefined { + if (typeof payload === 'string') { + return payload; + } + if (!payload || typeof payload !== 'object') { + return undefined; + } + + const obj = payload as Record; + const directKeys = ['text', 'transcript', 'transcription', 'output']; + for (const key of directKeys) { + const value = obj[key]; + if (typeof value === 'string') { + return value; + } + } + + if (obj.result && typeof obj.result === 'object') { + const resultObj = obj.result as Record; + const nested = resultObj.text ?? resultObj.transcript; + if (typeof nested === 'string') { + return nested; + } + } + + if (obj.data && typeof obj.data === 'object') { + const dataObj = obj.data as Record; + const nested = dataObj.text ?? dataObj.transcript; + if (typeof nested === 'string') { + return nested; + } + } + + if (Array.isArray(obj.results)) { + for (const result of obj.results) { + if (!result || typeof result !== 'object') { + continue; + } + const resultObj = result as Record; + if (typeof resultObj.text === 'string') { + return resultObj.text; + } + + if (Array.isArray(resultObj.alternatives)) { + for (const alternative of resultObj.alternatives) { + if (!alternative || typeof alternative !== 'object') { + continue; + } + const altObj = alternative as Record; + const altTranscript = altObj.transcript ?? altObj.text; + if (typeof altTranscript === 'string') { + return altTranscript; + } + } + } + } + } + + if (Array.isArray(obj.segments)) { + const joined = obj.segments + .map((segment) => (segment && typeof segment === 'object' + ? (segment as Record).text + : undefined)) + .filter((v): v is string => typeof v === 'string' && v.trim().length > 0) + .join(' '); + if (joined.trim().length > 0) { + return joined; + } + } + + return undefined; +} + +function extractTranscriptionError(payload: unknown): string | undefined { + if (!payload || typeof payload !== 'object') { + return undefined; + } + + const obj = payload as Record; + if (typeof obj.error === 'string' && obj.error.trim().length > 0) { + return obj.error; + } + + if (obj.error && typeof obj.error === 'object') { + const errorObj = obj.error as Record; + const message = errorObj.message ?? errorObj.error; + if (typeof message === 'string' && message.trim().length > 0) { + return message; + } + } + + if (typeof obj.detail === 'string' && obj.detail.trim().length > 0) { + return obj.detail; + } + + if (typeof obj.message === 'string' && obj.message.trim().length > 0) { + return obj.message; + } + + return undefined; +} + +function truncateForError(text: string, max = 180): string { + const normalized = text.replace(/\s+/g, ' ').trim(); + if (normalized.length <= max) { + return normalized; + } + return `${normalized.slice(0, max)}...`; +} + +async function readResponseBody(response: Response): Promise { + const textReader = response.text as unknown; + if (typeof textReader === 'function') { + return await response.text(); + } + + const maybeJsonResponse = response as unknown as { json?: () => Promise }; + if (typeof maybeJsonResponse.json === 'function') { + const jsonPayload = await maybeJsonResponse.json(); + return JSON.stringify(jsonPayload); + } + + return ''; +} + interface AudioTranscriptionConfig { endpoint?: string; apiKey?: string; @@ -146,7 +303,9 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig if (args.data) { const rawBuffer = Buffer.from(args.data, 'base64'); - const audioBuffer = rawBuffer.buffer; + if (rawBuffer.length === 0) { + throw new Error('Decoded audio data is empty'); + } const extMap: Record = { 'audio/ogg': 'ogg', @@ -161,7 +320,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig filename = `audio.${ext}`; const mimeType = args.mime_type ?? 'audio/wav'; - audioBlob = new Blob([audioBuffer], { type: mimeType }); + audioBlob = new Blob([rawBuffer], { type: mimeType }); } else if (args.url) { const response = await fetch(args.url); if (!response.ok) { @@ -204,6 +363,7 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig const formData = new FormData(); formData.append('file', audioBlob, filename); formData.append('model', model); + formData.append('response_format', 'json'); if (args.language) { formData.append('language', args.language); @@ -234,10 +394,30 @@ export function createAudioTranscribeTool(audioConfig?: AudioTranscriptionConfig throw new Error(`Transcription request failed (${response.status}): ${errorText}`); } - const json = await response.json() as { text: string }; + const rawBody = await readResponseBody(response); + const trimmedBody = rawBody.trim(); + let payload: unknown = rawBody; + if (trimmedBody.startsWith('{') || trimmedBody.startsWith('[')) { + try { + payload = JSON.parse(rawBody) as unknown; + } catch { + payload = rawBody; + } + } + + const transcript = extractTranscriptionText(payload); + if (transcript === undefined) { + const endpointError = extractTranscriptionError(payload); + if (endpointError) { + throw new Error(`Transcription endpoint error: ${endpointError}`); + } + throw new Error(`Transcription response missing text field (body: ${truncateForError(rawBody)})`); + } + + const normalizedTranscript = transcript.trim().length > 0 ? transcript : '[No speech detected]'; return { success: true, - output: json.text, + output: normalizedTranscript, }; } catch (error) { return { diff --git a/src/tools/executor.test.ts b/src/tools/executor.test.ts index 9a1e58c..2c11f85 100644 --- a/src/tools/executor.test.ts +++ b/src/tools/executor.test.ts @@ -35,6 +35,13 @@ const bigOutputTool: Tool = { execute: async () => ({ success: true, output: 'x'.repeat(100_000) }), }; +const malformedOutputTool: Tool = { + name: 'test.malformed_output', + description: 'Returns non-string output at runtime', + inputSchema: { type: 'object', properties: {} }, + execute: async () => ({ success: true, output: undefined as unknown as string }), +}; + const fileWriteLikeTool: Tool = { name: 'file.write', description: 'Test file write tool', @@ -191,6 +198,17 @@ describe('ToolExecutor', () => { expect(result.output).toContain('[truncated]'); }); + it('normalizes non-string output without throwing', async () => { + const registry = new ToolRegistry(); + registry.register(malformedOutputTool); + const hooks = new HookEngine({ confirm: [], log: [], silent: [] }); + const executor = new ToolExecutor(registry, hooks); + + const result = await executor.execute('test.malformed_output', {}); + expect(result.success).toBe(true); + expect(result.output).toBe(''); + }); + it('clears timeout timer after fast tool completion', async () => { vi.useFakeTimers(); try { diff --git a/src/tools/executor.ts b/src/tools/executor.ts index 983c0d4..5e2e32a 100644 --- a/src/tools/executor.ts +++ b/src/tools/executor.ts @@ -342,6 +342,14 @@ export class ToolExecutor { const duration = Date.now() - startTime; + // Defensive normalization: tool implementations should return string output, + // but third-party/custom tools can violate this at runtime. + if (typeof result.output !== 'string') { + result.output = result.output === undefined || result.output === null + ? '' + : String(result.output); + } + // Truncate output if too large if (result.output.length > this.maxOutputBytes) { result.output = result.output.slice(0, this.maxOutputBytes) + '\n[truncated]';