Bind audio.transcribe hydration to current message turn
This commit is contained in:
@@ -323,6 +323,79 @@ describe('NativeAgent tool loop', () => {
|
||||
}));
|
||||
});
|
||||
|
||||
// Verifies that audio supplied with the current process() call is what reaches
// the audio.transcribe tool, even though the session also exposes an older
// persisted attachment under the 'lastAudioAttachment' config key.
it('prefers per-turn audio input over persisted fallback during voice transcript turns', async () => {
  let callCount = 0;
  // Captures the arguments the tool is actually executed with.
  let seenArgs: Record<string, unknown> | undefined;

  // First model reply requests the audio tool with empty args; every later
  // reply ends the turn with plain text.
  const mockClient: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      callCount++;
      if (callCount === 1) {
        return {
          content: '',
          stopReason: 'tool_use',
          usage: { inputTokens: 10, outputTokens: 5 },
          // NOTE(review): the call uses 'audio_transcribe' while the tool is
          // registered as 'audio.transcribe'; presumably the agent maps between
          // the two spellings — confirm against the agent's name handling.
          toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
        };
      }
      return {
        content: 'done',
        stopReason: 'end_turn',
        usage: { inputTokens: 15, outputTokens: 10 },
      };
    }),
  };

  const mockSession = {
    id: 'telegram:user-audio',
    getHistory: vi.fn().mockReturnValue([
      { role: 'user', content: '[Voice message]: old transcript' },
    ]),
    addMessage: vi.fn(),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    // Persisted fallback: 'T0xEX0FVRElP' is base64 for "OLD_AUDIO".
    getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment'
      ? JSON.stringify({ data: 'T0xEX0FVRElP', mimeType: 'audio/ogg' })
      : undefined)),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };

  const audioTool: Tool = {
    name: 'audio.transcribe',
    description: 'Transcribe audio',
    inputSchema: { type: 'object', properties: {} },
    execute: async (args) => {
      seenArgs = args as Record<string, unknown>;
      return { success: true, output: 'transcript' };
    },
  };

  const registry = new ToolRegistry();
  registry.register(audioTool);
  const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
  const executor = new ToolExecutor(registry, hooks);

  const agent = new NativeAgent({
    modelClient: mockClient,
    systemPrompt: 'You are helpful.',
    session: mockSession,
    toolRegistry: registry,
    toolExecutor: executor,
  });

  // Per-turn audio: 'TkVXX0FVRElP' is base64 for "NEW_AUDIO".
  const response = await agent.process(
    'Please transcribe this',
    undefined,
    { data: 'TkVXX0FVRElP', mime_type: 'audio/ogg' },
  );

  expect(response).toBe('done');
  // The tool must have been hydrated from the per-turn input, not the
  // persisted "OLD_AUDIO" attachment.
  expect(seenArgs).toEqual(expect.objectContaining({
    data: 'TkVXX0FVRElP',
    mime_type: 'audio/ogg',
  }));
});
|
||||
|
||||
it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
|
||||
let callCount = 0;
|
||||
let seenArgs: Record<string, unknown> | undefined;
|
||||
|
||||
@@ -77,6 +77,12 @@ interface AudioToolArgSummary {
|
||||
mimeType?: string;
|
||||
}
|
||||
|
||||
/**
 * Audio payload supplied alongside a single `NativeAgent.process()` call.
 *
 * All members are optional at the type level; `normalizeTurnAudioInput`
 * discards an input that has neither a non-empty `data` nor a non-empty `url`.
 */
export interface NativeAgentTurnAudioInput {
  /** Inline audio content as a string (presumably base64 — confirm with callers). */
  data?: string;
  /** Location of the audio, used instead of or in addition to `data`. */
  url?: string;
  /** MIME type of the audio, e.g. `audio/ogg`. */
  mime_type?: string;
}
|
||||
|
||||
export class NativeAgent {
|
||||
private static readonly EMPTY_RESPONSE_FALLBACK =
|
||||
'I could not generate a response for that. Please try again.';
|
||||
@@ -100,6 +106,7 @@ export class NativeAgent {
|
||||
private _runInProgress = false;
|
||||
private _runAbortController?: AbortController;
|
||||
private modelTimeoutMs: number;
|
||||
private _currentTurnAudioInput?: AudioToolInput;
|
||||
|
||||
constructor(config: NativeAgentConfig) {
|
||||
this.modelClient = config.modelClient;
|
||||
@@ -120,9 +127,14 @@ export class NativeAgent {
|
||||
return this.session?.getHistory() ?? [...this.inMemoryHistory];
|
||||
}
|
||||
|
||||
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
|
||||
async process(
|
||||
userMessage: string,
|
||||
attachments?: Attachment[],
|
||||
turnAudioInput?: NativeAgentTurnAudioInput,
|
||||
): Promise<string> {
|
||||
this._cancelRequested = false;
|
||||
this._runAbortController = new AbortController();
|
||||
this._currentTurnAudioInput = this.normalizeTurnAudioInput(turnAudioInput) ?? this.extractLatestAudioInputFromAttachments(attachments);
|
||||
if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') {
|
||||
this.modelClient.clearAbort();
|
||||
}
|
||||
@@ -162,6 +174,7 @@ export class NativeAgent {
|
||||
this._runInProgress = false;
|
||||
this._cancelRequested = false;
|
||||
this._runAbortController = undefined;
|
||||
this._currentTurnAudioInput = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -649,6 +662,12 @@ export class NativeAgent {
|
||||
: {};
|
||||
const original = this.summarizeAudioToolArgs(args);
|
||||
|
||||
if (this._currentTurnAudioInput) {
|
||||
this.applyAudioToolInput(args, this._currentTurnAudioInput);
|
||||
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
|
||||
return args;
|
||||
}
|
||||
|
||||
const latestTurnAudio = this.getLatestTurnUserAudioInput();
|
||||
if (latestTurnAudio) {
|
||||
this.applyAudioToolInput(args, latestTurnAudio);
|
||||
@@ -794,6 +813,56 @@ export class NativeAgent {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Converts the caller-supplied per-turn audio input into an `AudioToolInput`.
 *
 * @param turnAudioInput - Audio passed to `process()` for the current turn, if any.
 * @returns An object carrying only the fields that were non-empty strings, or
 *   `undefined` when no input was given or it has neither usable `data` nor `url`.
 */
private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined {
  if (!turnAudioInput) {
    return undefined;
  }

  // A field counts as present only when it is a non-empty string; the runtime
  // typeof check guards against callers that bypass the declared types.
  const nonEmptyString = (value: unknown): string | undefined => (
    typeof value === 'string' && value.length > 0 ? value : undefined
  );

  const data = nonEmptyString(turnAudioInput.data);
  const url = nonEmptyString(turnAudioInput.url);
  if (!data && !url) {
    // A MIME type alone identifies no audio, so there is nothing to bind.
    return undefined;
  }
  const mimeType = nonEmptyString(turnAudioInput.mime_type);

  // Conditional spreads keep absent fields out of the result entirely rather
  // than emitting keys whose value is undefined.
  return {
    ...(data ? { data } : {}),
    ...(url ? { url } : {}),
    ...(mimeType ? { mime_type: mimeType } : {}),
  };
}
|
||||
|
||||
/**
 * Finds the most recent usable audio attachment and converts it into an
 * `AudioToolInput`.
 *
 * Attachments are scanned from last to first. An attachment qualifies when its
 * MIME type starts with `audio/` and it carries a non-empty `data` or `url`.
 *
 * @param attachments - Attachments received with the current message, if any.
 * @returns The converted audio input, or `undefined` when nothing qualifies.
 */
private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined {
  if (!attachments || attachments.length === 0) {
    return undefined;
  }

  for (let i = attachments.length - 1; i >= 0; i--) {
    const attachment = attachments[i];
    // Skip array holes and entries without a string MIME type instead of
    // throwing on `.startsWith`; one malformed entry must not abort the turn.
    if (!attachment || typeof attachment.mimeType !== 'string') {
      continue;
    }
    if (!attachment.mimeType.startsWith('audio/')) {
      continue;
    }

    const data = typeof attachment.data === 'string' && attachment.data.length > 0
      ? attachment.data
      : undefined;
    const url = typeof attachment.url === 'string' && attachment.url.length > 0
      ? attachment.url
      : undefined;
    if (!data && !url) {
      // Audio-typed but empty: keep looking at earlier attachments.
      continue;
    }

    return {
      ...(data ? { data } : {}),
      ...(url ? { url } : {}),
      mime_type: attachment.mimeType,
    };
  }

  return undefined;
}
|
||||
|
||||
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
|
||||
if (typeof rawData !== 'string') {
|
||||
return undefined;
|
||||
|
||||
@@ -7,6 +7,7 @@ import type { MemoryStore } from '../../memory/store.js';
|
||||
import type { ToolPolicyContext } from '../../tools/policy.js';
|
||||
import type { Attachment } from '../../channels/types.js';
|
||||
import { NativeAgent } from './agent.js';
|
||||
import type { NativeAgentTurnAudioInput } from './agent.js';
|
||||
import type { ToolUseEvent } from './agent.js';
|
||||
import type { OutboundAttachmentCollector } from './attachments.js';
|
||||
import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js';
|
||||
@@ -339,7 +340,11 @@ export class AgentOrchestrator {
|
||||
* When compaction is configured, checks whether the conversation history
|
||||
* exceeds the context window threshold and compacts it before processing.
|
||||
*/
|
||||
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
|
||||
async process(
|
||||
userMessage: string,
|
||||
attachments?: Attachment[],
|
||||
turnAudioInput?: NativeAgentTurnAudioInput,
|
||||
): Promise<string> {
|
||||
this._activeRunToolStarts = 0;
|
||||
this._injectMemoryContext(userMessage);
|
||||
await this._runProactiveContextMaintenance();
|
||||
@@ -352,10 +357,10 @@ export class AgentOrchestrator {
|
||||
|
||||
let result: string;
|
||||
try {
|
||||
result = await this._agent.process(userMessage, attachments);
|
||||
result = await this._agent.process(userMessage, attachments, turnAudioInput);
|
||||
} catch {
|
||||
this._restoreHistory(before);
|
||||
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier);
|
||||
const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
|
||||
if (escalated) {
|
||||
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
|
||||
return escalated;
|
||||
@@ -383,7 +388,7 @@ export class AgentOrchestrator {
|
||||
if (ctx) {
|
||||
// Attempt: compact + hard-trim to fit the discovered context window, then retry once.
|
||||
await this._compactAndTrimToFit(ctx);
|
||||
const retry = await this._agent.process(userMessage, attachments);
|
||||
const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
|
||||
if (!this._isToolLoopErrorMessage(retry)) {
|
||||
return retry;
|
||||
}
|
||||
@@ -391,7 +396,7 @@ export class AgentOrchestrator {
|
||||
this._restoreHistory(before);
|
||||
}
|
||||
|
||||
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier);
|
||||
const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
|
||||
if (escalated) {
|
||||
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
|
||||
return escalated;
|
||||
@@ -419,6 +424,7 @@ export class AgentOrchestrator {
|
||||
private async _retryWithEscalation(
|
||||
userMessage: string,
|
||||
attachments: Attachment[] | undefined,
|
||||
turnAudioInput: NativeAgentTurnAudioInput | undefined,
|
||||
historyBefore: Message[],
|
||||
originalTier: ModelTier,
|
||||
): Promise<string | null> {
|
||||
@@ -437,7 +443,7 @@ export class AgentOrchestrator {
|
||||
|
||||
this._agent.setModelTier(targetTier);
|
||||
try {
|
||||
const retry = await this._agent.process(userMessage, attachments);
|
||||
const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
|
||||
if (!this._isToolLoopErrorMessage(retry)) {
|
||||
return retry;
|
||||
}
|
||||
|
||||
@@ -656,7 +656,7 @@ describe('daemon command fast-path integration', () => {
|
||||
|
||||
const keys = Array.from(router.agents.keys());
|
||||
expect(keys.some(key => key.includes(':research'))).toBe(true);
|
||||
expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined);
|
||||
expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined, undefined);
|
||||
});
|
||||
|
||||
it('falls back to llm path when confidence is below fast threshold', async () => {
|
||||
@@ -1938,6 +1938,6 @@ describe('daemon talk mode (voice wake) integration', () => {
|
||||
timestamp: Date.now(),
|
||||
} as MessageRouterInput, reply);
|
||||
expect(processSpy).toHaveBeenCalledOnce();
|
||||
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined);
|
||||
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined, undefined);
|
||||
});
|
||||
});
|
||||
|
||||
+23
-2
@@ -215,6 +215,26 @@ function persistLatestAudioAttachment(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the audio tool input for the current turn from the message's audio
 * attachments.
 *
 * The list is scanned from last to first and the first attachment that carries
 * a non-empty `data` or `url` string wins. The input array is not copied or
 * modified.
 *
 * @param audioAttachments - Attachments already filtered down to audio by the caller.
 * @returns `data` and/or `url` of the chosen attachment together with its MIME
 *   type, or `undefined` when no attachment has a usable payload.
 */
function extractLatestAudioToolInput(audioAttachments: Attachment[]): { data?: string; url?: string; mime_type?: string } | undefined {
  for (let index = audioAttachments.length - 1; index >= 0; index--) {
    const attachment = audioAttachments[index];
    if (!attachment) {
      // Array hole: nothing to inspect.
      continue;
    }

    const data = typeof attachment.data === 'string' && attachment.data.length > 0
      ? attachment.data
      : undefined;
    const url = typeof attachment.url === 'string' && attachment.url.length > 0
      ? attachment.url
      : undefined;
    if (!data && !url) {
      // Empty payload: fall back to an earlier attachment.
      continue;
    }

    return {
      ...(data ? { data } : {}),
      ...(url ? { url } : {}),
      mime_type: attachment.mimeType,
    };
  }

  return undefined;
}
|
||||
|
||||
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
|
||||
if (!config.tts?.enabled) {
|
||||
return false;
|
||||
@@ -1317,6 +1337,7 @@ export function createMessageRouter(deps: {
|
||||
let messageText = incomingText;
|
||||
let attachments = msg.attachments;
|
||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||
const turnAudioToolInput = extractLatestAudioToolInput(audioAttachments);
|
||||
if (audioAttachments.length > 0) {
|
||||
persistLatestAudioAttachment(session, audioAttachments);
|
||||
}
|
||||
@@ -1424,7 +1445,7 @@ export function createMessageRouter(deps: {
|
||||
let response: string;
|
||||
activeRuns.set(sessionIdForRun, agent);
|
||||
try {
|
||||
response = await agent.process(messageText, attachments);
|
||||
response = await agent.process(messageText, attachments, turnAudioToolInput);
|
||||
} catch (error) {
|
||||
const currentTier = agent.getModelTier();
|
||||
const canEscalate = deps.config.agents.auto_escalate && currentTier !== 'complex';
|
||||
@@ -1434,7 +1455,7 @@ export function createMessageRouter(deps: {
|
||||
|
||||
console.warn(`Auto-escalating session ${msg.channel}:${msg.senderId} from ${currentTier} to complex after processing failure.`);
|
||||
agent.setModelTier('complex');
|
||||
response = await agent.process(messageText, attachments);
|
||||
response = await agent.process(messageText, attachments, turnAudioToolInput);
|
||||
}
|
||||
const outboundAttachments = collector.drain();
|
||||
const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
|
||||
|
||||
Reference in New Issue
Block a user