Bind audio.transcribe hydration to current message turn
This commit is contained in:
@@ -323,6 +323,79 @@ describe('NativeAgent tool loop', () => {
|
|||||||
}));
|
}));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('prefers per-turn audio input over persisted fallback during voice transcript turns', async () => {
  // Base64 fixtures: 'T0xEX0FVRElP' decodes to "OLD_AUDIO" (persisted in session config),
  // 'TkVXX0FVRElP' decodes to "NEW_AUDIO" (supplied with the current turn).
  const persistedAudio = JSON.stringify({ data: 'T0xEX0FVRElP', mimeType: 'audio/ogg' });

  let capturedArgs: Record<string, unknown> | undefined;
  let chatCalls = 0;

  // The first model reply requests the tool with empty args; any later reply ends the turn.
  // NOTE(review): the model-side call is named 'audio_transcribe' while the tool registers as
  // 'audio.transcribe' — presumably the agent maps between the two spellings; confirm.
  const mockClient: ModelClient = {
    chat: vi.fn().mockImplementation(() => {
      chatCalls += 1;
      return chatCalls === 1
        ? {
            content: '',
            stopReason: 'tool_use',
            usage: { inputTokens: 10, outputTokens: 5 },
            toolCalls: [{ id: 'call_1', name: 'audio_transcribe', args: {} }],
          }
        : {
            content: 'done',
            stopReason: 'end_turn',
            usage: { inputTokens: 15, outputTokens: 10 },
          };
    }),
  };

  // Session stub: history holds an earlier voice transcript and the config store still
  // exposes the older audio attachment under 'lastAudioAttachment'.
  const mockSession = {
    id: 'telegram:user-audio',
    getHistory: vi.fn().mockReturnValue([
      { role: 'user', content: '[Voice message]: old transcript' },
    ]),
    addMessage: vi.fn(),
    clear: vi.fn(),
    replaceHistory: vi.fn(),
    getConfig: vi.fn((key: string) => (key === 'lastAudioAttachment' ? persistedAudio : undefined)),
    setConfig: vi.fn(),
    deleteConfig: vi.fn(),
  };

  // Tool stub that records whatever arguments it is finally executed with.
  const audioTool: Tool = {
    name: 'audio.transcribe',
    description: 'Transcribe audio',
    inputSchema: { type: 'object', properties: {} },
    execute: async (args) => {
      capturedArgs = args as Record<string, unknown>;
      return { success: true, output: 'transcript' };
    },
  };

  const registry = new ToolRegistry();
  registry.register(audioTool);
  const hooks = new HookEngine({ confirm: [], log: [], silent: [] });
  const executor = new ToolExecutor(registry, hooks);

  const agent = new NativeAgent({
    modelClient: mockClient,
    systemPrompt: 'You are helpful.',
    session: mockSession,
    toolRegistry: registry,
    toolExecutor: executor,
  });

  // No attachments are passed; the audio arrives only through the per-turn argument.
  const response = await agent.process(
    'Please transcribe this',
    undefined,
    { data: 'TkVXX0FVRElP', mime_type: 'audio/ogg' },
  );

  expect(response).toBe('done');
  // The tool must have been hydrated with the current turn's audio, not the persisted one.
  expect(capturedArgs).toEqual(expect.objectContaining({
    data: 'TkVXX0FVRElP',
    mime_type: 'audio/ogg',
  }));
});
|
||||||
|
|
||||||
it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
|
it('replaces invalid file:// audio.transcribe url with persisted session audio data', async () => {
|
||||||
let callCount = 0;
|
let callCount = 0;
|
||||||
let seenArgs: Record<string, unknown> | undefined;
|
let seenArgs: Record<string, unknown> | undefined;
|
||||||
|
|||||||
@@ -77,6 +77,12 @@ interface AudioToolArgSummary {
|
|||||||
mimeType?: string;
|
mimeType?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Audio reference supplied by the caller for a single `process()` turn.
 *
 * All members are optional; `normalizeTurnAudioInput` discards the value
 * entirely when neither a non-empty `data` nor a non-empty `url` is present.
 */
export interface NativeAgentTurnAudioInput {
  /** Inline audio payload as a string (presumably base64 — confirm against callers). */
  data?: string;
  /** Location of the audio when it is not carried inline. */
  url?: string;
  /** MIME type of the audio, e.g. `audio/ogg`. */
  mime_type?: string;
}
|
||||||
|
|
||||||
export class NativeAgent {
|
export class NativeAgent {
|
||||||
private static readonly EMPTY_RESPONSE_FALLBACK =
|
private static readonly EMPTY_RESPONSE_FALLBACK =
|
||||||
'I could not generate a response for that. Please try again.';
|
'I could not generate a response for that. Please try again.';
|
||||||
@@ -100,6 +106,7 @@ export class NativeAgent {
|
|||||||
private _runInProgress = false;
|
private _runInProgress = false;
|
||||||
private _runAbortController?: AbortController;
|
private _runAbortController?: AbortController;
|
||||||
private modelTimeoutMs: number;
|
private modelTimeoutMs: number;
|
||||||
|
private _currentTurnAudioInput?: AudioToolInput;
|
||||||
|
|
||||||
constructor(config: NativeAgentConfig) {
|
constructor(config: NativeAgentConfig) {
|
||||||
this.modelClient = config.modelClient;
|
this.modelClient = config.modelClient;
|
||||||
@@ -120,9 +127,14 @@ export class NativeAgent {
|
|||||||
return this.session?.getHistory() ?? [...this.inMemoryHistory];
|
return this.session?.getHistory() ?? [...this.inMemoryHistory];
|
||||||
}
|
}
|
||||||
|
|
||||||
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
|
async process(
|
||||||
|
userMessage: string,
|
||||||
|
attachments?: Attachment[],
|
||||||
|
turnAudioInput?: NativeAgentTurnAudioInput,
|
||||||
|
): Promise<string> {
|
||||||
this._cancelRequested = false;
|
this._cancelRequested = false;
|
||||||
this._runAbortController = new AbortController();
|
this._runAbortController = new AbortController();
|
||||||
|
this._currentTurnAudioInput = this.normalizeTurnAudioInput(turnAudioInput) ?? this.extractLatestAudioInputFromAttachments(attachments);
|
||||||
if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') {
|
if ('clearAbort' in this.modelClient && typeof this.modelClient.clearAbort === 'function') {
|
||||||
this.modelClient.clearAbort();
|
this.modelClient.clearAbort();
|
||||||
}
|
}
|
||||||
@@ -162,6 +174,7 @@ export class NativeAgent {
|
|||||||
this._runInProgress = false;
|
this._runInProgress = false;
|
||||||
this._cancelRequested = false;
|
this._cancelRequested = false;
|
||||||
this._runAbortController = undefined;
|
this._runAbortController = undefined;
|
||||||
|
this._currentTurnAudioInput = undefined;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -649,6 +662,12 @@ export class NativeAgent {
|
|||||||
: {};
|
: {};
|
||||||
const original = this.summarizeAudioToolArgs(args);
|
const original = this.summarizeAudioToolArgs(args);
|
||||||
|
|
||||||
|
if (this._currentTurnAudioInput) {
|
||||||
|
this.applyAudioToolInput(args, this._currentTurnAudioInput);
|
||||||
|
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
|
||||||
|
return args;
|
||||||
|
}
|
||||||
|
|
||||||
const latestTurnAudio = this.getLatestTurnUserAudioInput();
|
const latestTurnAudio = this.getLatestTurnUserAudioInput();
|
||||||
if (latestTurnAudio) {
|
if (latestTurnAudio) {
|
||||||
this.applyAudioToolInput(args, latestTurnAudio);
|
this.applyAudioToolInput(args, latestTurnAudio);
|
||||||
@@ -794,6 +813,56 @@ export class NativeAgent {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts caller-supplied per-turn audio into the shape handed to the audio tool.
 *
 * Empty strings and non-string values are treated as absent.
 *
 * @param turnAudioInput - Audio reference passed to `process()`, if any.
 * @returns An object carrying only the populated fields, or `undefined` when the
 *   input is missing or has neither a usable `data` nor a usable `url`.
 */
private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined {
  if (turnAudioInput === undefined || turnAudioInput === null) {
    return undefined;
  }

  // Keeps a value only when it is a string with at least one character.
  const pickNonEmpty = (candidate: unknown): string | undefined => {
    if (typeof candidate !== 'string') {
      return undefined;
    }
    return candidate.length > 0 ? candidate : undefined;
  };

  const payload = pickNonEmpty(turnAudioInput.data);
  const location = pickNonEmpty(turnAudioInput.url);
  const contentType = pickNonEmpty(turnAudioInput.mime_type);

  // A MIME type alone does not identify any audio, so it is not enough to keep the input.
  if (payload === undefined && location === undefined) {
    return undefined;
  }

  return {
    ...(payload === undefined ? {} : { data: payload }),
    ...(location === undefined ? {} : { url: location }),
    ...(contentType === undefined ? {} : { mime_type: contentType }),
  };
}
|
||||||
|
|
||||||
|
/**
 * Picks the audio tool input from the last usable audio attachment of the turn.
 *
 * Attachments are scanned from the end of the list towards the start. An attachment
 * qualifies when its MIME type starts with `audio/` and it has a non-empty string
 * `data` or `url`.
 *
 * @param attachments - Attachments of the current message, if any.
 * @returns The populated `data` / `url` plus the attachment's MIME type, or
 *   `undefined` when no attachment qualifies.
 */
private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined {
  // Keeps a value only when it is a string with at least one character.
  const pickNonEmpty = (candidate: unknown): string | undefined =>
    typeof candidate === 'string' && candidate.length > 0 ? candidate : undefined;

  // Iterate over a reversed copy so the most recently listed attachment is seen first.
  const newestFirst = [...(attachments ?? [])].reverse();
  for (const candidate of newestFirst) {
    if (!candidate.mimeType.startsWith('audio/')) {
      continue;
    }

    const payload = pickNonEmpty(candidate.data);
    const location = pickNonEmpty(candidate.url);
    if (payload === undefined && location === undefined) {
      // Audio attachment without any content reference; keep looking at older ones.
      continue;
    }

    return {
      ...(payload === undefined ? {} : { data: payload }),
      ...(location === undefined ? {} : { url: location }),
      mime_type: candidate.mimeType,
    };
  }

  return undefined;
}
|
||||||
|
|
||||||
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
|
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
|
||||||
if (typeof rawData !== 'string') {
|
if (typeof rawData !== 'string') {
|
||||||
return undefined;
|
return undefined;
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import type { MemoryStore } from '../../memory/store.js';
|
|||||||
import type { ToolPolicyContext } from '../../tools/policy.js';
|
import type { ToolPolicyContext } from '../../tools/policy.js';
|
||||||
import type { Attachment } from '../../channels/types.js';
|
import type { Attachment } from '../../channels/types.js';
|
||||||
import { NativeAgent } from './agent.js';
|
import { NativeAgent } from './agent.js';
|
||||||
|
import type { NativeAgentTurnAudioInput } from './agent.js';
|
||||||
import type { ToolUseEvent } from './agent.js';
|
import type { ToolUseEvent } from './agent.js';
|
||||||
import type { OutboundAttachmentCollector } from './attachments.js';
|
import type { OutboundAttachmentCollector } from './attachments.js';
|
||||||
import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js';
|
import { estimateMessageTokens, getContextWindow, shouldCompact } from '../../context/tokens.js';
|
||||||
@@ -339,7 +340,11 @@ export class AgentOrchestrator {
|
|||||||
* When compaction is configured, checks whether the conversation history
|
* When compaction is configured, checks whether the conversation history
|
||||||
* exceeds the context window threshold and compacts it before processing.
|
* exceeds the context window threshold and compacts it before processing.
|
||||||
*/
|
*/
|
||||||
async process(userMessage: string, attachments?: Attachment[]): Promise<string> {
|
async process(
|
||||||
|
userMessage: string,
|
||||||
|
attachments?: Attachment[],
|
||||||
|
turnAudioInput?: NativeAgentTurnAudioInput,
|
||||||
|
): Promise<string> {
|
||||||
this._activeRunToolStarts = 0;
|
this._activeRunToolStarts = 0;
|
||||||
this._injectMemoryContext(userMessage);
|
this._injectMemoryContext(userMessage);
|
||||||
await this._runProactiveContextMaintenance();
|
await this._runProactiveContextMaintenance();
|
||||||
@@ -352,10 +357,10 @@ export class AgentOrchestrator {
|
|||||||
|
|
||||||
let result: string;
|
let result: string;
|
||||||
try {
|
try {
|
||||||
result = await this._agent.process(userMessage, attachments);
|
result = await this._agent.process(userMessage, attachments, turnAudioInput);
|
||||||
} catch {
|
} catch {
|
||||||
this._restoreHistory(before);
|
this._restoreHistory(before);
|
||||||
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier);
|
const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
|
||||||
if (escalated) {
|
if (escalated) {
|
||||||
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
|
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
|
||||||
return escalated;
|
return escalated;
|
||||||
@@ -383,7 +388,7 @@ export class AgentOrchestrator {
|
|||||||
if (ctx) {
|
if (ctx) {
|
||||||
// Attempt: compact + hard-trim to fit the discovered context window, then retry once.
|
// Attempt: compact + hard-trim to fit the discovered context window, then retry once.
|
||||||
await this._compactAndTrimToFit(ctx);
|
await this._compactAndTrimToFit(ctx);
|
||||||
const retry = await this._agent.process(userMessage, attachments);
|
const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
|
||||||
if (!this._isToolLoopErrorMessage(retry)) {
|
if (!this._isToolLoopErrorMessage(retry)) {
|
||||||
return retry;
|
return retry;
|
||||||
}
|
}
|
||||||
@@ -391,7 +396,7 @@ export class AgentOrchestrator {
|
|||||||
this._restoreHistory(before);
|
this._restoreHistory(before);
|
||||||
}
|
}
|
||||||
|
|
||||||
const escalated = await this._retryWithEscalation(userMessage, attachments, before, originalTier);
|
const escalated = await this._retryWithEscalation(userMessage, attachments, turnAudioInput, before, originalTier);
|
||||||
if (escalated) {
|
if (escalated) {
|
||||||
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
|
await this._runPostTurnMemoryMaintenance(userMessage, escalated, this._activeRunToolStarts);
|
||||||
return escalated;
|
return escalated;
|
||||||
@@ -419,6 +424,7 @@ export class AgentOrchestrator {
|
|||||||
private async _retryWithEscalation(
|
private async _retryWithEscalation(
|
||||||
userMessage: string,
|
userMessage: string,
|
||||||
attachments: Attachment[] | undefined,
|
attachments: Attachment[] | undefined,
|
||||||
|
turnAudioInput: NativeAgentTurnAudioInput | undefined,
|
||||||
historyBefore: Message[],
|
historyBefore: Message[],
|
||||||
originalTier: ModelTier,
|
originalTier: ModelTier,
|
||||||
): Promise<string | null> {
|
): Promise<string | null> {
|
||||||
@@ -437,7 +443,7 @@ export class AgentOrchestrator {
|
|||||||
|
|
||||||
this._agent.setModelTier(targetTier);
|
this._agent.setModelTier(targetTier);
|
||||||
try {
|
try {
|
||||||
const retry = await this._agent.process(userMessage, attachments);
|
const retry = await this._agent.process(userMessage, attachments, turnAudioInput);
|
||||||
if (!this._isToolLoopErrorMessage(retry)) {
|
if (!this._isToolLoopErrorMessage(retry)) {
|
||||||
return retry;
|
return retry;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -656,7 +656,7 @@ describe('daemon command fast-path integration', () => {
|
|||||||
|
|
||||||
const keys = Array.from(router.agents.keys());
|
const keys = Array.from(router.agents.keys());
|
||||||
expect(keys.some(key => key.includes(':research'))).toBe(true);
|
expect(keys.some(key => key.includes(':research'))).toBe(true);
|
||||||
expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined);
|
expect(processSpy).toHaveBeenCalledWith('compare k0s vs k3s for a homelab', undefined, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('falls back to llm path when confidence is below fast threshold', async () => {
|
it('falls back to llm path when confidence is below fast threshold', async () => {
|
||||||
@@ -1938,6 +1938,6 @@ describe('daemon talk mode (voice wake) integration', () => {
|
|||||||
timestamp: Date.now(),
|
timestamp: Date.now(),
|
||||||
} as MessageRouterInput, reply);
|
} as MessageRouterInput, reply);
|
||||||
expect(processSpy).toHaveBeenCalledOnce();
|
expect(processSpy).toHaveBeenCalledOnce();
|
||||||
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined);
|
expect(processSpy).toHaveBeenCalledWith('what time is it?', undefined, undefined);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
+23
-2
@@ -215,6 +215,26 @@ function persistLatestAudioAttachment(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the per-turn audio tool input from the last usable audio attachment.
 *
 * The list is scanned once from the end towards the start; the first attachment
 * that has a non-empty string `data` or `url` wins. Compared with a
 * copy-reverse-find approach this avoids the extra array allocation and does
 * not need a second, unreachable emptiness check after the search.
 *
 * @param audioAttachments - Attachments already filtered down to supported audio.
 *   The array is only read, never mutated.
 * @returns An object with the populated `data` / `url` fields and the attachment's
 *   `mimeType` under `mime_type`, or `undefined` when no attachment carries content.
 */
function extractLatestAudioToolInput(
  audioAttachments: readonly Attachment[],
): { data?: string; url?: string; mime_type?: string } | undefined {
  for (let index = audioAttachments.length - 1; index >= 0; index--) {
    const attachment = audioAttachments[index];
    if (attachment === undefined || attachment === null) {
      // Tolerate holes in the list instead of throwing on property access.
      continue;
    }

    const data = typeof attachment.data === 'string' && attachment.data.length > 0
      ? attachment.data
      : undefined;
    const url = typeof attachment.url === 'string' && attachment.url.length > 0
      ? attachment.url
      : undefined;

    if (data === undefined && url === undefined) {
      // Nothing to transcribe on this attachment; fall back to an older one.
      continue;
    }

    return {
      ...(data !== undefined ? { data } : {}),
      ...(url !== undefined ? { url } : {}),
      mime_type: attachment.mimeType,
    };
  }

  return undefined;
}
|
||||||
|
|
||||||
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
|
function isTtsEnabledForChannel(config: Config, channel: string): boolean {
|
||||||
if (!config.tts?.enabled) {
|
if (!config.tts?.enabled) {
|
||||||
return false;
|
return false;
|
||||||
@@ -1317,6 +1337,7 @@ export function createMessageRouter(deps: {
|
|||||||
let messageText = incomingText;
|
let messageText = incomingText;
|
||||||
let attachments = msg.attachments;
|
let attachments = msg.attachments;
|
||||||
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
const audioAttachments = (msg.attachments ?? []).filter((a: Attachment) => isSupportedAudio(a));
|
||||||
|
const turnAudioToolInput = extractLatestAudioToolInput(audioAttachments);
|
||||||
if (audioAttachments.length > 0) {
|
if (audioAttachments.length > 0) {
|
||||||
persistLatestAudioAttachment(session, audioAttachments);
|
persistLatestAudioAttachment(session, audioAttachments);
|
||||||
}
|
}
|
||||||
@@ -1424,7 +1445,7 @@ export function createMessageRouter(deps: {
|
|||||||
let response: string;
|
let response: string;
|
||||||
activeRuns.set(sessionIdForRun, agent);
|
activeRuns.set(sessionIdForRun, agent);
|
||||||
try {
|
try {
|
||||||
response = await agent.process(messageText, attachments);
|
response = await agent.process(messageText, attachments, turnAudioToolInput);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const currentTier = agent.getModelTier();
|
const currentTier = agent.getModelTier();
|
||||||
const canEscalate = deps.config.agents.auto_escalate && currentTier !== 'complex';
|
const canEscalate = deps.config.agents.auto_escalate && currentTier !== 'complex';
|
||||||
@@ -1434,7 +1455,7 @@ export function createMessageRouter(deps: {
|
|||||||
|
|
||||||
console.warn(`Auto-escalating session ${msg.channel}:${msg.senderId} from ${currentTier} to complex after processing failure.`);
|
console.warn(`Auto-escalating session ${msg.channel}:${msg.senderId} from ${currentTier} to complex after processing failure.`);
|
||||||
agent.setModelTier('complex');
|
agent.setModelTier('complex');
|
||||||
response = await agent.process(messageText, attachments);
|
response = await agent.process(messageText, attachments, turnAudioToolInput);
|
||||||
}
|
}
|
||||||
const outboundAttachments = collector.drain();
|
const outboundAttachments = collector.drain();
|
||||||
const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
|
const ttsAttachment = await maybeBuildTtsAttachment(response, msg.channel);
|
||||||
|
|||||||
Reference in New Issue
Block a user