// Source: flynn/src/backends/native/agent.ts
// Snapshot: 2026-02-22 23:01:52 -08:00 · 1260 lines · 41 KiB · TypeScript

import type { ModelClient, Message, ChatRequest, ChatResponse, ModelToolCall, TokenUsage } from '../../models/types.js';
import type { ModelRouter, ModelTier } from '../../models/router.js';
import type { Session } from '../../session/index.js';
import type { ToolRegistry } from '../../tools/registry.js';
import type { ToolExecutor } from '../../tools/executor.js';
import type { ToolResult } from '../../tools/types.js';
import type { ToolPolicyContext } from '../../tools/policy.js';
import type { Attachment } from '../../channels/types.js';
import type { OutboundAttachmentCollector } from './attachments.js';
import { buildUserMessage } from '../../models/media.js';
import { getElevationWindow } from '../../security/elevation.js';
import { auditLogger } from '../../audit/index.js';
import { AsyncLocalStorage } from 'node:async_hooks';
/**
 * Notification delivered to `onToolUse` around each tool execution in the tool loop.
 * A `'start'` event is emitted before the tool runs and carries `args`;
 * an `'end'` event is emitted after it returns and carries `result`.
 */
export interface ToolUseEvent {
type: 'start' | 'end';
/** Internal (registry) name of the tool, falling back to the name the model used. */
tool: string;
/** Arguments as requested by the model; set on `'start'` events. */
args?: unknown;
/** Execution result; set on `'end'` events. */
result?: ToolResult;
}
/**
 * Diagnostic summary of the tools visible under the current policy context,
 * as produced by `NativeAgent.getToolInventorySnapshot()`.
 * String fields are `'-'` when the corresponding context value is unavailable.
 */
export interface ToolInventorySnapshot {
sessionId: string;
agent: string;
provider: string;
skill: string;
/** Number of tools returned by the registry's policy-filtered list. */
internalCount: number;
/** Number of tools returned by the registry's policy-filtered API-format list. */
exposedCount: number;
/** Internal tool names that start with `browser.`. */
internalBrowser: string[];
/** Exposed tool names that start with `browser_`. */
exposedBrowser: string[];
}
/** Construction options for {@link NativeAgent}. */
export interface NativeAgentConfig {
/** Direct model client, or a router that selects a client by tier. */
modelClient: ModelClient | ModelRouter;
/** System prompt sent with every model request. */
systemPrompt: string;
/** Optional session used for history and config; without it history is kept in memory. */
session?: Session;
/** Tool registry; the tool loop is used only when both registry and executor are set. */
toolRegistry?: ToolRegistry;
/** Tool executor; the tool loop is used only when both registry and executor are set. */
toolExecutor?: ToolExecutor;
/** Upper bound on tool-loop iterations (defaults to 200). */
maxIterations?: number;
/** Callback invoked before and after each tool execution. */
onToolUse?: (event: ToolUseEvent) => void;
/** Policy context for tool filtering (agent tier, provider). */
toolPolicyContext?: ToolPolicyContext;
/** Collector for outbound attachments queued by tools (e.g. media.send). */
attachmentCollector?: OutboundAttachmentCollector;
/** Hard timeout for each model request in milliseconds. */
modelTimeoutMs?: number;
}
// Internal message type for the tool loop — supports both text and structured content blocks.
// This is broader than Message to accommodate Anthropic's tool_use/tool_result block format.
// Instances are cast to Message[] when handed to the model client.
interface LoopMessage {
role: 'user' | 'assistant';
// Plain text, or an array of content blocks (text / tool_use / tool_result).
content: string | unknown[];
}
// Name and id scraped by regex from assistant text that looks like a `tool_use` JSON
// object but arrived as plain text. Either field is undefined when no match was found.
interface PseudoToolUse {
name?: string;
id?: string;
}
// A tool call parsed out of assistant text, with the character span it occupied
// so the JSON can be removed from the remaining text.
interface ExtractedTextToolCall {
toolCall: ModelToolCall;
// Index of the opening '{' in the source text.
start: number;
// Index one past the closing '}' (exclusive).
end: number;
}
// Session config key under which the most recent audio attachment is stored as JSON
// (read here as an object with optional `data`, `url` and `mimeType` fields).
const LAST_AUDIO_ATTACHMENT_CONFIG_KEY = 'lastAudioAttachment';
// Audio payload in the argument shape used for the `audio.transcribe` tool:
// either inline `data` or a `url`, plus an optional MIME type.
interface AudioToolInput {
data?: string;
url?: string;
mime_type?: string;
}
// Content-free summary of audio tool arguments, used for audit logging so that
// the raw payload itself is not recorded.
interface AudioToolArgSummary {
// True when `data` was a non-empty string.
hasData: boolean;
// True when `url` was a non-empty string.
hasUrl: boolean;
// The `mime_type` argument when it was a non-empty string.
mimeType?: string;
}
/**
 * Audio supplied by the caller for a single `process()` turn.
 * It is ignored unless `data` or `url` is a non-empty string.
 */
export interface NativeAgentTurnAudioInput {
data?: string;
url?: string;
mime_type?: string;
}
// Per-run state stored in AsyncLocalStorage for the duration of one `process()` call.
interface NativeAgentRunContext {
// Normalized audio for the current turn, if any was provided or attached.
turnAudioInput?: AudioToolInput;
}
export class NativeAgent {
// Text substituted when the model returns empty or whitespace-only content.
private static readonly EMPTY_RESPONSE_FALLBACK =
'I could not generate a response for that. Please try again.';
// Per-request model timeout used when the config does not supply a finite value.
private static readonly DEFAULT_MODEL_TIMEOUT_MS = 120_000;
private modelClient: ModelClient | ModelRouter;
private systemPrompt: string;
private session?: Session;
// History store used only when no session is configured.
private inMemoryHistory: Message[] = [];
// Tier passed to the router when the model client is a ModelRouter.
private currentTier: ModelTier = 'default';
private toolRegistry?: ToolRegistry;
private toolExecutor?: ToolExecutor;
private maxIterations: number;
private onToolUse?: (event: ToolUseEvent) => void;
// Token usage accumulated across model calls since the last resetUsage().
private _totalUsage: TokenUsage = { inputTokens: 0, outputTokens: 0 };
// Number of completed model calls since the last resetUsage().
private _callCount: number = 0;
private _toolPolicyContext?: ToolPolicyContext;
private _attachmentCollector?: OutboundAttachmentCollector;
// Set per message by the `!!think` prefix; adds `thinking: true` to model requests.
private _thinking: boolean = false;
// Sorted, comma-joined tool names from the previous tool loop run.
private _lastToolFingerprint?: string;
private _cancelRequested = false;
private _runInProgress = false;
// Abort controller for the run currently in progress; cleared when the run ends.
private _runAbortController?: AbortController;
private modelTimeoutMs: number;
// Carries the current turn's audio input through the async call chain of a run.
private readonly _runContext = new AsyncLocalStorage<NativeAgentRunContext>();
/**
 * Builds an agent from its configuration.
 *
 * `maxIterations` falls back to 200. `modelTimeoutMs` is floored and clamped to
 * at least 1 ms; any non-finite or missing value selects the class default.
 */
constructor(config: NativeAgentConfig) {
  this.modelClient = config.modelClient;
  this.systemPrompt = config.systemPrompt;
  this.session = config.session;
  this.toolRegistry = config.toolRegistry;
  this.toolExecutor = config.toolExecutor;
  this.maxIterations = config.maxIterations ?? 200;
  this.onToolUse = config.onToolUse;
  this._toolPolicyContext = config.toolPolicyContext;
  this._attachmentCollector = config.attachmentCollector;

  const requestedTimeout = config.modelTimeoutMs;
  if (typeof requestedTimeout === 'number' && Number.isFinite(requestedTimeout)) {
    this.modelTimeoutMs = Math.max(1, Math.floor(requestedTimeout));
  } else {
    this.modelTimeoutMs = NativeAgent.DEFAULT_MODEL_TIMEOUT_MS;
  }
}
/**
 * Current conversation history: the session's history when a session is
 * configured, otherwise a shallow copy of the in-memory history.
 */
private get history(): Message[] {
  const fromSession = this.session?.getHistory();
  return fromSession ?? [...this.inMemoryHistory];
}
/**
 * Handles one user turn and returns the assistant's reply.
 *
 * A leading `!!think` enables thinking mode for this message and is stripped
 * from the text. The user message is stored in history, then the turn runs
 * through the tool loop when both a registry and an executor are configured,
 * or through a single model call otherwise.
 *
 * @param userMessage - Raw user text, optionally prefixed with `!!think`.
 * @param attachments - Attachments forwarded to the user message builder.
 * @param turnAudioInput - Explicit audio for this turn; when unusable, the last
 *   audio attachment (if any) is used instead.
 * @returns The assistant text, or `'Operation cancelled by user.'` when the run was aborted.
 * @throws Any non-abort error raised while processing the turn.
 */
async process(
  userMessage: string,
  attachments?: Attachment[],
  turnAudioInput?: NativeAgentTurnAudioInput,
): Promise<string> {
  this._cancelRequested = false;
  this._runAbortController = new AbortController();

  const explicitAudio = this.normalizeTurnAudioInput(turnAudioInput);
  const runAudio = explicitAudio ?? this.extractLatestAudioInputFromAttachments(attachments);

  const client = this.modelClient;
  if ('clearAbort' in client && typeof client.clearAbort === 'function') {
    client.clearAbort();
  }
  this._runInProgress = true;

  const runTurn = async (): Promise<string> => {
    try {
      // Per-message thinking mode: detect the prefix, then strip it from the text.
      const thinkRequested = userMessage === '!!think' || userMessage.startsWith('!!think ');
      this._thinking = thinkRequested;
      const text = thinkRequested
        ? userMessage.replace(/^!!think\s*/, '').trim() || 'Think about this.'
        : userMessage;

      this.addToHistory(buildUserMessage(text, attachments));

      // Without a registry and an executor there is nothing to loop over.
      const toolsConfigured = Boolean(this.toolRegistry && this.toolExecutor);
      return toolsConfigured ? await this.toolLoop() : await this.singleTurn();
    } catch (error) {
      if (!this.isAbortError(error)) {
        throw error;
      }
      const cancelledMsg = 'Operation cancelled by user.';
      this.addToHistory({ role: 'assistant', content: cancelledMsg });
      return cancelledMsg;
    } finally {
      this._runInProgress = false;
      this._cancelRequested = false;
      this._runAbortController = undefined;
    }
  };

  return await this._runContext.run({ turnAudioInput: runAudio }, runTurn);
}
/**
 * Tool-free path: sends the current history to the model once and records the reply.
 *
 * The stored history entry contains only the normalized answer; the returned
 * string is additionally prefixed with a `<thinking>` block when the model
 * produced thinking content.
 */
private async singleTurn(): Promise<string> {
  this.throwIfCancelled();

  const thinkingOption = this._thinking ? { thinking: true } : {};
  const request: ChatRequest = {
    messages: this.history,
    system: this.systemPrompt,
    ...thinkingOption,
  };

  const response = await this.chatWithRouter(request);
  this.throwIfCancelled();

  const { inputTokens, outputTokens } = response.usage;
  this._totalUsage.inputTokens += inputTokens;
  this._totalUsage.outputTokens += outputTokens;
  this._callCount++;

  const answer = this.normalizeAssistantContent(response.content);
  this.addToHistory({ role: 'assistant', content: answer });

  // Thinking content is surfaced to the caller but never persisted.
  return response.thinkingContent
    ? `<thinking>\n${response.thinkingContent}\n</thinking>\n\n${answer}`
    : answer;
}
/**
 * Multi-step model/tool conversation loop.
 *
 * Each iteration sends the accumulated loop messages (plus tool definitions) to the
 * model. When the response contains no tool calls — structured, or recovered from
 * plain-text JSON — the text is stored in history and returned. Otherwise every
 * requested tool is executed, the results are appended as a `tool_result` user
 * message, and the loop continues.
 *
 * Safeguards: a one-time nudge when the model announces an action without emitting a
 * tool call; a nudge after 4 consecutive calls to the same tool set; a hard stop after
 * 3 identical consecutive tool-call fingerprints; and the `maxIterations` cap.
 *
 * Only the final assistant text (or a stop/error/cancel message) is written to history;
 * intermediate tool_use / tool_result blocks live only in the local `loopMessages`.
 *
 * @returns The final text, prefixed with a `<thinking>` block when the model returned
 *   thinking content. Errors raised inside an iteration are not rethrown: they are
 *   converted into a message that is stored in history and returned.
 * @throws Error when the tool registry or executor is missing.
 */
private async toolLoop(): Promise<string> {
const toolRegistry = this.toolRegistry;
const toolExecutor = this.toolExecutor;
if (!toolRegistry || !toolExecutor) {
throw new Error('Tool loop requires tool registry and executor');
}
const tools = toolRegistry.filteredToAnthropicFormat(this._toolPolicyContext);
// Track whether untrusted content (web/fetched/tool output) has been introduced
// during this run. Used to harden against prompt injection.
let untrustedContentSeen = false;
// Detect tool inventory changes to combat conversational inertia in long sessions.
// When tools change (e.g. new tools added between restarts), the model's prior messages
// saying "I can't do that" can override tool definitions. Injecting a system note fixes this.
const currentFingerprint = tools.map(t => t.name).sort().join(',');
const hasHistory = this.history.length > 1; // more than just the current user message
let effectiveSystem = this.systemPrompt;
if (hasHistory && this._lastToolFingerprint !== currentFingerprint) {
const toolNames = tools.map(t => t.name).join(', ');
effectiveSystem += `\n\n[Tool inventory updated — available tools: ${toolNames}. Use these tools directly; do not attempt workarounds for functionality that a tool already provides.]`;
}
this._lastToolFingerprint = currentFingerprint;
// Build the loop messages from existing history.
// These are the messages sent to the model, including any structured tool blocks.
const loopMessages: LoopMessage[] = this.history.map(m => ({
role: m.role,
content: m.content,
}));
// Track consecutive identical tool call fingerprints to detect loops.
// Local LLMs are especially prone to repeatedly requesting the same tool call.
let lastFingerprint: string | undefined;
let consecutiveRepeats = 0;
const maxConsecutiveRepeats = 3;
let lastToolResults: string[] = [];
// Track consecutive calls to the same tool (even with different args).
// Local models often call the same tool with slight query variations.
let lastToolName: string | undefined;
let sameToolStreak = 0;
const maxSameToolStreak = 4; // nudge after 4 calls to the same tool
let nudged = false;
let actionIntentNudged = false;
for (let iteration = 0; iteration < this.maxIterations; iteration++) {
try {
this.throwIfCancelled();
// Build request — cast loopMessages to Message[] because the underlying
// model client will pass them through to the API which accepts structured content.
const request = {
messages: loopMessages as unknown as Message[],
system: effectiveSystem,
tools,
...(this._thinking ? { thinking: true } : {}),
};
const response = await this.chatWithRouter(request);
this.throwIfCancelled();
this._totalUsage.inputTokens += response.usage.inputTokens;
this._totalUsage.outputTokens += response.usage.outputTokens;
this._callCount++;
// Some backends emit tool_use JSON as plain text rather than structured tool metadata.
// Recover those calls when possible so the loop can continue safely.
let toolCalls = response.toolCalls ?? [];
let assistantTextContent = response.content;
if (toolCalls.length === 0) {
const extracted = this.extractToolCallsFromText(response.content);
if (extracted && extracted.toolCalls.length > 0) {
// Only accept recovered calls whose name resolves to a registered tool.
const executableCalls = extracted.toolCalls.filter(tc => Boolean(toolRegistry.getByApiName(tc.name)));
if (executableCalls.length > 0) {
toolCalls = executableCalls;
assistantTextContent = extracted.remainingText;
}
}
}
// Second recovery attempt: shell tool calls whose JSON is malformed (unescaped quotes).
if (toolCalls.length === 0) {
const recovered = this.extractMalformedShellToolCall(response.content);
if (recovered && toolRegistry.getByApiName(recovered.toolCall.name)) {
toolCalls = [recovered.toolCall];
assistantTextContent = recovered.remainingText;
}
}
const wantsToolUse = toolCalls.length > 0;
if (!wantsToolUse) {
const pseudoToolUse = this.extractPseudoToolUse(response.content);
// At most once per run: push the model to actually emit the tool call it announced.
if (this.shouldNudgeForMissingToolCall(response.content, pseudoToolUse) && !actionIntentNudged) {
actionIntentNudged = true;
const normalized = this.normalizeAssistantContent(response.content);
loopMessages.push({ role: 'assistant', content: normalized });
loopMessages.push({
role: 'user',
content: 'You said you would perform an action now, but no tool call was emitted. If a tool is needed, call it now. If blocked, explain the exact blocker.',
});
continue;
}
// Final answer. Text that still looks like an unexecuted tool call is wrapped in a warning.
const baseContent = pseudoToolUse
? this.buildPseudoToolUseWarning(response.content, pseudoToolUse)
: this.normalizeAssistantContent(response.content);
let finalContent = baseContent;
if (response.thinkingContent) {
finalContent = `<thinking>\n${response.thinkingContent}\n</thinking>\n\n${baseContent}`;
}
// History stores the answer without the thinking block.
const assistantMsg: Message = { role: 'assistant', content: baseContent };
this.addToHistory(assistantMsg);
return finalContent;
}
// Check for repeated tool calls — build a fingerprint from tool names + args
const fingerprint = toolCalls
.map(tc => `${tc.name}:${JSON.stringify(tc.args)}`)
.sort()
.join('|');
if (fingerprint === lastFingerprint) {
consecutiveRepeats++;
} else {
consecutiveRepeats = 1;
lastFingerprint = fingerprint;
}
// Track consecutive calls to the same tool (by name, ignoring args)
const toolNames = toolCalls.map(tc => tc.name).sort().join(',');
if (toolNames === lastToolName) {
sameToolStreak++;
} else {
sameToolStreak = 1;
lastToolName = toolNames;
nudged = false;
}
// Build the assistant message with tool_use content blocks
const assistantContent: unknown[] = [];
if (assistantTextContent) {
assistantContent.push({ type: 'text', text: assistantTextContent });
}
for (const tc of toolCalls) {
assistantContent.push({
type: 'tool_use',
id: tc.id,
name: tc.name,
input: tc.args,
});
}
loopMessages.push({ role: 'assistant', content: assistantContent });
// Execute each tool call and collect results
const toolResultBlocks: unknown[] = [];
lastToolResults = [];
for (const tc of toolCalls) {
this.throwIfCancelled();
// Map the API-facing name back to the registry's internal name when known.
const internalName = toolRegistry.getByApiName(tc.name)?.name ?? tc.name;
this.onToolUse?.({ type: 'start', tool: internalName, args: tc.args });
let elevationUntilMs: number | undefined;
let elevationReason: string | undefined;
let elevationId: string | undefined;
if (this.session) {
// Look up the elevation window through a session-config-backed store.
const elevation = getElevationWindow({
get: (key) => this.session!.getConfig(key),
set: (key, value) => this.session!.setConfig(key, value),
delete: (key) => this.session!.deleteConfig(key),
}, {
auditContext: {
sessionId: this.session.id,
channel: this._toolPolicyContext?.channel ?? 'unknown',
sender: this._toolPolicyContext?.sender ?? 'unknown',
},
});
if (elevation.window) {
elevationUntilMs = elevation.window.untilMs;
elevationId = elevation.window.id;
elevationReason = elevation.window.reason;
}
}
// Per-call policy context: base context plus untrusted-content flag and elevation details.
const perCallContext: ToolPolicyContext | undefined = this._toolPolicyContext
? {
...this._toolPolicyContext,
untrustedContent: untrustedContentSeen,
elevatedHostUntilMs: elevationUntilMs,
elevatedHostReason: elevationReason,
elevatedHostId: elevationId,
}
: undefined;
const toolArgs = this.normalizeToolArgsForExecution(internalName, tc.args);
const result = await toolExecutor.execute(internalName, toolArgs, perCallContext, {
signal: this._runAbortController?.signal,
});
this.onToolUse?.({ type: 'end', tool: internalName, result });
// Output of web/browser content tools is labelled as fetched (untrusted) content.
const provenance = (internalName === 'web.fetch' || internalName === 'web.search' || internalName === 'browser.content')
? 'fetched_content'
: 'tool_output';
if (provenance === 'fetched_content') {
untrustedContentSeen = true;
}
const rawContent = result.success ? result.output : (result.error ?? 'Unknown error');
const resultContent = `[provenance=${provenance} tool=${internalName} untrusted=${provenance === 'fetched_content' ? 'true' : 'false'}]\n${rawContent}\n[/provenance]`;
toolResultBlocks.push({
type: 'tool_result',
tool_use_id: tc.id,
content: resultContent,
is_error: !result.success,
});
if (result.success && result.output) {
lastToolResults.push(result.output);
}
}
// If the same tool has been called too many times, append a nudge
// telling the model to use what it has. This combats local models
// that endlessly retry searches with slight query variations.
let nudgeMessage: string | null = null;
if (sameToolStreak >= maxSameToolStreak && !nudged) {
nudged = true;
nudgeMessage = `You have called this tool ${sameToolStreak} times in a row. You have enough information — do NOT call it again. Summarize what you have found and respond to the user now.`;
}
// Add tool results as a user message
loopMessages.push({ role: 'user', content: toolResultBlocks });
if (nudgeMessage) {
loopMessages.push({ role: 'user', content: nudgeMessage });
}
// Break out if the model is stuck in a repeated tool call loop
if (consecutiveRepeats >= maxConsecutiveRepeats) {
const toolOutput = lastToolResults.length > 0
? lastToolResults.join('\n\n')
: 'No results available.';
const breakMsg = `Tool loop detected (same tool called ${consecutiveRepeats} times). Returning last results:\n\n${toolOutput}`;
const assistantMsg: Message = { role: 'assistant', content: breakMsg };
this.addToHistory(assistantMsg);
return breakMsg;
}
} catch (error) {
if (this.isAbortError(error)) {
const cancelledMsg = 'Operation cancelled by user.';
const assistantMsg: Message = { role: 'assistant', content: cancelledMsg };
this.addToHistory(assistantMsg);
return cancelledMsg;
}
// Any other failure ends the loop with a message instead of an exception.
const errorMsg = `Error in tool loop (iteration ${iteration + 1}): ${error instanceof Error ? error.message : String(error)}`;
const assistantMsg: Message = { role: 'assistant', content: errorMsg };
this.addToHistory(assistantMsg);
return errorMsg;
}
}
// Max iterations reached
const warningMsg = `Stopped after reaching max iterations (${this.maxIterations}). The task may be incomplete.`;
const assistantMsg: Message = { role: 'assistant', content: warningMsg };
this.addToHistory(assistantMsg);
return warningMsg;
}
/**
 * Heuristic: does the assistant text announce an imminent action ("I'll run…",
 * "let me check…") without any tool call having been emitted?
 *
 * @param content - Assistant text of the response.
 * @param pseudoToolUse - Non-null when the text already looks like a plain-text
 *   tool call; that case is handled separately, so no nudge is requested.
 * @returns True only when the text contains both an intent phrase and an action verb.
 */
private shouldNudgeForMissingToolCall(content: string, pseudoToolUse: PseudoToolUse | null): boolean {
  if (pseudoToolUse || !content) {
    return false;
  }
  const lowered = content.toLowerCase();
  const statesIntent = /\b(i(?:'m| am)? going to|i(?:'ll| will)|let me|proceeding(?: now)?|i can(?: now)?)\b/.test(lowered);
  return statesIntent
    && /\b(create|run|execute|call|use|check|fetch|search|read|write|send|retry|proceed|attempt|apply|delete|update|list)\b/.test(lowered);
}
/**
 * Sends one chat request through the configured client (or router, using the
 * current tier) and enforces cancellation and the per-request timeout.
 *
 * The request is raced against a timeout and against the caller's cancellation
 * signal (the run-level abort controller and/or `request.signal`). Unlike a bare
 * race, a timeout also aborts the in-flight request through a dedicated abort
 * controller, so a timed-out call does not keep running in the background.
 *
 * @param request - Chat request; its optional `signal` is combined with the run signal.
 * @returns The model response.
 * @throws Error named `AbortError` when cancelled (including when already
 *   cancelled on entry, in which case no request is sent).
 * @throws Error named `TimeoutError` when `modelTimeoutMs` elapses first.
 */
private async chatWithRouter(request: ChatRequest): Promise<ChatResponse> {
  const createAbortError = (): Error => {
    const error = new Error('Operation cancelled by user.');
    error.name = 'AbortError';
    return error;
  };

  const runSignal = this._runAbortController?.signal;
  const requestSignal = request.signal;
  // Cancellation requested by the caller: run-level cancel and/or per-request signal.
  const cancelSignal = runSignal && requestSignal
    ? AbortSignal.any([runSignal, requestSignal])
    : (runSignal ?? requestSignal);

  // Do not start a request that is already cancelled.
  if (cancelSignal?.aborted) {
    throw createAbortError();
  }

  // Dedicated controller so a timeout can tear down this request without cancelling the whole run.
  const timeoutController = new AbortController();
  const signal = cancelSignal
    ? AbortSignal.any([cancelSignal, timeoutController.signal])
    : timeoutController.signal;
  const requestWithSignal = { ...request, signal };

  const requestPromise = 'getClient' in this.modelClient
    ? (this.modelClient as ModelRouter).chat(requestWithSignal, this.currentTier)
    : this.modelClient.chat(requestWithSignal);

  let timer: NodeJS.Timeout | undefined;
  let abortCleanup: (() => void) | undefined;

  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      const error = new Error(`Model request timed out after ${this.modelTimeoutMs}ms`);
      error.name = 'TimeoutError';
      // Reject first so the race settles with TimeoutError, then stop the in-flight request.
      reject(error);
      timeoutController.abort();
    }, this.modelTimeoutMs);
    // Do not let a pending timeout keep the process alive.
    timer.unref?.();
  });

  // Listens only to caller cancellation, so a timeout is never reported as a cancel.
  const abortPromise = cancelSignal
    ? new Promise<never>((_, reject) => {
      const onAbort = () => reject(createAbortError());
      cancelSignal.addEventListener('abort', onAbort, { once: true });
      abortCleanup = () => cancelSignal.removeEventListener('abort', onAbort);
    })
    : null;

  try {
    return await Promise.race([requestPromise, timeoutPromise, ...(abortPromise ? [abortPromise] : [])]);
  } finally {
    if (timer) {
      clearTimeout(timer);
    }
    abortCleanup?.();
  }
}
/** Appends a message to the session history, or to the in-memory history when no session is set. */
private addToHistory(msg: Message): void {
  const session = this.session;
  if (!session) {
    this.inMemoryHistory.push(msg);
    return;
  }
  session.addMessage(msg);
}
/**
 * Clears the conversation (session or in-memory), forgets the last tool
 * fingerprint, and zeroes the usage counters.
 */
reset(): void {
  const session = this.session;
  if (session) {
    session.clear();
  } else {
    this.inMemoryHistory = [];
  }
  this._lastToolFingerprint = undefined;
  this.resetUsage();
}
/** Returns a copy of the accumulated token usage together with the number of model calls. */
getUsage(): { inputTokens: number; outputTokens: number; calls: number } {
  const usageCopy = { ...this._totalUsage };
  return Object.assign(usageCopy, { calls: this._callCount });
}
/** Zeroes the accumulated token usage and the model call counter. */
resetUsage(): void {
this._totalUsage = { inputTokens: 0, outputTokens: 0 };
this._callCount = 0;
}
/** Returns a shallow copy of the current conversation history. */
getHistory(): Message[] {
return [...this.history];
}
/** Sets the tier passed to the router on subsequent model requests. */
setModelTier(tier: ModelTier): void {
this.currentTier = tier;
}
/** Returns the tier currently used for routed model requests. */
getModelTier(): ModelTier {
return this.currentTier;
}
/** Replaces the system prompt used for subsequent model requests. */
setSystemPrompt(prompt: string): void {
this.systemPrompt = prompt;
}
/** Sets or clears (with `undefined`) the callback notified before and after each tool execution. */
setOnToolUse(callback: ((event: ToolUseEvent) => void) | undefined): void {
this.onToolUse = callback;
}
/** Sets or clears the policy context used for tool filtering and per-call execution context. */
setToolPolicyContext(context: ToolPolicyContext | undefined): void {
this._toolPolicyContext = context;
}
/** Returns the current tool policy context, if any. */
getToolPolicyContext(): ToolPolicyContext | undefined {
return this._toolPolicyContext;
}
/**
 * Summarizes which tools are visible under the current policy context.
 *
 * Without a registry every string field is `'-'` and all counts/lists are empty.
 * Otherwise counts come from the registry's filtered internal list and filtered
 * API-format list, and the browser lists hold the names starting with
 * `browser.` (internal) and `browser_` (exposed).
 */
getToolInventorySnapshot(): ToolInventorySnapshot {
  const registry = this.toolRegistry;
  if (!registry) {
    return {
      sessionId: '-',
      agent: '-',
      provider: '-',
      skill: '-',
      internalCount: 0,
      exposedCount: 0,
      internalBrowser: [],
      exposedBrowser: [],
    };
  }

  const internalNames = registry.filteredList(this._toolPolicyContext).map(({ name }) => name);
  const exposedNames = registry.filteredToAnthropicFormat(this._toolPolicyContext).map(({ name }) => name);
  const policy = this._toolPolicyContext;

  const snapshot: ToolInventorySnapshot = {
    sessionId: policy?.sessionId ?? '-',
    agent: policy?.agent ?? '-',
    provider: policy?.provider ?? '-',
    skill: policy?.skillName ?? '-',
    internalCount: internalNames.length,
    exposedCount: exposedNames.length,
    internalBrowser: internalNames.filter((toolName) => toolName.startsWith('browser.')),
    exposedBrowser: exposedNames.filter((toolName) => toolName.startsWith('browser_')),
  };
  return snapshot;
}
/** Returns the sorted internal names of all tools allowed by the current policy context (empty without a registry). */
getAvailableToolNames(): string[] {
  const registry = this.toolRegistry;
  if (!registry) {
    return [];
  }
  const names = registry.filteredList(this._toolPolicyContext).map(({ name }) => name);
  names.sort();
  return names;
}
/** Sets or clears the collector for outbound attachments. */
setAttachmentCollector(collector: OutboundAttachmentCollector | undefined): void {
this._attachmentCollector = collector;
}
/** Returns the current outbound attachment collector, if any. */
getAttachmentCollector(): OutboundAttachmentCollector | undefined {
return this._attachmentCollector;
}
/**
 * Requests cancellation of the run in progress; a no-op when idle.
 * Sets the cancel flag, aborts the run's abort controller, and forwards the
 * request to the model client when it exposes `requestAbort()`.
 */
cancel(): void {
  if (!this._runInProgress) {
    return;
  }
  this._cancelRequested = true;
  this._runAbortController?.abort();

  const client = this.modelClient;
  if ('requestAbort' in client && typeof client.requestAbort === 'function') {
    client.requestAbort();
  }
}
/** True while a `process()` run is in progress, i.e. when `cancel()` would have an effect. */
isCancellable(): boolean {
return this._runInProgress;
}
/**
 * Cooperative cancellation checkpoint.
 * @throws Error named `AbortError` when cancellation was requested or the run signal is aborted.
 */
private throwIfCancelled(): void {
  const cancelled = this._cancelRequested || this._runAbortController?.signal.aborted;
  if (cancelled) {
    throw Object.assign(new Error('Operation cancelled by user.'), { name: 'AbortError' });
  }
}
/** True when `error` is an `Error` instance whose `name` is `'AbortError'`. */
private isAbortError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  return error.name === 'AbortError';
}
/**
 * Adjusts model-supplied arguments just before execution.
 * Only `audio.transcribe` is rewritten; every other tool gets its arguments untouched.
 */
private normalizeToolArgsForExecution(toolName: string, rawArgs: unknown): unknown {
  return toolName === 'audio.transcribe'
    ? this.hydrateAudioTranscribeArgs(rawArgs)
    : rawArgs;
}
/**
 * Resolves the arguments actually passed to `audio.transcribe`, preferring audio the
 * agent already knows about over whatever the model supplied.
 *
 * Precedence:
 *  1. audio captured for the current run (AsyncLocalStorage store);
 *  2. inline audio on the most recent user message in history;
 *  3. persisted session audio, but only when the latest user message contains
 *     `[Voice message]:`;
 *  4. the model's own arguments — `data` if it passes base64/signature validation,
 *     else `url` if it is http(s); a missing `mime_type` for `data` is filled in
 *     from the latest known audio;
 *  5. the latest audio found in user history, else the persisted audio.
 *
 * Rewrites in cases 1–3 and 5 are reported through `logAudioArgsRewrite`.
 *
 * @param rawArgs - Arguments as emitted by the model; non-objects are treated as `{}`.
 * @returns A shallow copy of the arguments with `data`/`url`/`mime_type` resolved.
 *   When nothing usable is found, invalid `data`/`url` values are removed.
 */
private hydrateAudioTranscribeArgs(rawArgs: unknown): unknown {
// Work on a shallow copy so the model's original arguments are not mutated.
const args = (rawArgs && typeof rawArgs === 'object')
? { ...(rawArgs as Record<string, unknown>) }
: {};
// Captured before any rewrite so the audit entry describes what the model sent.
const original = this.summarizeAudioToolArgs(args);
const runTurnAudioInput = this._runContext.getStore()?.turnAudioInput;
if (runTurnAudioInput) {
this.applyAudioToolInput(args, runTurnAudioInput);
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
return args;
}
const latestTurnAudio = this.getLatestTurnUserAudioInput();
if (latestTurnAudio) {
this.applyAudioToolInput(args, latestTurnAudio);
this.logAudioArgsRewrite('latest_audio_preferred', 'latest_turn', original, args);
return args;
}
if (this.isCurrentTurnVoiceTranscriptFallback()) {
const persistedAudio = this.getPersistedAudioInput();
if (persistedAudio) {
this.applyAudioToolInput(args, persistedAudio);
this.logAudioArgsRewrite('voice_turn_fallback', 'persisted', original, args);
return args;
}
}
// No known audio for this turn: validate what the model supplied. Valid data wins over a URL.
const normalizedData = this.normalizeAudioTranscribeDataArg(args.data, args.mime_type);
const normalizedUrl = this.normalizeAudioTranscribeUrlArg(args.url);
if (normalizedData) {
args.data = normalizedData;
delete args.url;
} else if (normalizedUrl) {
args.url = normalizedUrl;
delete args.data;
} else {
delete args.data;
delete args.url;
}
const hasData = typeof args.data === 'string' && args.data.length > 0;
const hasUrl = typeof args.url === 'string' && args.url.length > 0;
if (hasData || hasUrl) {
if (hasData && (typeof args.mime_type !== 'string' || args.mime_type.length === 0)) {
const latestAudioForMime = this.getLatestUserAudioInput();
if (latestAudioForMime?.mime_type) {
args.mime_type = latestAudioForMime.mime_type;
}
}
return args;
}
// The model's arguments were missing or invalid: fall back to the last audio we know of.
const latestAudio = this.getLatestUserAudioInput();
if (!latestAudio) {
return args;
}
// Label the source as 'persisted' when the fallback matches the persisted attachment.
const persistedAudio = this.getPersistedAudioInput();
const source: 'history' | 'persisted' = persistedAudio?.data === latestAudio.data
&& persistedAudio?.mime_type === latestAudio.mime_type
? 'persisted'
: 'history';
this.applyAudioToolInput(args, latestAudio);
this.logAudioArgsRewrite(original.hasData || original.hasUrl ? 'invalid_model_args' : 'missing_model_args', source, original, args);
return args;
}
/** Reduces audio tool arguments to presence flags plus MIME type, without keeping the payload itself. */
private summarizeAudioToolArgs(args: Record<string, unknown>): AudioToolArgSummary {
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;

  const mimeCandidate = args.mime_type;
  return {
    hasData: isNonEmptyString(args.data),
    hasUrl: isNonEmptyString(args.url),
    mimeType: isNonEmptyString(mimeCandidate) ? mimeCandidate : undefined,
  };
}
/**
 * Overwrites the audio source in `args` (in place) with the given audio.
 * Inline `data` takes precedence over `url`; whichever is not used is removed,
 * and both are removed when the audio has neither. `mime_type` is only
 * overwritten when the audio carries one.
 */
private applyAudioToolInput(args: Record<string, unknown>, audio: AudioToolInput): void {
  const { data, url, mime_type: mimeType } = audio;

  if (data) {
    args.data = data;
    delete args.url;
  } else {
    delete args.data;
    if (url) {
      args.url = url;
    } else {
      delete args.url;
    }
  }

  if (mimeType) {
    args.mime_type = mimeType;
  }
}
/**
 * Records an audit entry describing why and from where `audio.transcribe`
 * arguments were rewritten. Only presence flags and MIME types are logged.
 *
 * @param reason - Why the rewrite happened.
 * @param source - Where the replacement audio came from.
 * @param original - Summary of the model-supplied arguments before the rewrite.
 * @param normalizedArgs - Arguments after the rewrite (used for the final MIME type).
 */
private logAudioArgsRewrite(
  reason: 'latest_audio_preferred' | 'voice_turn_fallback' | 'invalid_model_args' | 'missing_model_args',
  source: 'latest_turn' | 'history' | 'persisted',
  original: AudioToolArgSummary,
  normalizedArgs: Record<string, unknown>,
): void {
  const rewrittenMime = normalizedArgs.mime_type;
  let finalMime: string | undefined;
  if (typeof rewrittenMime === 'string' && rewrittenMime.length > 0) {
    finalMime = rewrittenMime;
  }

  auditLogger?.toolArgsRewritten({
    tool_name: 'audio.transcribe',
    session_id: this.session?.id,
    source,
    reason,
    original_has_data: original.hasData,
    original_has_url: original.hasUrl,
    original_mime_type: original.mimeType,
    final_mime_type: finalMime,
  });
}
/**
 * Reports whether the most recent user message is a voice-transcript fallback,
 * i.e. its text (a plain string, or any text part of structured content)
 * contains the marker `[Voice message]:`.
 *
 * Only the latest user message is inspected; older ones are never consulted.
 *
 * @returns False when there is no user message or its content is neither a string nor an array.
 */
private isCurrentTurnVoiceTranscriptFallback(): boolean {
  // Take one snapshot: the `history` getter copies or re-fetches on every access,
  // so reading it inside the loop would make this scan quadratic.
  const history = this.history;
  for (let i = history.length - 1; i >= 0; i--) {
    const msg = history[i];
    if (msg.role !== 'user') {
      continue;
    }
    if (typeof msg.content === 'string') {
      return msg.content.includes('[Voice message]:');
    }
    if (!Array.isArray(msg.content)) {
      return false;
    }
    return msg.content.some((part) => (
      part.type === 'text'
      && typeof part.text === 'string'
      && part.text.includes('[Voice message]:')
    ));
  }
  return false;
}
/**
 * Returns the inline audio attached to the most recent user message, if any.
 *
 * Only that latest user message is inspected. The first audio part with a
 * non-empty `data` and `media_type` wins.
 *
 * @returns The audio as tool input, or null when the latest user message has
 *   plain-string content, has no usable audio part, or no user message exists.
 */
private getLatestTurnUserAudioInput(): AudioToolInput | null {
  // Take one snapshot: the `history` getter copies or re-fetches on every access,
  // so reading it inside the loop would make this scan quadratic.
  const history = this.history;
  for (let i = history.length - 1; i >= 0; i--) {
    const msg = history[i];
    if (msg.role !== 'user') {
      continue;
    }
    if (!Array.isArray(msg.content)) {
      return null;
    }
    for (const part of msg.content) {
      if (part.type !== 'audio') {
        continue;
      }
      const source = part.source;
      if (typeof source.data === 'string' && source.data.length > 0 && typeof source.media_type === 'string' && source.media_type.length > 0) {
        return { data: source.data, mime_type: source.media_type };
      }
    }
    return null;
  }
  return null;
}
/**
 * Validates caller-supplied turn audio.
 * Keeps only non-empty string fields.
 *
 * @returns The cleaned input, or undefined when neither `data` nor `url` is usable.
 */
private normalizeTurnAudioInput(turnAudioInput: NativeAgentTurnAudioInput | undefined): AudioToolInput | undefined {
  if (!turnAudioInput) {
    return undefined;
  }
  const nonEmpty = (value: unknown): string | undefined =>
    (typeof value === 'string' && value.length > 0 ? value : undefined);

  const data = nonEmpty(turnAudioInput.data);
  const url = nonEmpty(turnAudioInput.url);
  const mimeType = nonEmpty(turnAudioInput.mime_type);
  if (!data && !url) {
    return undefined;
  }

  const cleaned: AudioToolInput = {};
  if (data) {
    cleaned.data = data;
  }
  if (url) {
    cleaned.url = url;
  }
  if (mimeType) {
    cleaned.mime_type = mimeType;
  }
  return cleaned;
}
/**
 * Picks the last attachment whose MIME type starts with `audio/` and that
 * carries a non-empty string `data` or `url`.
 *
 * @returns That attachment as tool input (always with `mime_type`), or undefined when none qualifies.
 */
private extractLatestAudioInputFromAttachments(attachments?: Attachment[]): AudioToolInput | undefined {
  if (!attachments || attachments.length === 0) {
    return undefined;
  }
  const nonEmpty = (value: unknown): string | undefined =>
    (typeof value === 'string' && value.length > 0 ? value : undefined);

  // Walk backwards so the most recently attached audio wins.
  let idx = attachments.length;
  while (idx-- > 0) {
    const attachment = attachments[idx];
    if (!attachment.mimeType.startsWith('audio/')) {
      continue;
    }
    const data = nonEmpty(attachment.data);
    const url = nonEmpty(attachment.url);
    if (!data && !url) {
      continue;
    }
    const found: AudioToolInput = {};
    if (data) {
      found.data = data;
    }
    if (url) {
      found.url = url;
    }
    found.mime_type = attachment.mimeType;
    return found;
  }
  return undefined;
}
/**
 * Validates a model-supplied `data` argument as base64-encoded audio.
 *
 * Whitespace is stripped; the remainder must consist of standard base64
 * characters only, decode to at least one byte, and — when a MIME type string
 * is given — pass the signature check for that type.
 *
 * @returns The whitespace-free base64 string, or undefined when validation fails.
 */
private normalizeAudioTranscribeDataArg(rawData: unknown, rawMimeType: unknown): string | undefined {
  if (typeof rawData !== 'string') {
    return undefined;
  }
  const compact = rawData.replace(/\s+/g, '');
  const looksLikeBase64 = compact.length > 0 && /^[A-Za-z0-9+/=]+$/.test(compact);
  if (!looksLikeBase64) {
    return undefined;
  }
  try {
    const decoded = Buffer.from(compact, 'base64');
    if (decoded.length === 0) {
      return undefined;
    }
    const mimeType = typeof rawMimeType === 'string' ? rawMimeType : undefined;
    return this.matchesAudioSignature(decoded, mimeType) ? compact : undefined;
  } catch {
    return undefined;
  }
}
/**
 * Checks the leading bytes of decoded audio against the magic number expected
 * for the given MIME type.
 *
 * Recognized types: ogg (`OggS`), wav (`RIFF`…`WAVE`), webm (EBML header
 * `1A 45 DF A3`), mpeg/mp3 (`ID3` tag or an `FF Ex` frame sync), mp4/x-m4a
 * (`ftyp` at offset 4). A missing or unrecognized MIME type is accepted.
 */
private matchesAudioSignature(buffer: Buffer, mimeType?: string): boolean {
  if (!mimeType) {
    return true;
  }

  const hasMagicAt = (offset: number, magic: string): boolean => {
    const stop = offset + magic.length;
    return buffer.length >= stop && buffer.toString('ascii', offset, stop) === magic;
  };

  if (mimeType === 'audio/ogg') {
    return hasMagicAt(0, 'OggS');
  }
  if (mimeType === 'audio/wav') {
    return hasMagicAt(0, 'RIFF') && hasMagicAt(8, 'WAVE');
  }
  if (mimeType === 'audio/webm') {
    return buffer.length >= 4 && buffer.readUInt32BE(0) === 0x1A45DFA3;
  }
  if (mimeType === 'audio/mpeg' || mimeType === 'audio/mp3') {
    if (hasMagicAt(0, 'ID3')) {
      return true;
    }
    return buffer.length >= 2 && buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0;
  }
  if (mimeType === 'audio/mp4' || mimeType === 'audio/x-m4a') {
    return hasMagicAt(4, 'ftyp');
  }
  return true;
}
/**
 * Validates a model-supplied `url` argument.
 * @returns The trimmed URL when it is a non-empty string starting with `http://` or `https://` (case-insensitive), else undefined.
 */
private normalizeAudioTranscribeUrlArg(rawUrl: unknown): string | undefined {
  const candidate = typeof rawUrl === 'string' ? rawUrl.trim() : '';
  const isHttpUrl = candidate.length > 0 && /^https?:\/\//i.test(candidate);
  return isHttpUrl ? candidate : undefined;
}
/**
 * Finds the most recent inline audio anywhere in the user messages of the
 * history, scanning from newest to oldest. User messages without structured
 * content are skipped.
 *
 * @returns The first audio part with non-empty `data` and `media_type`;
 *   otherwise whatever `getPersistedAudioInput()` returns (possibly null).
 */
private getLatestUserAudioInput(): AudioToolInput | null {
  // Take one snapshot: the `history` getter copies or re-fetches on every access,
  // so reading it inside the loop would make this scan quadratic.
  const history = this.history;
  for (let i = history.length - 1; i >= 0; i--) {
    const msg = history[i];
    if (msg.role !== 'user' || !Array.isArray(msg.content)) {
      continue;
    }
    for (const part of msg.content) {
      if (part.type !== 'audio') {
        continue;
      }
      const source = part.source;
      if (typeof source.data === 'string' && source.data.length > 0 && typeof source.media_type === 'string' && source.media_type.length > 0) {
        return { data: source.data, mime_type: source.media_type };
      }
    }
  }
  return this.getPersistedAudioInput();
}
/**
 * Reads the last audio attachment persisted in the session config.
 *
 * The stored value is parsed as JSON with optional `data`, `url` and `mimeType`
 * fields; only non-empty strings are kept.
 *
 * @returns The audio as tool input, or null when there is no session value, the
 *   JSON is unusable, or neither `data` nor `url` is present.
 */
private getPersistedAudioInput(): AudioToolInput | null {
  const stored = this.session?.getConfig(LAST_AUDIO_ATTACHMENT_CONFIG_KEY);
  if (!stored) {
    return null;
  }
  const nonEmpty = (value: unknown): string | undefined =>
    (typeof value === 'string' && value.length > 0 ? value : undefined);

  try {
    // Field access stays inside the try: a JSON `null` payload must also yield null.
    const parsed = JSON.parse(stored) as { data?: unknown; url?: unknown; mimeType?: unknown };
    const data = nonEmpty(parsed.data);
    const url = nonEmpty(parsed.url);
    const mimeType = nonEmpty(parsed.mimeType);
    if (!data && !url) {
      return null;
    }
    const audio: AudioToolInput = {};
    if (data) {
      audio.data = data;
    }
    if (url) {
      audio.url = url;
    }
    if (mimeType) {
      audio.mime_type = mimeType;
    }
    return audio;
  } catch {
    return null;
  }
}
/**
 * Detects assistant text that contains a `"type": "tool_use"` marker and scrapes
 * the first `"name"` and `"id"` string values found anywhere in the text.
 *
 * @returns Null when the marker is absent; otherwise the scraped values (each possibly undefined).
 */
private extractPseudoToolUse(content: string): PseudoToolUse | null {
  if (!content || !/"type"\s*:\s*"tool_use"/.test(content)) {
    return null;
  }
  const firstCapture = (pattern: RegExp): string | undefined => content.match(pattern)?.[1];
  return {
    name: firstCapture(/"name"\s*:\s*"([^"]+)"/),
    id: firstCapture(/"id"\s*:\s*"([^"]+)"/),
  };
}
/**
 * Recovers tool calls that a backend emitted as JSON inside plain text.
 *
 * Scans for balanced top-level `{…}` objects, keeps those that parse as
 * `tool_use` objects, and removes them from the text. Objects that are
 * balanced but not tool calls are skipped as a whole and left in the text.
 *
 * @returns The recovered calls plus the trimmed leftover text, or null when none were found.
 */
private extractToolCallsFromText(content: string): { toolCalls: ModelToolCall[]; remainingText: string } | null {
  if (!content || !content.includes('{')) {
    return null;
  }

  const found: ExtractedTextToolCall[] = [];
  let pos = 0;
  while (pos < content.length) {
    if (content[pos] !== '{') {
      pos += 1;
      continue;
    }
    const closeAt = this.findJsonObjectEnd(content, pos);
    if (closeAt === -1) {
      // Unbalanced brace: try again from the next character.
      pos += 1;
      continue;
    }
    const call = this.parseTextToolUse(content.slice(pos, closeAt + 1), found.length + 1);
    if (call) {
      found.push({ toolCall: call, start: pos, end: closeAt + 1 });
    }
    pos = closeAt + 1;
  }

  if (found.length === 0) {
    return null;
  }

  // Stitch together everything outside the extracted spans.
  const keptPieces: string[] = [];
  let cursor = 0;
  for (const { start, end } of found) {
    keptPieces.push(content.slice(cursor, start));
    cursor = end;
  }
  keptPieces.push(content.slice(cursor));

  return {
    toolCalls: found.map(({ toolCall }) => toolCall),
    remainingText: keptPieces.join('').trim(),
  };
}
/**
 * Parses one JSON candidate as a `tool_use` object.
 *
 * @param candidate - Text of a single balanced `{…}` span.
 * @param ordinal - 1-based position used to synthesize an id when the object has none.
 * @returns The tool call, or null when the text is not valid JSON, is not an
 *   object with `type === 'tool_use'`, or has no non-blank string `name`.
 *   A missing `input` becomes `{}`.
 */
private parseTextToolUse(candidate: string, ordinal: number): ModelToolCall | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null) {
    return null;
  }

  const { type, name, id, input } = parsed as Record<string, unknown>;
  const isNamedToolUse = type === 'tool_use' && typeof name === 'string' && name.trim().length > 0;
  if (!isNamedToolUse) {
    return null;
  }

  return {
    id: typeof id === 'string' && id.trim().length > 0 ? id : `text_tool_call_${ordinal}`,
    name: name as string,
    args: input ?? {},
  };
}
/**
 * Last-resort recovery of a shell tool call whose JSON is invalid, typically
 * because quotes inside the command were not escaped.
 *
 * Applies only when the text contains a `tool_use` marker and the first `"name"`
 * value normalizes (underscores → dots, lower-cased) to `shell.exec`. The command
 * is taken greedily from `"command": "` up to the last `" } }` sequence.
 *
 * @returns The recovered call (keeping the name as written) with the text left
 *   after removing the first tool_use object, or null when recovery is not possible.
 */
private extractMalformedShellToolCall(content: string): { toolCall: ModelToolCall; remainingText: string } | null {
  if (!content || !/"type"\s*:\s*"tool_use"/.test(content)) {
    return null;
  }

  const name = content.match(/"name"\s*:\s*"([^"]+)"/)?.[1];
  if (!name || name.replace(/_/g, '.').toLowerCase() !== 'shell.exec') {
    return null;
  }

  // Recover malformed shell command payloads where inner quotes are not escaped.
  const rawCommand = content.match(/"command"\s*:\s*"([\s\S]*)"\s*}\s*}/)?.[1];
  if (!rawCommand) {
    return null;
  }
  const command = this.sanitizeRecoveredShellCommand(rawCommand);
  if (!command) {
    return null;
  }

  const scrapedId = content.match(/"id"\s*:\s*"([^"]+)"/)?.[1];
  const id = scrapedId?.trim().length ? scrapedId : 'text_tool_call_recovered_shell';

  return {
    toolCall: { id, name, args: { command } },
    remainingText: this.stripFirstToolUseObject(content),
  };
}
/**
 * Cleans a command string recovered from malformed JSON.
 * Trims it and, when it opens with a quote (`"` or `'`) that is not matched at
 * the end, drops that stray opening quote.
 *
 * @returns The cleaned command, or `''` when the input is blank.
 */
private sanitizeRecoveredShellCommand(raw: string): string {
  const trimmed = raw.trim();
  if (trimmed.length === 0) {
    return '';
  }
  // Common malformed pattern: opening quote duplicated into value.
  const hasStrayOpening = (quote: string): boolean =>
    trimmed.startsWith(quote) && !trimmed.endsWith(quote);
  return hasStrayOpening('"') || hasStrayOpening('\'')
    ? trimmed.slice(1).trimStart()
    : trimmed;
}
/**
 * Removes the first `{…}` object that contains a `"type": "tool_use"` marker.
 * The object starts at the nearest `{` before the marker and ends where brace
 * depth returns to zero (braces inside strings are counted too).
 *
 * @returns The trimmed text without that object, or the trimmed original when no such object can be delimited.
 */
private stripFirstToolUseObject(content: string): string {
  const marker = /"type"\s*:\s*"tool_use"/.exec(content);
  const objectStart = marker && marker.index >= 0
    ? content.lastIndexOf('{', marker.index)
    : -1;
  const objectEnd = objectStart >= 0
    ? this.findObjectEndByBraceDepth(content, objectStart)
    : -1;
  if (objectStart < 0 || objectEnd < 0) {
    return content.trim();
  }
  const before = content.slice(0, objectStart);
  const after = content.slice(objectEnd + 1);
  return (before + after).trim();
}
/**
 * Finds the index of the `}` that brings brace depth back to zero, counting
 * from `start`. Every brace is counted, including those inside string literals.
 *
 * @returns The index of that closing brace, or -1 when depth never returns to zero.
 */
private findObjectEndByBraceDepth(content: string, start: number): number {
  let open = 0;
  let idx = start;
  while (idx < content.length) {
    switch (content[idx]) {
      case '{':
        open += 1;
        break;
      case '}':
        open -= 1;
        if (open === 0) {
          return idx;
        }
        break;
      default:
        break;
    }
    idx += 1;
  }
  return -1;
}
/**
 * Finds the closing brace of the JSON object that opens at `start`, ignoring
 * braces inside double-quoted strings and honoring backslash escapes.
 *
 * @returns The index of the matching `}`, or -1 when the object is not closed.
 */
private findJsonObjectEnd(content: string, start: number): number {
  let depth = 0;
  let insideString = false;
  let escaped = false;

  for (let idx = start; idx < content.length; idx += 1) {
    const ch = content[idx];

    if (insideString) {
      if (escaped) {
        // The escaped character is consumed verbatim.
        escaped = false;
      } else if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        insideString = false;
      }
      continue;
    }

    switch (ch) {
      case '"':
        insideString = true;
        break;
      case '{':
        depth += 1;
        break;
      case '}':
        depth -= 1;
        if (depth === 0) {
          return idx;
        }
        break;
      default:
        break;
    }
  }
  return -1;
}
/**
 * Builds the user-facing warning shown when the model printed a tool call as
 * text and it could not be executed. The original output is appended verbatim.
 */
private buildPseudoToolUseWarning(rawContent: string, pseudo: PseudoToolUse): string {
  const toolLabel = `Tool: ${pseudo.name ?? 'unknown'} (id: ${pseudo.id ?? 'unknown'})`;
  return 'Tool call was emitted as plain text and was not executed.\n'
    + `${toolLabel}\n`
    + 'This usually means the current model/backend did not return structured tool metadata.\n'
    + 'Original assistant output:\n'
    + rawContent;
}
/** Returns the content unchanged unless it is empty or whitespace-only, in which case the fallback reply is used. */
private normalizeAssistantContent(content: string): string {
  const isBlank = content.trim().length === 0;
  return isBlank ? NativeAgent.EMPTY_RESPONSE_FALLBACK : content;
}
}