Reduce in-flight cancel latency by propagating the run's abort signal to provider requests

2026-02-19 12:24:39 -08:00
parent 290303c14e
commit 2c3a00f6dd
12 changed files with 148 additions and 20 deletions
@@ -100,7 +100,10 @@ export class AnthropicClient implements ModelClient {
      params.thinking = { type: 'enabled', budget_tokens: 4096 };
    }

-    const response = await this.client.messages.create(params) as AnthropicMessage;
+    const response = await this.client.messages.create(
+      params,
+      request.signal ? { signal: request.signal } : undefined,
+    ) as AnthropicMessage;

    const textContent = response.content.find((c) => c.type === 'text');
    const content = textContent?.type === 'text' ? textContent.text : '';
@@ -65,7 +65,10 @@ export class BedrockClient implements ModelClient {
    }

    const command = new ConverseCommand(params);
-    const response = await this.client.send(command);
+    const response = await this.client.send(
+      command,
+      request.signal ? { abortSignal: request.signal } : undefined,
+    );

    // Extract text and tool_use content from the response
    const outputContent = response.output?.message?.content ?? [];
@@ -126,7 +129,10 @@ export class BedrockClient implements ModelClient {

    try {
      const command = new ConverseStreamCommand(params);
-      const response = await this.client.send(command);
+      const response = await this.client.send(
+        command,
+        request.signal ? { abortSignal: request.signal } : undefined,
+      );

      let inputTokens = 0;
      let outputTokens = 0;
@@ -163,7 +163,10 @@ export class GitHubModelsClient implements ModelClient {
      (params as OpenAI.ChatCompletionCreateParamsNonStreaming & { reasoning_effort?: 'low' | 'medium' | 'high' }).reasoning_effort = 'medium';
    }

-    const response = await this.client.chat.completions.create(params);
+    const response = await this.client.chat.completions.create(
+      params,
+      request.signal ? { signal: request.signal } : undefined,
+    );

    const choice = response.choices[0];
    const content = choice?.message?.content ?? '';
@@ -237,7 +240,10 @@ export class GitHubModelsClient implements ModelClient {
    }

    try {
-      const stream = await this.client.chat.completions.create(params);
+      const stream = await this.client.chat.completions.create(
+        params,
+        request.signal ? { signal: request.signal } : undefined,
+      );

      let totalInputTokens = 0;
      let totalOutputTokens = 0;
@@ -247,13 +247,16 @@ export class LlamaCppClient implements ModelClient {
      }

      const controller = new AbortController();
+      const signal = request.signal
+        ? AbortSignal.any([request.signal, controller.signal])
+        : controller.signal;
      const timer = setTimeout(() => controller.abort(), this.requestTimeout);
      try {
        response = await fetch(`${this.endpoint}/v1/chat/completions`, {
          method: 'POST',
          headers,
          body: JSON.stringify(body),
-          signal: controller.signal,
+          signal,
        });
      } finally {
        clearTimeout(timer);
@@ -331,6 +334,7 @@ export class LlamaCppClient implements ModelClient {
        method: 'POST',
        headers,
        body: JSON.stringify(body),
+        signal: request.signal,
      });

      if (!response.ok) {
@@ -140,6 +140,7 @@ export class OpenAIClient implements ModelClient {
      method: 'POST',
      headers,
      body: JSON.stringify(body),
+      signal: request.signal,
    });

    if (!res.ok) {
@@ -277,7 +278,10 @@ export class OpenAIClient implements ModelClient {

    let response: OpenAI.ChatCompletion;
    try {
-      response = await this.client.chat.completions.create(params);
+      response = await this.client.chat.completions.create(
+        params,
+        request.signal ? { signal: request.signal } : undefined,
+      );
    } catch (error) {
      const status = typeof (error as { status?: unknown })?.status === 'number'
        ? (error as { status: number }).status
@@ -80,6 +80,8 @@ export interface ChatRequest {
  tools?: ToolDefinition[];
  /** Enable extended thinking/reasoning mode for this request. */
  thinking?: boolean;
+  /** Optional abort signal for cancelling in-flight provider requests. */
+  signal?: AbortSignal;
 }

 export interface ChatResponse {