feat: add streaming support and num_gpu option to Ollama client

2026-02-05 15:51:28 -08:00
parent a2e1f73493
commit dbf1acd822
3 changed files with 63 additions and 1 deletions
@@ -18,6 +18,7 @@ const modelConfigSchema = z.object({
  api_key: z.string().optional(),
  auth_token: z.string().optional(),
  for: z.array(z.string()).optional(),
  num_gpu: z.number().optional(),
 });
 const modelsSchema = z.object({
@@ -1,20 +1,23 @@
 import { Ollama } from 'ollama';
-import type { ChatRequest, ChatResponse, ModelClient } from '../types.js';
+import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';
 export interface OllamaClientConfig {
  host?: string;
  model: string;
  numGpu?: number;
 }
 export class OllamaClient implements ModelClient {
  private client: Ollama;
  private model: string;
  private numGpu: number;
  constructor(config: OllamaClientConfig) {
    this.client = new Ollama({
      host: config.host ?? 'http://localhost:11434',
    });
    this.model = config.model;
    this.numGpu = config.numGpu ?? -1;
  }
  async chat(request: ChatRequest): Promise<ChatResponse> {
@@ -31,6 +34,9 @@ export class OllamaClient implements ModelClient {
    const response = await this.client.chat({
      model: this.model,
      messages,
      options: {
        num_gpu: this.numGpu,
      },
    });
    return {
@@ -42,4 +48,58 @@ export class OllamaClient implements ModelClient {
      },
    };
  }
  async *chatStream(request: ChatRequest): AsyncIterable<ChatStreamEvent> {
    const messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> = [];
    if (request.system) {
      messages.push({ role: 'system', content: request.system });
    }
    for (const msg of request.messages) {
      messages.push({ role: msg.role, content: msg.content });
    }
    try {
      const stream = await this.client.chat({
        model: this.model,
        messages,
        stream: true,
        options: {
          num_gpu: this.numGpu,
        },
      });
      let inputTokens = 0;
      let outputTokens = 0;
      for await (const chunk of stream) {
        if (chunk.message?.content) {
          yield { type: 'content', content: chunk.message.content };
        }
        if (chunk.prompt_eval_count) {
          inputTokens = chunk.prompt_eval_count;
        }
        if (chunk.eval_count) {
          outputTokens = chunk.eval_count;
        }
        if (chunk.done) {
          yield {
            type: 'done',
            usage: {
              inputTokens,
              outputTokens,
            },
          };
        }
      }
    } catch (error) {
      yield {
        type: 'error',
        error: error instanceof Error ? error : new Error(String(error)),
      };
    }
  }
 }
@@ -1,6 +1,7 @@
 export interface Message {
  role: 'user' | 'assistant';
  content: string;
  timestamp?: number;
 }
 export interface ChatRequest {