// flynn/src/models/local/llamacpp.ts

import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';

/** Connection settings accepted by the `LlamaCppClient` constructor. */
export interface LlamaCppClientConfig {
  /**
   * Base URL of the server. The client strips one trailing slash and appends
   * `/v1/chat/completions` when issuing requests.
   */
  endpoint: string;
  /** When set, sent on every request as `Authorization: Bearer <authToken>`. */
  authToken?: string;
}

/** One entry of the `messages` array sent in the chat-completions request body. */
interface LlamaCppMessage {
  /** Speaker of the message; a `system` entry is prepended when the request carries a system prompt. */
  role: 'system' | 'user' | 'assistant';
  /** Plain-text message body. */
  content: string;
}

/**
 * Shape of the JSON body read from a non-streaming `/v1/chat/completions` call.
 * Only the fields the client actually consumes are modelled.
 */
interface LlamaCppResponse {
  /** Completion candidates; the client reads the text of the first one only. */
  choices: Array<{ message: { content: string } }>;
  /**
   * Token accounting. Declared optional because the client already treats it
   * as possibly absent and falls back to zero counts.
   */
  usage?: { prompt_tokens: number; completion_tokens: number };
}

/** Shape of one JSON payload parsed from a `data: ` line of the streaming response. */
interface LlamaCppStreamChunk {
  /** Incremental output; a non-empty `delta.content` on the first choice becomes a `content` event. */
  choices: Array<{ delta?: { content?: string } }>;
  /** Token counts; when a chunk carries them they replace the totals reported in the `done` event. */
  usage?: { prompt_tokens: number; completion_tokens: number };
}

/**
 * `ModelClient` implementation that talks to a llama.cpp `llama-server`
 * through its `/v1/chat/completions` endpoint, in one-shot and streaming mode.
 */
export class LlamaCppClient implements ModelClient {
  private readonly endpoint: string;
  private readonly authToken?: string;

  /**
   * @param config Server location and optional bearer token.
   */
  constructor(config: LlamaCppClientConfig) {
    // Drop a single trailing slash so path concatenation never yields `//v1/...`.
    this.endpoint = config.endpoint.replace(/\/$/, '');
    this.authToken = config.authToken;
  }

  /**
   * Sends the conversation and resolves with the complete reply.
   *
   * @param request Conversation, optional system prompt and token limit.
   * @returns The first choice's text (empty string if absent) plus token usage
   *   (zero when the server reports none). `stopReason` is always `'stop'`.
   * @throws Error `llama-server not running at <endpoint>` when the connection
   *   fails, or `llama-server error (<status>): <body>` on a non-2xx response.
   */
  async chat(request: ChatRequest): Promise<ChatResponse> {
    const response = await this.postChatCompletion(request, false);
    const data = (await response.json()) as LlamaCppResponse;

    return {
      // `choices` is guarded too: a body without it yields an empty reply
      // instead of an opaque TypeError.
      content: data.choices?.[0]?.message?.content ?? '',
      stopReason: 'stop',
      usage: {
        inputTokens: data.usage?.prompt_tokens ?? 0,
        outputTokens: data.usage?.completion_tokens ?? 0,
      },
    };
  }

  /**
   * Sends the conversation with `stream: true` and yields events as the
   * server-sent `data:` lines arrive.
   *
   * Emits a `content` event for every non-empty delta, then a single `done`
   * event carrying the last usage counts seen (zeros if none were sent).
   * Failures are never thrown to the consumer; they are reported as one
   * terminal `error` event.
   *
   * @param request Conversation, optional system prompt and token limit.
   */
  async *chatStream(request: ChatRequest): AsyncIterable<ChatStreamEvent> {
    try {
      const response = await this.postChatCompletion(request, true);

      if (!response.body) {
        throw new Error('No response body for streaming');
      }

      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      let buffer = '';
      let usage = { inputTokens: 0, outputTokens: 0 };

      try {
        for (;;) {
          const { done, value } = await reader.read();

          // On the final iteration flush the decoder so a trailing multi-byte
          // sequence is not silently dropped.
          buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });

          const lines = buffer.split('\n');
          // While streaming, the last element may be an incomplete line and is
          // kept for the next read. At end of stream everything left is final,
          // so a last event without a trailing newline is still processed.
          buffer = done ? '' : (lines.pop() ?? '');

          for (const line of lines) {
            const chunk = this.parseStreamLine(line);
            if (!chunk) continue;

            const content = chunk.choices?.[0]?.delta?.content;
            if (content) {
              yield { type: 'content', content };
            }

            if (chunk.usage) {
              usage = {
                inputTokens: chunk.usage.prompt_tokens,
                outputTokens: chunk.usage.completion_tokens,
              };
            }
          }

          if (done) break;
        }
      } finally {
        // Runs on normal completion, on errors and when the consumer stops
        // iterating early; cancelling releases the underlying connection.
        await reader.cancel().catch(() => undefined);
      }

      yield { type: 'done', usage };
    } catch (error) {
      yield {
        type: 'error',
        error: error instanceof Error ? error : new Error(String(error)),
      };
    }
  }

  /** Builds the outgoing message list, placing the system prompt (if any) first. */
  private buildMessages(request: ChatRequest): LlamaCppMessage[] {
    const messages: LlamaCppMessage[] = [];

    if (request.system) {
      messages.push({ role: 'system', content: request.system });
    }

    for (const msg of request.messages) {
      messages.push({ role: msg.role, content: msg.content });
    }

    return messages;
  }

  /** Builds the request headers, adding the bearer token when one is configured. */
  private buildHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    if (this.authToken) {
      headers['Authorization'] = `Bearer ${this.authToken}`;
    }

    return headers;
  }

  /**
   * POSTs the request to `/v1/chat/completions` and returns the successful
   * response; shared by `chat` and `chatStream` so both fail the same way.
   *
   * @throws Error when the connection fails or the status is not 2xx.
   */
  private async postChatCompletion(request: ChatRequest, stream: boolean): Promise<Response> {
    const payload: Record<string, unknown> = {
      messages: this.buildMessages(request),
      max_tokens: request.maxTokens ?? 2048,
    };
    // The key is only added for streaming so the one-shot body stays unchanged.
    if (stream) {
      payload.stream = true;
    }

    let response: Response;
    try {
      response = await fetch(`${this.endpoint}/v1/chat/completions`, {
        method: 'POST',
        headers: this.buildHeaders(),
        body: JSON.stringify(payload),
      });
    } catch (error) {
      // A TypeError with this message is treated as "could not connect" and
      // turned into an actionable message; anything else is rethrown as-is.
      if (error instanceof TypeError && error.message.includes('fetch failed')) {
        throw new Error(`llama-server not running at ${this.endpoint}`);
      }
      throw error;
    }

    if (!response.ok) {
      const text = await response.text();
      throw new Error(`llama-server error (${response.status}): ${text}`);
    }

    return response;
  }

  /**
   * Extracts the JSON payload of one server-sent-event line.
   *
   * @returns The parsed chunk, or `undefined` for blank lines, non-`data`
   *   fields, the `[DONE]` sentinel and malformed JSON (deliberately skipped).
   */
  private parseStreamLine(line: string): LlamaCppStreamChunk | undefined {
    const trimmed = line.trim();
    // Accept both `data: {...}` and `data:{...}`.
    if (!trimmed.startsWith('data:')) return undefined;

    const payload = trimmed.slice('data:'.length).trim();
    if (!payload || payload === '[DONE]') return undefined;

    try {
      const parsed: unknown = JSON.parse(payload);
      return typeof parsed === 'object' && parsed !== null
        ? (parsed as LlamaCppStreamChunk)
        : undefined;
    } catch {
      // Best-effort stream: a single malformed line must not abort the reply.
      return undefined;
    }
  }
}