import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';

/** Connection settings for a llama.cpp `llama-server` instance. */
export interface LlamaCppClientConfig {
  /** Base URL of the server; one trailing slash, if present, is stripped. */
  endpoint: string;
  /** Optional token sent as `Authorization: Bearer <token>`. */
  authToken?: string;
}

/** A single chat message in the request payload. */
interface LlamaCppMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

/** The fields of a non-streaming completion response that this client reads. */
interface LlamaCppResponse {
  choices: Array<{ message: { content: string } }>;
  usage: { prompt_tokens: number; completion_tokens: number };
}

/** The fields of one streamed chunk that this client reads. */
interface LlamaCppStreamChunk {
  // Optional: a chunk may carry only `usage`, and must not be discarded for lacking choices.
  choices?: Array<{ delta?: { content?: string } }>;
  usage?: { prompt_tokens: number; completion_tokens: number };
}

/** `max_tokens` sent when the request does not specify `maxTokens`. */
const DEFAULT_MAX_TOKENS = 2048;

/** Field prefix of the server-sent-event lines that carry JSON payloads. */
const SSE_DATA_PREFIX = 'data:';

/**
 * Chat client that POSTs to `<endpoint>/v1/chat/completions`, either as a single
 * JSON response ({@link LlamaCppClient.chat}) or as a server-sent-event stream
 * ({@link LlamaCppClient.chatStream}).
 */
export class LlamaCppClient implements ModelClient {
  private readonly endpoint: string;
  private readonly authToken?: string;

  constructor(config: LlamaCppClientConfig) {
    this.endpoint = config.endpoint.replace(/\/$/, '');
    this.authToken = config.authToken;
  }

  /**
   * Sends a non-streaming chat completion request.
   *
   * @param request - Conversation to send; `system` (if set) is prepended as a system message.
   * @returns The first choice's content (empty string if absent), a fixed `'stop'`
   *   stop reason, and token usage (0 when the server omits it).
   * @throws Error when the server is unreachable or answers with a non-2xx status.
   */
  async chat(request: ChatRequest): Promise<ChatResponse> {
    const response = await this.postCompletion(request, false);
    const data = (await response.json()) as LlamaCppResponse;

    return {
      content: data.choices?.[0]?.message?.content ?? '',
      stopReason: 'stop',
      usage: {
        inputTokens: data.usage?.prompt_tokens ?? 0,
        outputTokens: data.usage?.completion_tokens ?? 0,
      },
    };
  }

  /**
   * Sends a streaming chat completion request and yields events as they arrive.
   *
   * Yields `{ type: 'content' }` for every non-empty delta, then a final
   * `{ type: 'done', usage }`. Failures are never thrown to the consumer; they
   * are reported as a single `{ type: 'error', error }` event instead.
   *
   * @param request - Conversation to send; `system` (if set) is prepended as a system message.
   */
  async *chatStream(request: ChatRequest): AsyncIterable<ChatStreamEvent> {
    try {
      const response = await this.postCompletion(request, true);
      if (!response.body) {
        throw new Error('No response body for streaming');
      }

      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      let buffer = '';
      let usage = { inputTokens: 0, outputTokens: 0 };
      let finished = false;

      try {
        while (!finished) {
          const { done, value } = await reader.read();
          // On the final pass, flush any bytes the decoder is still holding back.
          buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });

          const lines = buffer.split('\n');
          // Mid-stream the last fragment may be an incomplete line, so keep it for the
          // next read. At end of stream it is complete and must be processed too.
          buffer = done ? '' : (lines.pop() ?? '');

          for (const line of lines) {
            const chunk = this.parseStreamLine(line);
            if (!chunk) continue;

            const content = chunk.choices?.[0]?.delta?.content;
            if (content) {
              yield { type: 'content', content };
            }
            if (chunk.usage) {
              usage = {
                inputTokens: chunk.usage.prompt_tokens ?? 0,
                outputTokens: chunk.usage.completion_tokens ?? 0,
              };
            }
          }

          finished = done;
        }
      } finally {
        if (!finished) {
          // The consumer stopped iterating early or reading failed: cancel so the
          // underlying connection is not left open.
          try {
            await reader.cancel();
          } catch {
            // The stream is already errored or closed; there is nothing left to cancel.
          }
        }
        reader.releaseLock();
      }

      yield { type: 'done', usage };
    } catch (error: unknown) {
      yield {
        type: 'error',
        error: error instanceof Error ? error : new Error(String(error)),
      };
    }
  }

  /** Builds the message list: the optional system prompt first, then the conversation. */
  private buildMessages(request: ChatRequest): LlamaCppMessage[] {
    const messages: LlamaCppMessage[] = [];
    if (request.system) {
      messages.push({ role: 'system', content: request.system });
    }
    for (const msg of request.messages) {
      messages.push({ role: msg.role, content: msg.content });
    }
    return messages;
  }

  /** Builds the request headers, adding the bearer token when one is configured. */
  private buildHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };
    if (this.authToken) {
      headers['Authorization'] = `Bearer ${this.authToken}`;
    }
    return headers;
  }

  /**
   * POSTs the completion request and returns the response once it is known to be OK.
   *
   * @throws Error with a "not running" message when `fetch` reports `fetch failed`,
   *   or with the status and response text when the status is not 2xx.
   */
  private async postCompletion(request: ChatRequest, stream: boolean): Promise<Response> {
    const payload: Record<string, unknown> = {
      messages: this.buildMessages(request),
      max_tokens: request.maxTokens ?? DEFAULT_MAX_TOKENS,
    };
    if (stream) {
      payload.stream = true;
    }

    let response: Response;
    try {
      response = await fetch(`${this.endpoint}/v1/chat/completions`, {
        method: 'POST',
        headers: this.buildHeaders(),
        body: JSON.stringify(payload),
      });
    } catch (error: unknown) {
      if (error instanceof TypeError && error.message.includes('fetch failed')) {
        throw new Error(`llama-server not running at ${this.endpoint}`);
      }
      throw error;
    }

    if (!response.ok) {
      const text = await response.text();
      throw new Error(`llama-server error (${response.status}): ${text}`);
    }
    return response;
  }

  /**
   * Parses one line of the event stream.
   *
   * @returns The decoded chunk, or `undefined` for blank lines, non-`data` fields,
   *   the `[DONE]` sentinel, and payloads that are not a JSON object.
   */
  private parseStreamLine(line: string): LlamaCppStreamChunk | undefined {
    const trimmed = line.trim();
    if (!trimmed.startsWith(SSE_DATA_PREFIX)) return undefined;

    // The space after "data:" is optional in the event-stream format.
    const data = trimmed.slice(SSE_DATA_PREFIX.length).trim();
    if (!data || data === '[DONE]') return undefined;

    try {
      const parsed: unknown = JSON.parse(data);
      return typeof parsed === 'object' && parsed !== null
        ? (parsed as LlamaCppStreamChunk)
        : undefined;
    } catch {
      // Best effort: one malformed event should not abort the whole stream.
      return undefined;
    }
  }
}