diff --git a/docs/plans/2026-02-05-llamacpp-implementation.md b/docs/plans/2026-02-05-llamacpp-implementation.md new file mode 100644 index 0000000..e595b3d --- /dev/null +++ b/docs/plans/2026-02-05-llamacpp-implementation.md @@ -0,0 +1,556 @@ +# llama.cpp Integration Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add LlamaCppClient to connect Flynn to a llama-server instance for local LLM inference with Qwen 2.5 14B. + +**Architecture:** New `LlamaCppClient` class implements existing `ModelClient` interface, communicating with llama-server's OpenAI-compatible `/v1/chat/completions` endpoint via HTTP/SSE. + +**Tech Stack:** TypeScript, native fetch API, Server-Sent Events for streaming + +--- + +## Task 1: Create LlamaCppClient with Basic Chat + +**Files:** +- Create: `src/models/local/llamacpp.ts` +- Create: `src/models/local/llamacpp.test.ts` + +### Step 1: Write the failing test + +Create `src/models/local/llamacpp.test.ts`: + +```typescript +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { LlamaCppClient } from './llamacpp.js'; + +describe('LlamaCppClient', () => { + const mockFetch = vi.fn(); + + beforeEach(() => { + vi.stubGlobal('fetch', mockFetch); + }); + + afterEach(() => { + vi.unstubAllGlobals(); + }); + + it('sends messages and returns response', async () => { + mockFetch.mockResolvedValue({ + ok: true, + json: () => Promise.resolve({ + choices: [{ message: { content: 'Hello from llama.cpp!' 
} }], + usage: { prompt_tokens: 10, completion_tokens: 5 }, + }), + }); + + const client = new LlamaCppClient({ + endpoint: 'http://localhost:8080', + }); + + const response = await client.chat({ + messages: [{ role: 'user', content: 'Hello' }], + }); + + expect(response.content).toBe('Hello from llama.cpp!'); + expect(response.usage.inputTokens).toBe(10); + expect(response.usage.outputTokens).toBe(5); + }); +}); +``` + +### Step 2: Run test to verify it fails + +Run: `npm test -- src/models/local/llamacpp.test.ts` +Expected: FAIL with "Cannot find module './llamacpp.js'" + +### Step 3: Write minimal implementation + +Create `src/models/local/llamacpp.ts`: + +```typescript +import type { ChatRequest, ChatResponse, ModelClient } from '../types.js'; + +export interface LlamaCppClientConfig { + endpoint: string; + authToken?: string; +} + +interface LlamaCppMessage { + role: 'system' | 'user' | 'assistant'; + content: string; +} + +interface LlamaCppResponse { + choices: Array<{ message: { content: string } }>; + usage: { prompt_tokens: number; completion_tokens: number }; +} + +export class LlamaCppClient implements ModelClient { + private endpoint: string; + private authToken?: string; + + constructor(config: LlamaCppClientConfig) { + this.endpoint = config.endpoint.replace(/\/$/, ''); + this.authToken = config.authToken; + } + + async chat(request: ChatRequest): Promise<ChatResponse> { + const messages: LlamaCppMessage[] = []; + + if (request.system) { + messages.push({ role: 'system', content: request.system }); + } + + for (const msg of request.messages) { + messages.push({ role: msg.role, content: msg.content }); + } + + const headers: Record<string, string> = { + 'Content-Type': 'application/json', + }; + + if (this.authToken) { + headers['Authorization'] = `Bearer ${this.authToken}`; + } + + const response = await fetch(`${this.endpoint}/v1/chat/completions`, { + method: 'POST', + headers, + body: JSON.stringify({ + messages, + max_tokens: request.maxTokens ?? 
2048, + }), + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`llama-server error (${response.status}): ${text}`); + } + + const data = (await response.json()) as LlamaCppResponse; + + return { + content: data.choices[0]?.message?.content ?? '', + stopReason: 'stop', + usage: { + inputTokens: data.usage?.prompt_tokens ?? 0, + outputTokens: data.usage?.completion_tokens ?? 0, + }, + }; + } +} +``` + +### Step 4: Run test to verify it passes + +Run: `npm test -- src/models/local/llamacpp.test.ts` +Expected: PASS + +### Step 5: Commit + +```bash +git add src/models/local/llamacpp.ts src/models/local/llamacpp.test.ts +git commit -m "feat: add LlamaCppClient with basic chat support" +``` + +--- + +## Task 2: Add Streaming Support + +**Files:** +- Modify: `src/models/local/llamacpp.ts` +- Modify: `src/models/local/llamacpp.test.ts` + +### Step 1: Write the failing test + +Add to `src/models/local/llamacpp.test.ts`: + +```typescript +import type { ChatStreamEvent } from '../types.js'; + +// Add this test inside the describe block: + +it('streams responses via SSE', async () => { + const chunks = [ + 'data: {"choices":[{"delta":{"content":"Hello"}}]}\n\n', + 'data: {"choices":[{"delta":{"content":" world"}}]}\n\n', + 'data: {"choices":[{}],"usage":{"prompt_tokens":5,"completion_tokens":2}}\n\n', + 'data: [DONE]\n\n', + ]; + + const encoder = new TextEncoder(); + let chunkIndex = 0; + + const mockStream = new ReadableStream({ + pull(controller) { + if (chunkIndex < chunks.length) { + controller.enqueue(encoder.encode(chunks[chunkIndex])); + chunkIndex++; + } else { + controller.close(); + } + }, + }); + + mockFetch.mockResolvedValue({ + ok: true, + body: mockStream, + }); + + const client = new LlamaCppClient({ + endpoint: 'http://localhost:8080', + }); + + const events: ChatStreamEvent[] = []; + for await (const event of client.chatStream({ + messages: [{ role: 'user', content: 'Hi' }], + })) { + events.push(event); + } + + 
expect(events).toHaveLength(3); + expect(events[0]).toEqual({ type: 'content', content: 'Hello' }); + expect(events[1]).toEqual({ type: 'content', content: ' world' }); + expect(events[2]).toEqual({ + type: 'done', + usage: { inputTokens: 5, outputTokens: 2 }, + }); +}); +``` + +### Step 2: Run test to verify it fails + +Run: `npm test -- src/models/local/llamacpp.test.ts` +Expected: FAIL with "chatStream is not a function" or similar + +### Step 3: Write minimal implementation + +Add to `src/models/local/llamacpp.ts` (import and method): + +At the top, update imports: +```typescript +import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js'; +``` + +Add interface for streaming: +```typescript +interface LlamaCppStreamChunk { + choices: Array<{ delta?: { content?: string } }>; + usage?: { prompt_tokens: number; completion_tokens: number }; +} +``` + +Add method to `LlamaCppClient` class: +```typescript +async *chatStream(request: ChatRequest): AsyncIterable<ChatStreamEvent> { + const messages: LlamaCppMessage[] = []; + + if (request.system) { + messages.push({ role: 'system', content: request.system }); + } + + for (const msg of request.messages) { + messages.push({ role: msg.role, content: msg.content }); + } + + const headers: Record<string, string> = { + 'Content-Type': 'application/json', + }; + + if (this.authToken) { + headers['Authorization'] = `Bearer ${this.authToken}`; + } + + try { + const response = await fetch(`${this.endpoint}/v1/chat/completions`, { + method: 'POST', + headers, + body: JSON.stringify({ + messages, + max_tokens: request.maxTokens ?? 
2048, + stream: true, + }), + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`llama-server error (${response.status}): ${text}`); + } + + if (!response.body) { + throw new Error('No response body for streaming'); + } + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + let usage = { inputTokens: 0, outputTokens: 0 }; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() ?? ''; + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed || !trimmed.startsWith('data: ')) continue; + + const data = trimmed.slice(6); + if (data === '[DONE]') continue; + + try { + const chunk = JSON.parse(data) as LlamaCppStreamChunk; + + if (chunk.choices[0]?.delta?.content) { + yield { type: 'content', content: chunk.choices[0].delta.content }; + } + + if (chunk.usage) { + usage = { + inputTokens: chunk.usage.prompt_tokens, + outputTokens: chunk.usage.completion_tokens, + }; + } + } catch { + // Skip malformed JSON + } + } + } + + yield { type: 'done', usage }; + } catch (error) { + yield { + type: 'error', + error: error instanceof Error ? 
error : new Error(String(error)), + }; + } +} +``` + +### Step 4: Run test to verify it passes + +Run: `npm test -- src/models/local/llamacpp.test.ts` +Expected: PASS (2 tests) + +### Step 5: Commit + +```bash +git add src/models/local/llamacpp.ts src/models/local/llamacpp.test.ts +git commit -m "feat: add streaming support to LlamaCppClient" +``` + +--- + +## Task 3: Add Connection Error Handling + +**Files:** +- Modify: `src/models/local/llamacpp.test.ts` +- Modify: `src/models/local/llamacpp.ts` + +### Step 1: Write the failing test + +Add to `src/models/local/llamacpp.test.ts`: + +```typescript +it('throws clear error when server not running', async () => { + mockFetch.mockRejectedValue(new TypeError('fetch failed')); + + const client = new LlamaCppClient({ + endpoint: 'http://localhost:8080', + }); + + await expect(client.chat({ + messages: [{ role: 'user', content: 'Hello' }], + })).rejects.toThrow('llama-server not running at http://localhost:8080'); +}); +``` + +### Step 2: Run test to verify it fails + +Run: `npm test -- src/models/local/llamacpp.test.ts` +Expected: FAIL - error message doesn't match + +### Step 3: Update implementation + +Wrap the fetch call in `chat()` method with error handling: + +```typescript +async chat(request: ChatRequest): Promise<ChatResponse> { + const messages: LlamaCppMessage[] = []; + + if (request.system) { + messages.push({ role: 'system', content: request.system }); + } + + for (const msg of request.messages) { + messages.push({ role: msg.role, content: msg.content }); + } + + const headers: Record<string, string> = { + 'Content-Type': 'application/json', + }; + + if (this.authToken) { + headers['Authorization'] = `Bearer ${this.authToken}`; + } + + let response: Response; + try { + response = await fetch(`${this.endpoint}/v1/chat/completions`, { + method: 'POST', + headers, + body: JSON.stringify({ + messages, + max_tokens: request.maxTokens ?? 
2048, + }), + }); + } catch (error) { + if (error instanceof TypeError && error.message.includes('fetch failed')) { + throw new Error(`llama-server not running at ${this.endpoint}`); + } + throw error; + } + + if (!response.ok) { + const text = await response.text(); + throw new Error(`llama-server error (${response.status}): ${text}`); + } + + const data = (await response.json()) as LlamaCppResponse; + + return { + content: data.choices[0]?.message?.content ?? '', + stopReason: 'stop', + usage: { + inputTokens: data.usage?.prompt_tokens ?? 0, + outputTokens: data.usage?.completion_tokens ?? 0, + }, + }; +} +``` + +### Step 4: Run test to verify it passes + +Run: `npm test -- src/models/local/llamacpp.test.ts` +Expected: PASS (3 tests) + +### Step 5: Commit + +```bash +git add src/models/local/llamacpp.ts src/models/local/llamacpp.test.ts +git commit -m "feat: add clear error message when llama-server not running" +``` + +--- + +## Task 4: Export and Wire Up Client + +**Files:** +- Modify: `src/models/local/index.ts` +- Modify: `src/models/index.ts` +- Modify: `src/daemon/index.ts` + +### Step 1: Export from local/index.ts + +Update `src/models/local/index.ts`: + +```typescript +export { OllamaClient, type OllamaClientConfig } from './ollama.js'; +export { LlamaCppClient, type LlamaCppClientConfig } from './llamacpp.js'; +``` + +### Step 2: Export from models/index.ts + +Update `src/models/index.ts`: + +```typescript +export { AnthropicClient, type AnthropicClientConfig } from './anthropic.js'; +export { OpenAIClient, type OpenAIClientConfig } from './openai.js'; +export { OllamaClient, type OllamaClientConfig } from './local/index.js'; +export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js'; +export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js'; +export type { Message, ChatRequest, ChatResponse, ModelClient } from './types.js'; +``` + +### Step 3: Wire up in daemon + +Update `src/daemon/index.ts` import: + 
+```typescript +import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, ModelRouter } from '../models/index.js'; +``` + +Update `createModelRouter` function, add llamacpp case in the `if (models.local)` block: + +```typescript +if (models.local) { + if (models.local.provider === 'ollama') { + localClient = new OllamaClient({ + model: models.local.model, + host: models.local.endpoint, + }); + } else if (models.local.provider === 'llamacpp') { + localClient = new LlamaCppClient({ + endpoint: models.local.endpoint ?? 'http://localhost:8080', + authToken: models.local.auth_token, + }); + } +} +``` + +### Step 4: Run all tests + +Run: `npm test` +Expected: All tests pass + +### Step 5: Commit + +```bash +git add src/models/local/index.ts src/models/index.ts src/daemon/index.ts +git commit -m "feat: wire up LlamaCppClient to model router" +``` + +--- + +## Task 5: Run Full Test Suite and Verify + +**Files:** None (verification only) + +### Step 1: Run full test suite + +Run: `npm test` +Expected: All tests pass (should be 62+ tests now) + +### Step 2: Type check + +Run: `npx tsc --noEmit` +Expected: No errors + +### Step 3: Build + +Run: `npm run build` +Expected: Build succeeds + +### Step 4: Final commit (if any changes needed) + +If any fixes were needed, commit them. + +--- + +## Summary + +| Task | Description | Tests Added | +|------|-------------|-------------| +| 1 | Basic chat support | 1 | +| 2 | Streaming via SSE | 1 | +| 3 | Connection error handling | 1 | +| 4 | Export and wire up | 0 | +| 5 | Verification | 0 | + +**Total new tests:** 3 +**Files created:** 2 (`llamacpp.ts`, `llamacpp.test.ts`) +**Files modified:** 3 (`local/index.ts`, `models/index.ts`, `daemon/index.ts`)