557 lines
14 KiB
Markdown
557 lines
14 KiB
Markdown
# llama.cpp Integration Implementation Plan
|
|
|
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
|
|
|
**Goal:** Add LlamaCppClient to connect Flynn to a llama-server instance for local LLM inference with Qwen 2.5 14B.
|
|
|
|
**Architecture:** New `LlamaCppClient` class implements existing `ModelClient` interface, communicating with llama-server's OpenAI-compatible `/v1/chat/completions` endpoint via HTTP/SSE.
|
|
|
|
**Tech Stack:** TypeScript, native fetch API, Server-Sent Events for streaming
|
|
|
|
---
|
|
|
|
## Task 1: Create LlamaCppClient with Basic Chat
|
|
|
|
**Files:**
|
|
- Create: `src/models/local/llamacpp.ts`
|
|
- Create: `src/models/local/llamacpp.test.ts`
|
|
|
|
### Step 1: Write the failing test
|
|
|
|
Create `src/models/local/llamacpp.test.ts`:
|
|
|
|
```typescript
|
|
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
import { LlamaCppClient } from './llamacpp.js';
|
|
|
|
describe('LlamaCppClient', () => {
  // One shared mock standing in for the global fetch used by the client.
  const mockFetch = vi.fn();

  beforeEach(() => {
    // Clear call history and configured results so state cannot leak between
    // tests; vi.unstubAllGlobals() restores the global but does not reset the mock.
    mockFetch.mockReset();
    vi.stubGlobal('fetch', mockFetch);
  });

  afterEach(() => {
    vi.unstubAllGlobals();
  });

  it('sends messages and returns response', async () => {
    mockFetch.mockResolvedValue({
      ok: true,
      json: () =>
        Promise.resolve({
          choices: [{ message: { content: 'Hello from llama.cpp!' } }],
          usage: { prompt_tokens: 10, completion_tokens: 5 },
        }),
    });

    const client = new LlamaCppClient({
      endpoint: 'http://localhost:8080',
    });

    const response = await client.chat({
      messages: [{ role: 'user', content: 'Hello' }],
    });

    // The request must be a POST to the OpenAI-compatible chat endpoint.
    expect(mockFetch).toHaveBeenCalledTimes(1);
    expect(mockFetch).toHaveBeenCalledWith(
      'http://localhost:8080/v1/chat/completions',
      expect.objectContaining({ method: 'POST' }),
    );

    expect(response.content).toBe('Hello from llama.cpp!');
    expect(response.usage.inputTokens).toBe(10);
    expect(response.usage.outputTokens).toBe(5);
  });
});
|
|
```
|
|
|
|
### Step 2: Run test to verify it fails
|
|
|
|
Run: `npm test -- src/models/local/llamacpp.test.ts`
|
|
Expected: FAIL with "Cannot find module './llamacpp.js'"
|
|
|
|
### Step 3: Write minimal implementation
|
|
|
|
Create `src/models/local/llamacpp.ts`:
|
|
|
|
```typescript
|
|
import type { ChatRequest, ChatResponse, ModelClient } from '../types.js';
|
|
|
|
/** Connection settings for a llama-server instance. */
export interface LlamaCppClientConfig {
  /** Base URL of llama-server, e.g. `http://localhost:8080`. Trailing slashes are ignored. */
  endpoint: string;
  /** Optional token; when set it is sent as `Authorization: Bearer <token>`. */
  authToken?: string;
}

/** A single message in the `/v1/chat/completions` request body. */
interface LlamaCppMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

/**
 * The subset of the `/v1/chat/completions` response that this client reads.
 * Every field is optional because the payload is unvalidated JSON.
 */
interface LlamaCppResponse {
  choices?: Array<{ message?: { content?: string } }>;
  usage?: { prompt_tokens?: number; completion_tokens?: number };
}

/** `ModelClient` implementation that talks to llama-server over HTTP. */
export class LlamaCppClient implements ModelClient {
  private readonly endpoint: string;
  private readonly authToken?: string;

  constructor(config: LlamaCppClientConfig) {
    // Strip every trailing slash so the joined URL never contains `//v1/...`.
    this.endpoint = config.endpoint.replace(/\/+$/, '');
    this.authToken = config.authToken;
  }

  /**
   * Sends a non-streaming chat completion request and returns the reply.
   *
   * @param request - Conversation to send. `system`, when set, becomes the
   *   first message; `maxTokens` defaults to 2048.
   * @returns The first choice's text (empty string when absent) and the
   *   reported token usage (0 for any count the server did not report).
   * @throws Error with message `llama-server error (<status>): <body>` when
   *   the server answers with a non-OK status. Errors raised by `fetch`
   *   itself propagate unchanged.
   */
  async chat(request: ChatRequest): Promise<ChatResponse> {
    const messages: LlamaCppMessage[] = [];

    if (request.system) {
      messages.push({ role: 'system', content: request.system });
    }

    for (const msg of request.messages) {
      messages.push({ role: msg.role, content: msg.content });
    }

    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    if (this.authToken) {
      headers['Authorization'] = `Bearer ${this.authToken}`;
    }

    const response = await fetch(`${this.endpoint}/v1/chat/completions`, {
      method: 'POST',
      headers,
      body: JSON.stringify({
        messages,
        max_tokens: request.maxTokens ?? 2048,
      }),
    });

    if (!response.ok) {
      const text = await response.text();
      throw new Error(`llama-server error (${response.status}): ${text}`);
    }

    const data = (await response.json()) as LlamaCppResponse;

    return {
      // Guard `choices` as well as `usage`: a body without either must yield
      // an empty reply rather than a TypeError.
      content: data.choices?.[0]?.message?.content ?? '',
      stopReason: 'stop',
      usage: {
        inputTokens: data.usage?.prompt_tokens ?? 0,
        outputTokens: data.usage?.completion_tokens ?? 0,
      },
    };
  }
}
|
|
```
|
|
|
|
### Step 4: Run test to verify it passes
|
|
|
|
Run: `npm test -- src/models/local/llamacpp.test.ts`
|
|
Expected: PASS
|
|
|
|
### Step 5: Commit
|
|
|
|
```bash
|
|
git add src/models/local/llamacpp.ts src/models/local/llamacpp.test.ts
|
|
git commit -m "feat: add LlamaCppClient with basic chat support"
|
|
```
|
|
|
|
---
|
|
|
|
## Task 2: Add Streaming Support
|
|
|
|
**Files:**
|
|
- Modify: `src/models/local/llamacpp.ts`
|
|
- Modify: `src/models/local/llamacpp.test.ts`
|
|
|
|
### Step 1: Write the failing test
|
|
|
|
Add to `src/models/local/llamacpp.test.ts`:
|
|
|
|
```typescript
|
|
// Add this import at the top of the file, alongside the existing imports:
import type { ChatStreamEvent } from '../types.js';

// Add this test inside the describe block:
|
|
|
|
it('streams responses via SSE', async () => {
  // Chunk boundaries deliberately fall in the middle of an SSE line and in the
  // middle of the blank-line separator: network reads are not aligned with
  // event boundaries, so the client's line buffering must be exercised.
  const chunks = [
    'data: {"choices":[{"delta":{"content":"Hel',
    'lo"}}]}\n\ndata: {"choices":[{"delta":{"content":" world"}}]}\n',
    '\ndata: {"choices":[{}],"usage":{"prompt_tokens":5,"completion_tokens":2}}\n\n',
    'data: [DONE]\n\n',
  ];

  const encoder = new TextEncoder();
  let chunkIndex = 0;

  // Hands out one chunk per read, then closes the stream.
  const mockStream = new ReadableStream({
    pull(controller) {
      if (chunkIndex < chunks.length) {
        controller.enqueue(encoder.encode(chunks[chunkIndex]));
        chunkIndex++;
      } else {
        controller.close();
      }
    },
  });

  mockFetch.mockResolvedValue({
    ok: true,
    body: mockStream,
  });

  const client = new LlamaCppClient({
    endpoint: 'http://localhost:8080',
  });

  const events: ChatStreamEvent[] = [];
  for await (const event of client.chatStream({
    messages: [{ role: 'user', content: 'Hi' }],
  })) {
    events.push(event);
  }

  expect(events).toHaveLength(3);
  expect(events[0]).toEqual({ type: 'content', content: 'Hello' });
  expect(events[1]).toEqual({ type: 'content', content: ' world' });
  expect(events[2]).toEqual({
    type: 'done',
    usage: { inputTokens: 5, outputTokens: 2 },
  });
});
|
|
```
|
|
|
|
### Step 2: Run test to verify it fails
|
|
|
|
Run: `npm test -- src/models/local/llamacpp.test.ts`
|
|
Expected: FAIL with "chatStream is not a function" or similar
|
|
|
|
### Step 3: Write minimal implementation
|
|
|
|
Add to `src/models/local/llamacpp.ts` (import, interface, and method):
|
|
|
|
At the top, update imports:
|
|
```typescript
|
|
import type { ChatRequest, ChatResponse, ChatStreamEvent, ModelClient } from '../types.js';
|
|
```
|
|
|
|
Add interface for streaming:
|
|
```typescript
|
|
/**
 * The subset of one streamed `/v1/chat/completions` chunk that the client reads.
 */
interface LlamaCppStreamChunk {
  /** Per-choice increments; only the first choice's `delta.content` is used. */
  choices: Array<{ delta?: { content?: string } }>;
  /** Token counts, present only on chunks where the server reports them. */
  usage?: { prompt_tokens: number; completion_tokens: number };
}
|
|
```
|
|
|
|
Add method to `LlamaCppClient` class:
|
|
```typescript
|
|
/**
 * Streams a chat completion from llama-server as it is generated.
 *
 * Sends the request with `stream: true` and parses the Server-Sent Events
 * response line by line. Yields one `content` event per non-empty text delta,
 * then a single `done` event carrying the last reported token usage (zeros if
 * the server never reported any). Request and stream failures are not thrown
 * to the consumer; they are reported as a final `error` event.
 *
 * @param request - Conversation to send. `system`, when set, becomes the
 *   first message; `maxTokens` defaults to 2048.
 * @returns An async iterable of stream events.
 */
async *chatStream(request: ChatRequest): AsyncIterable<ChatStreamEvent> {
  const messages: LlamaCppMessage[] = [];

  if (request.system) {
    messages.push({ role: 'system', content: request.system });
  }

  for (const msg of request.messages) {
    messages.push({ role: msg.role, content: msg.content });
  }

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };

  if (this.authToken) {
    headers['Authorization'] = `Bearer ${this.authToken}`;
  }

  try {
    const response = await fetch(`${this.endpoint}/v1/chat/completions`, {
      method: 'POST',
      headers,
      body: JSON.stringify({
        messages,
        max_tokens: request.maxTokens ?? 2048,
        stream: true,
      }),
    });

    if (!response.ok) {
      const text = await response.text();
      throw new Error(`llama-server error (${response.status}): ${text}`);
    }

    if (!response.body) {
      throw new Error('No response body for streaming');
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    let usage = { inputTokens: 0, outputTokens: 0 };

    try {
      while (true) {
        const { done, value } = await reader.read();

        // On the final pass, flush any bytes the decoder is still holding.
        buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });

        const lines = buffer.split('\n');
        // Mid-stream the last element may be an incomplete line, so keep it
        // for the next read. At end of stream nothing more will arrive, so
        // process it too instead of dropping it.
        buffer = done ? '' : (lines.pop() ?? '');

        for (const line of lines) {
          const trimmed = line.trim();
          // The space after the "data:" field name is optional in SSE.
          if (!trimmed.startsWith('data:')) continue;

          const data = trimmed.slice(5).trimStart();
          if (!data || data === '[DONE]') continue;

          // Only the parse is guarded, so unrelated errors are not silently
          // swallowed as "malformed JSON".
          let parsed: unknown;
          try {
            parsed = JSON.parse(data);
          } catch {
            // Skip malformed JSON
            continue;
          }
          if (typeof parsed !== 'object' || parsed === null) continue;

          const chunk = parsed as LlamaCppStreamChunk;

          const content = chunk.choices?.[0]?.delta?.content;
          if (content) {
            yield { type: 'content', content };
          }

          if (chunk.usage) {
            usage = {
              inputTokens: chunk.usage.prompt_tokens,
              outputTokens: chunk.usage.completion_tokens,
            };
          }
        }

        if (done) break;
      }
    } finally {
      // Runs on normal completion, on error, and when the consumer stops
      // iterating early. Cancelling releases the underlying response stream;
      // a rejection here (stream already errored) carries no new information.
      await reader.cancel().catch(() => undefined);
    }

    yield { type: 'done', usage };
  } catch (error) {
    yield {
      type: 'error',
      error: error instanceof Error ? error : new Error(String(error)),
    };
  }
}
|
|
```
|
|
|
|
### Step 4: Run test to verify it passes
|
|
|
|
Run: `npm test -- src/models/local/llamacpp.test.ts`
|
|
Expected: PASS (2 tests)
|
|
|
|
### Step 5: Commit
|
|
|
|
```bash
|
|
git add src/models/local/llamacpp.ts src/models/local/llamacpp.test.ts
|
|
git commit -m "feat: add streaming support to LlamaCppClient"
|
|
```
|
|
|
|
---
|
|
|
|
## Task 3: Add Connection Error Handling
|
|
|
|
**Files:**
|
|
- Modify: `src/models/local/llamacpp.test.ts`
|
|
- Modify: `src/models/local/llamacpp.ts`
|
|
|
|
### Step 1: Write the failing test
|
|
|
|
Add to `src/models/local/llamacpp.test.ts`:
|
|
|
|
```typescript
|
|
it('throws clear error when server not running', async () => {
|
|
mockFetch.mockRejectedValue(new TypeError('fetch failed'));
|
|
|
|
const client = new LlamaCppClient({
|
|
endpoint: 'http://localhost:8080',
|
|
});
|
|
|
|
await expect(client.chat({
|
|
messages: [{ role: 'user', content: 'Hello' }],
|
|
})).rejects.toThrow('llama-server not running at http://localhost:8080');
|
|
});
|
|
```
|
|
|
|
### Step 2: Run test to verify it fails
|
|
|
|
Run: `npm test -- src/models/local/llamacpp.test.ts`
|
|
Expected: FAIL - error message doesn't match
|
|
|
|
### Step 3: Update implementation
|
|
|
|
Wrap the fetch call in the `chat()` method with error handling:
|
|
|
|
```typescript
|
|
/**
 * Sends a non-streaming chat completion request and returns the reply.
 *
 * @param request - Conversation to send. `system`, when set, becomes the
 *   first message; `maxTokens` defaults to 2048.
 * @returns The first choice's text (empty string when absent) and the
 *   reported token usage (0 for any count the server did not report).
 * @throws Error with message `llama-server not running at <endpoint>` when
 *   `fetch` rejects with a `TypeError` whose message contains "fetch failed".
 *   Any other `fetch` rejection is rethrown unchanged.
 * @throws Error with message `llama-server error (<status>): <body>` when
 *   the server answers with a non-OK status.
 */
async chat(request: ChatRequest): Promise<ChatResponse> {
  const messages: LlamaCppMessage[] = [];

  if (request.system) {
    messages.push({ role: 'system', content: request.system });
  }

  for (const msg of request.messages) {
    messages.push({ role: msg.role, content: msg.content });
  }

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };

  if (this.authToken) {
    headers['Authorization'] = `Bearer ${this.authToken}`;
  }

  let response: Response;
  try {
    response = await fetch(`${this.endpoint}/v1/chat/completions`, {
      method: 'POST',
      headers,
      body: JSON.stringify({
        messages,
        max_tokens: request.maxTokens ?? 2048,
      }),
    });
  } catch (error) {
    // Translate the generic network failure into an actionable message.
    if (error instanceof TypeError && error.message.includes('fetch failed')) {
      throw new Error(`llama-server not running at ${this.endpoint}`);
    }
    throw error;
  }

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`llama-server error (${response.status}): ${text}`);
  }

  const data = (await response.json()) as LlamaCppResponse;

  return {
    // Guard `choices` as well as `usage`: a body without either must yield
    // an empty reply rather than a TypeError.
    content: data.choices?.[0]?.message?.content ?? '',
    stopReason: 'stop',
    usage: {
      inputTokens: data.usage?.prompt_tokens ?? 0,
      outputTokens: data.usage?.completion_tokens ?? 0,
    },
  };
}
|
|
```
|
|
|
|
### Step 4: Run test to verify it passes
|
|
|
|
Run: `npm test -- src/models/local/llamacpp.test.ts`
|
|
Expected: PASS (3 tests)
|
|
|
|
### Step 5: Commit
|
|
|
|
```bash
|
|
git add src/models/local/llamacpp.ts src/models/local/llamacpp.test.ts
|
|
git commit -m "feat: add clear error message when llama-server not running"
|
|
```
|
|
|
|
---
|
|
|
|
## Task 4: Export and Wire Up Client
|
|
|
|
**Files:**
|
|
- Modify: `src/models/local/index.ts`
|
|
- Modify: `src/models/index.ts`
|
|
- Modify: `src/daemon/index.ts`
|
|
|
|
### Step 1: Export from local/index.ts
|
|
|
|
Update `src/models/local/index.ts`:
|
|
|
|
```typescript
|
|
export { OllamaClient, type OllamaClientConfig } from './ollama.js';
|
|
export { LlamaCppClient, type LlamaCppClientConfig } from './llamacpp.js';
|
|
```
|
|
|
|
### Step 2: Export from models/index.ts
|
|
|
|
Update `src/models/index.ts`:
|
|
|
|
```typescript
|
|
export { AnthropicClient, type AnthropicClientConfig } from './anthropic.js';
|
|
export { OpenAIClient, type OpenAIClientConfig } from './openai.js';
|
|
export { OllamaClient, type OllamaClientConfig } from './local/index.js';
|
|
export { LlamaCppClient, type LlamaCppClientConfig } from './local/index.js';
|
|
export { ModelRouter, type ModelRouterConfig, type ModelTier } from './router.js';
|
|
export type { Message, ChatRequest, ChatResponse, ModelClient } from './types.js';
|
|
```
|
|
|
|
### Step 3: Wire up in daemon
|
|
|
|
Update `src/daemon/index.ts` import:
|
|
|
|
```typescript
|
|
import { AnthropicClient, OpenAIClient, OllamaClient, LlamaCppClient, ModelRouter } from '../models/index.js';
|
|
```
|
|
|
|
In the `createModelRouter` function, add a `llamacpp` case to the `if (models.local)` block:
|
|
|
|
```typescript
|
|
if (models.local) {
|
|
if (models.local.provider === 'ollama') {
|
|
localClient = new OllamaClient({
|
|
model: models.local.model,
|
|
host: models.local.endpoint,
|
|
});
|
|
} else if (models.local.provider === 'llamacpp') {
|
|
localClient = new LlamaCppClient({
|
|
endpoint: models.local.endpoint ?? 'http://localhost:8080',
|
|
authToken: models.local.auth_token,
|
|
});
|
|
}
|
|
}
|
|
```
|
|
|
|
### Step 4: Run all tests
|
|
|
|
Run: `npm test`
|
|
Expected: All tests pass
|
|
|
|
### Step 5: Commit
|
|
|
|
```bash
|
|
git add src/models/local/index.ts src/models/index.ts src/daemon/index.ts
|
|
git commit -m "feat: wire up LlamaCppClient to model router"
|
|
```
|
|
|
|
---
|
|
|
|
## Task 5: Run Full Test Suite and Verify
|
|
|
|
**Files:** None (verification only)
|
|
|
|
### Step 1: Run full test suite
|
|
|
|
Run: `npm test`
|
|
Expected: All tests pass (should be 62+ tests now)
|
|
|
|
### Step 2: Type check
|
|
|
|
Run: `npx tsc --noEmit`
|
|
Expected: No errors
|
|
|
|
### Step 3: Build
|
|
|
|
Run: `npm run build`
|
|
Expected: Build succeeds
|
|
|
|
### Step 4: Final commit (if any changes needed)
|
|
|
|
If any fixes were needed, commit them.
|
|
|
|
---
|
|
|
|
## Summary
|
|
|
|
| Task | Description | Tests Added |
|------|-------------|-------------|
| 1 | Basic chat support | 1 |
| 2 | Streaming via SSE | 1 |
| 3 | Connection error handling | 1 |
| 4 | Export and wire up | 0 |
| 5 | Verification | 0 |
|
|
|
|
**Total new tests:** 3
|
|
**Files created:** 2 (`llamacpp.ts`, `llamacpp.test.ts`)
|
|
**Files modified:** 3 (`local/index.ts`, `models/index.ts`, `daemon/index.ts`)
|