Files
flynn/src/memory/chunker.ts
T
William Valentin 88731a50e3 feat: add heartbeat monitor and vector memory search (Tier 2)
Heartbeat:
- HeartbeatMonitor with 5 checks: gateway, model, channels, memory, disk
- Configurable interval, failure threshold, notification channel
- Recovery notifications when health restores
- 25 new tests

Vector Memory Search:
- EmbeddingProvider interface with OpenAI, Gemini, Ollama, LlamaCpp backends
- SQLite-backed VectorStore with cosine similarity search
- Text chunker with paragraph-aware splitting and overlap
- HybridSearch merging keyword + vector results with configurable weight
- Background indexer with dirty-namespace tracking
- Graceful fallback to keyword search when embeddings unavailable
- 51 new tests

Config: automation.heartbeat + memory.embedding schema sections
Total: 950 tests passing, all types clean
2026-02-07 14:45:11 -08:00

164 lines
4.4 KiB
TypeScript

/**
* Text chunker that splits markdown content into overlapping chunks
* for embedding generation.
*/
/**
 * A single chunk of text extracted from a memory namespace.
 *
 * A chunk is made of one or more whole paragraphs; `startLine` and `endLine`
 * delimit the span those paragraphs occupy in the original content.
 */
export interface Chunk {
/** The chunk text content (its paragraphs joined by a blank line). */
text: string;
/** The memory namespace this chunk came from. */
namespace: string;
/** 1-based line number of the chunk's first line in the original content. */
startLine: number;
/** 1-based line number of the chunk's last line in the original content (inclusive). */
endLine: number;
}
/**
 * Tuning knobs for chunking. Both values are measured in characters.
 */
export interface ChunkOptions {
/**
 * Target chunk size in characters. This is a soft limit: paragraphs are
 * never split, so a chunk can be longer than this value.
 */
chunkSize: number;
/**
 * Number of overlapping characters between consecutive chunks. Overlap is
 * paragraph-granular: whole trailing paragraphs of the previous chunk are
 * carried over until at least this many characters are accumulated. A value
 * of 0 or less disables overlap.
 */
chunkOverlap: number;
}
/** Defaults used by chunkText for any option the caller does not supply. */
const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
chunkSize: 512,
chunkOverlap: 50,
};
/**
 * Split content into overlapping chunks suitable for embedding.
 *
 * Strategy:
 * 1. Group lines into paragraphs separated by blank (whitespace-only) lines.
 * 2. Merge consecutive paragraphs until adding the next one would push the
 *    chunk past the target size.
 * 3. Track 1-based line numbers of every paragraph through the split.
 * 4. Seed each new chunk with trailing paragraphs of the previous chunk so
 *    neighbouring chunks share context.
 *
 * Paragraphs are never split, so `chunkSize` is a soft target: a paragraph
 * longer than the target becomes an oversized chunk, and carried-over overlap
 * can also push a chunk past the target.
 *
 * @param content - The text to chunk.
 * @param namespace - Copied verbatim onto every returned chunk.
 * @param options - Overrides for the defaults; fields that are omitted or
 *   explicitly `undefined` fall back to the default value.
 * @returns The chunks in document order; an empty array when `content`
 *   contains only whitespace.
 */
export function chunkText(
  content: string,
  namespace: string,
  options?: Partial<ChunkOptions>,
): Chunk[] {
  // Resolve field by field with `??`: spreading `options` over the defaults
  // would let an explicit `undefined` replace a default, and every
  // `> undefined` comparison below would then be false (no chunking at all).
  const opts: ChunkOptions = {
    chunkSize: options?.chunkSize ?? DEFAULT_CHUNK_OPTIONS.chunkSize,
    chunkOverlap: options?.chunkOverlap ?? DEFAULT_CHUNK_OPTIONS.chunkOverlap,
  };

  if (content.trim().length === 0) {
    return [];
  }

  const lines = content.split('\n');

  // Build paragraph groups: each paragraph is a contiguous run of non-blank
  // lines; blank lines only act as boundaries and are not kept.
  const paragraphs: { text: string; startLine: number; endLine: number }[] = [];
  let currentLines: string[] = [];
  let currentStart = 1; // 1-based

  lines.forEach((line, index) => {
    const lineNum = index + 1; // 1-based
    if (line.trim() !== '') {
      if (currentLines.length === 0) {
        currentStart = lineNum;
      }
      currentLines.push(line);
    } else if (currentLines.length > 0) {
      // A blank line closes the open paragraph; it ended on the line before.
      paragraphs.push({
        text: currentLines.join('\n'),
        startLine: currentStart,
        endLine: lineNum - 1,
      });
      currentLines = [];
    }
  });

  // Flush a paragraph that runs to the end of the content.
  if (currentLines.length > 0) {
    paragraphs.push({
      text: currentLines.join('\n'),
      startLine: currentStart,
      endLine: lines.length,
    });
  }

  if (paragraphs.length === 0) {
    return [];
  }

  // Paragraphs are joined with a blank line ('\n\n') when a chunk is built,
  // so each join costs two characters in the final chunk text.
  const separatorLength = '\n\n'.length;

  // Merge paragraphs into chunks, respecting the target size.
  const chunks: Chunk[] = [];
  let chunkParagraphs: typeof paragraphs = [];
  let chunkLength = 0; // length of the text the current paragraphs would produce

  for (const para of paragraphs) {
    const hasContent = chunkParagraphs.length > 0;

    // If adding this paragraph would exceed the target, flush the current chunk.
    if (hasContent && chunkLength + separatorLength + para.text.length > opts.chunkSize) {
      chunks.push(buildChunk(chunkParagraphs, namespace));

      // Start the next chunk from the tail of the one just emitted.
      chunkParagraphs = getOverlapParagraphs(chunkParagraphs, opts.chunkOverlap);
      chunkLength =
        chunkParagraphs.reduce((sum, p) => sum + p.text.length, 0) +
        separatorLength * Math.max(0, chunkParagraphs.length - 1);
    }

    chunkLength += para.text.length + (chunkParagraphs.length > 0 ? separatorLength : 0);
    chunkParagraphs.push(para);
  }

  // Flush remaining paragraphs as the final chunk.
  if (chunkParagraphs.length > 0) {
    chunks.push(buildChunk(chunkParagraphs, namespace));
  }

  return chunks;
}
/**
 * Assemble a Chunk from an ordered list of paragraph entries.
 *
 * The text is the paragraphs joined by a blank line; the line range runs from
 * the first paragraph's start to the last paragraph's end. The list is
 * expected to be non-empty.
 */
function buildChunk(
  paragraphs: { text: string; startLine: number; endLine: number }[],
  namespace: string,
): Chunk {
  const first = paragraphs[0];
  const last = paragraphs[paragraphs.length - 1];
  const text = paragraphs.map(({ text: body }) => body).join('\n\n');

  return { text, namespace, startLine: first.startLine, endLine: last.endLine };
}
/**
 * Select the tail of the previous chunk to reuse as overlap.
 *
 * Walks backwards from the last paragraph, accumulating text lengths, and
 * stops at the first paragraph that brings the total to `overlapChars` or
 * more. Returns that paragraph and everything after it, in original order.
 * If the total never reaches `overlapChars`, every paragraph is returned;
 * a non-positive `overlapChars` yields no overlap.
 */
function getOverlapParagraphs(
  paragraphs: { text: string; startLine: number; endLine: number }[],
  overlapChars: number,
): { text: string; startLine: number; endLine: number }[] {
  if (overlapChars <= 0) {
    return [];
  }

  let firstKept = paragraphs.length;
  let gathered = 0;

  while (firstKept > 0) {
    firstKept -= 1;
    gathered += paragraphs[firstKept].text.length;
    if (gathered >= overlapChars) {
      break;
    }
  }

  return paragraphs.slice(firstKept);
}