88731a50e3
Heartbeat: - HeartbeatMonitor with 5 checks: gateway, model, channels, memory, disk - Configurable interval, failure threshold, notification channel - Recovery notifications when health restores - 25 new tests Vector Memory Search: - EmbeddingProvider interface with OpenAI, Gemini, Ollama, LlamaCpp backends - SQLite-backed VectorStore with cosine similarity search - Text chunker with paragraph-aware splitting and overlap - HybridSearch merging keyword + vector results with configurable weight - Background indexer with dirty-namespace tracking - Graceful fallback to keyword search when embeddings unavailable - 51 new tests Config: automation.heartbeat + memory.embedding schema sections Total: 950 tests passing, all types clean
164 lines
4.4 KiB
TypeScript
164 lines
4.4 KiB
TypeScript
/**
 * Text chunker that splits markdown content into overlapping chunks
 * for embedding generation.
 */
|
|
|
|
/**
 * One piece of text cut out of a memory namespace, together with the
 * position it occupied in the original content.
 */
export interface Chunk {
  /** Text content of this chunk. */
  text: string;

  /** Memory namespace the chunk was extracted from. */
  namespace: string;

  /** Line in the original content where the chunk begins (1-based). */
  startLine: number;

  /** Line in the original content where the chunk ends (1-based). */
  endLine: number;
}
|
|
|
|
/** Tunable parameters that control how content is cut into chunks. */
export interface ChunkOptions {
  /** Size, in characters, that each chunk should aim for. */
  chunkSize: number;

  /** How many characters consecutive chunks should share. */
  chunkOverlap: number;
}
|
|
|
|
/** Chunking parameters applied when the caller does not supply their own. */
const DEFAULT_CHUNK_OPTIONS: ChunkOptions = { chunkSize: 512, chunkOverlap: 50 };
|
|
|
|
/** A contiguous run of non-blank lines together with its 1-based, inclusive line span. */
type ParagraphSpan = { text: string; startLine: number; endLine: number };

/**
 * Number of characters placed between two paragraphs inside a chunk.
 * `buildChunk` joins paragraphs with a blank line (`'\n\n'`), so the size
 * accounting below must charge two characters per separator.
 */
const PARAGRAPH_SEPARATOR_LENGTH = 2;

/**
 * Pick the caller's option value, falling back to the default when the value
 * is missing, explicitly `undefined`/`null`, or NaN. A plain object spread
 * would let such values overwrite the default and silently disable the size
 * check (every comparison against `undefined`/NaN is false).
 */
function resolveChunkOption(value: number | undefined, fallback: number): number {
  return value == null || Number.isNaN(value) ? fallback : value;
}

/**
 * Split content into paragraphs: runs of consecutive non-blank lines separated
 * by one or more blank (empty or whitespace-only) lines. Line numbers are
 * 1-based and inclusive. Both LF and CRLF line endings are accepted so that
 * paragraph text never carries a trailing carriage return.
 */
function splitIntoParagraphs(content: string): ParagraphSpan[] {
  const lines = content.split(/\r?\n/);
  const paragraphs: ParagraphSpan[] = [];
  let pendingLines: string[] = [];
  let pendingStart = 0;

  // Emit the paragraph collected so far (if any), ending at `endLine`.
  const flush = (endLine: number): void => {
    if (pendingLines.length === 0) {
      return;
    }
    paragraphs.push({
      text: pendingLines.join('\n'),
      startLine: pendingStart,
      endLine,
    });
    pendingLines = [];
  };

  lines.forEach((line, index) => {
    const lineNum = index + 1; // 1-based
    if (line.trim() === '') {
      // A blank line closes the paragraph that ended on the previous line.
      flush(lineNum - 1);
      return;
    }
    if (pendingLines.length === 0) {
      pendingStart = lineNum;
    }
    pendingLines.push(line);
  });

  // A paragraph is still pending only when the final line is non-blank.
  flush(lines.length);

  return paragraphs;
}

/** Length of the text obtained by joining the paragraphs with blank-line separators. */
function joinedParagraphLength(paragraphs: readonly ParagraphSpan[]): number {
  const textLength = paragraphs.reduce((sum, p) => sum + p.text.length, 0);
  const separators = Math.max(0, paragraphs.length - 1);
  return textLength + separators * PARAGRAPH_SEPARATOR_LENGTH;
}

/**
 * Split content into overlapping chunks suitable for embedding.
 *
 * Strategy:
 * 1. Split on paragraph boundaries (blank lines).
 * 2. Merge consecutive paragraphs while the joined text stays within
 *    `chunkSize` characters.
 * 3. Track 1-based line numbers of every paragraph through the splits.
 * 4. Seed each new chunk with trailing paragraphs of the previous chunk
 *    (see `getOverlapParagraphs`) for context continuity.
 *
 * Paragraphs are never split. A single paragraph longer than `chunkSize`
 * therefore produces an oversized chunk, and because overlap is carried over
 * in whole paragraphs a chunk may also exceed `chunkSize` once the overlap is
 * included.
 *
 * @param content - Raw text to chunk; LF and CRLF line endings are accepted.
 * @param namespace - Memory namespace recorded on every returned chunk.
 * @param options - Optional overrides for the default chunk size and overlap.
 *   Missing, `undefined`, or NaN values fall back to the defaults.
 * @returns Chunks in document order; an empty array when the content has no
 *   non-blank lines.
 */
export function chunkText(
  content: string,
  namespace: string,
  options?: Partial<ChunkOptions>,
): Chunk[] {
  const chunkSize = resolveChunkOption(options?.chunkSize, DEFAULT_CHUNK_OPTIONS.chunkSize);
  const chunkOverlap = resolveChunkOption(
    options?.chunkOverlap,
    DEFAULT_CHUNK_OPTIONS.chunkOverlap,
  );

  const paragraphs = splitIntoParagraphs(content);
  if (paragraphs.length === 0) {
    // Empty or whitespace-only content.
    return [];
  }

  const chunks: Chunk[] = [];
  let current: ParagraphSpan[] = [];
  let currentLength = 0; // exact length of `current` once joined

  for (const para of paragraphs) {
    const wouldOverflow =
      current.length > 0 &&
      currentLength + PARAGRAPH_SEPARATOR_LENGTH + para.text.length > chunkSize;

    if (wouldOverflow) {
      chunks.push(buildChunk(current, namespace));
      // Start the next chunk with the tail of the one just emitted.
      current = getOverlapParagraphs(current, chunkOverlap);
      currentLength = joinedParagraphLength(current);
    }

    currentLength +=
      para.text.length + (current.length > 0 ? PARAGRAPH_SEPARATOR_LENGTH : 0);
    current.push(para);
  }

  // The loop always leaves at least the last paragraph unflushed.
  if (current.length > 0) {
    chunks.push(buildChunk(current, namespace));
  }

  return chunks;
}
|
|
|
|
/**
 * Assemble a Chunk from an ordered list of paragraph entries.
 *
 * The paragraph texts are joined with a blank line between them; the chunk's
 * line span runs from the first paragraph's start line to the last
 * paragraph's end line. The list is expected to be non-empty.
 */
function buildChunk(
  paragraphs: { text: string; startLine: number; endLine: number }[],
  namespace: string,
): Chunk {
  const texts: string[] = [];
  for (const entry of paragraphs) {
    texts.push(entry.text);
  }

  const first = paragraphs[0];
  const last = paragraphs[paragraphs.length - 1];

  return {
    text: texts.join('\n\n'),
    namespace,
    startLine: first.startLine,
    endLine: last.endLine,
  };
}
|
|
|
|
/**
 * Select the tail of the previous chunk's paragraphs to reuse as overlap.
 *
 * Walks backwards from the last paragraph, keeping whole paragraphs until the
 * kept text totals at least `overlapChars` characters or the list runs out.
 * A non-positive `overlapChars` yields no overlap. The result is a new array
 * in original order.
 */
function getOverlapParagraphs(
  paragraphs: { text: string; startLine: number; endLine: number }[],
  overlapChars: number,
): { text: string; startLine: number; endLine: number }[] {
  if (overlapChars <= 0) {
    return [];
  }

  // Index of the earliest paragraph that is part of the overlap.
  let firstKept = paragraphs.length;
  let gathered = 0;

  while (firstKept > 0) {
    firstKept -= 1;
    gathered += paragraphs[firstKept].text.length;
    if (gathered >= overlapChars) {
      break;
    }
  }

  return paragraphs.slice(firstKept);
}
|