/**
 * Text chunker that splits markdown content into overlapping chunks
 * for embedding generation.
 */

/**
 * A single chunk of text extracted from a memory namespace.
 */
export interface Chunk {
  /** The chunk text content. */
  text: string;
  /** The memory namespace this chunk came from. */
  namespace: string;
  /** 1-based start line number in the original content. */
  startLine: number;
  /** 1-based end line number in the original content. */
  endLine: number;
}

export interface ChunkOptions {
  /** Target chunk size in characters. */
  chunkSize: number;
  /** Number of overlapping characters between consecutive chunks. */
  chunkOverlap: number;
}

const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
  chunkSize: 512,
  chunkOverlap: 50,
};

/** Separator placed between paragraphs when they are merged into one chunk. */
const PARAGRAPH_SEPARATOR = '\n\n';

/** Matches both LF and CRLF line endings so no stray `\r` ends up in chunk text. */
const LINE_BREAK = /\r?\n/;

/** A run of consecutive non-blank lines together with its 1-based line range. */
interface ParagraphSpan {
  text: string;
  startLine: number;
  endLine: number;
}

/**
 * Split content into overlapping chunks suitable for embedding.
 *
 * Strategy:
 * 1. Split on paragraph boundaries (blank lines).
 * 2. Merge consecutive paragraphs while the joined text stays within
 *    `chunkSize` (the `'\n\n'` separators are counted).
 * 3. Track 1-based line numbers through the splits.
 * 4. Start each new chunk with trailing paragraphs of the previous one for
 *    context continuity.
 *
 * Paragraphs are never split, so a paragraph longer than `chunkSize` becomes
 * an oversized chunk. Overlap is carried over in whole paragraphs and is not
 * re-checked against `chunkSize`, so a chunk that begins with overlap may
 * exceed the target as well.
 *
 * @param content - Raw text to chunk; LF and CRLF line endings are accepted.
 * @param namespace - Namespace label copied onto every returned chunk.
 * @param options - Optional overrides; missing or `undefined` fields fall
 *   back to the defaults (512 / 50).
 * @returns Chunks in document order; empty when the content has no non-blank line.
 */
export function chunkText(
  content: string,
  namespace: string,
  options?: Partial<ChunkOptions>,
): Chunk[] {
  // Resolve each field on its own: spreading `options` over the defaults
  // would let an explicit `undefined` erase a default and disable flushing.
  const chunkSize = options?.chunkSize ?? DEFAULT_CHUNK_OPTIONS.chunkSize;
  const chunkOverlap = options?.chunkOverlap ?? DEFAULT_CHUNK_OPTIONS.chunkOverlap;

  const paragraphs = splitIntoParagraphs(content);
  if (paragraphs.length === 0) {
    return [];
  }

  const chunks: Chunk[] = [];
  let pending: ParagraphSpan[] = [];
  let pendingLength = 0; // length of `pending` once joined with the separator

  for (const para of paragraphs) {
    const lengthWithPara = pendingLength + PARAGRAPH_SEPARATOR.length + para.text.length;

    // Flush when appending this paragraph would push the chunk past the target.
    if (pending.length > 0 && lengthWithPara > chunkSize) {
      chunks.push(buildChunk(pending, namespace));

      // Seed the next chunk with the tail of the one just emitted.
      pending = getOverlapParagraphs(pending, chunkOverlap);
      pendingLength = joinedLength(pending);
    }

    pendingLength += para.text.length + (pending.length > 0 ? PARAGRAPH_SEPARATOR.length : 0);
    pending.push(para);
  }

  if (pending.length > 0) {
    chunks.push(buildChunk(pending, namespace));
  }

  return chunks;
}

/**
 * Group the lines of `content` into paragraphs: maximal runs of non-blank
 * lines. Whitespace-only lines count as blank and only act as boundaries.
 */
function splitIntoParagraphs(content: string): ParagraphSpan[] {
  const paragraphs: ParagraphSpan[] = [];
  let currentLines: string[] = [];
  let currentStart = 1; // 1-based line number of the first line in `currentLines`

  const flush = (endLine: number): void => {
    if (currentLines.length === 0) {
      return;
    }
    paragraphs.push({
      text: currentLines.join('\n'),
      startLine: currentStart,
      endLine,
    });
    currentLines = [];
  };

  const lines = content.split(LINE_BREAK);
  lines.forEach((line, index) => {
    const lineNum = index + 1; // 1-based
    if (line.trim() === '') {
      // A blank line closes the open paragraph (if any) on the previous line.
      flush(lineNum - 1);
      return;
    }
    if (currentLines.length === 0) {
      currentStart = lineNum;
    }
    currentLines.push(line);
  });

  // A paragraph is still open only when the last line is non-blank.
  flush(lines.length);

  return paragraphs;
}

/** Length of the paragraphs' text once joined with `PARAGRAPH_SEPARATOR`. */
function joinedLength(paragraphs: readonly ParagraphSpan[]): number {
  return paragraphs.reduce(
    (sum, p, i) => sum + p.text.length + (i > 0 ? PARAGRAPH_SEPARATOR.length : 0),
    0,
  );
}

/**
 * Build a Chunk from a non-empty list of paragraph entries.
 *
 * @throws Error when `paragraphs` is empty.
 */
function buildChunk(paragraphs: readonly ParagraphSpan[], namespace: string): Chunk {
  const first = paragraphs[0];
  const last = paragraphs[paragraphs.length - 1];
  if (first === undefined || last === undefined) {
    throw new Error('buildChunk requires at least one paragraph');
  }

  return {
    text: paragraphs.map((p) => p.text).join(PARAGRAPH_SEPARATOR),
    namespace,
    startLine: first.startLine,
    endLine: last.endLine,
  };
}

/**
 * Get trailing paragraphs from the previous chunk for overlap.
 *
 * Walks backwards from the last paragraph, taking whole paragraphs until their
 * combined text length reaches `overlapChars`. Because paragraphs are taken
 * whole, the overlap can be longer than `overlapChars`. Returns an empty list
 * when `overlapChars` is zero or negative.
 */
function getOverlapParagraphs(
  paragraphs: readonly ParagraphSpan[],
  overlapChars: number,
): ParagraphSpan[] {
  if (overlapChars <= 0) {
    return [];
  }

  let start = paragraphs.length;
  let totalChars = 0;

  while (start > 0) {
    start -= 1;
    totalChars += paragraphs[start]?.text.length ?? 0;
    if (totalChars >= overlapChars) {
      break;
    }
  }

  return paragraphs.slice(start);
}