import { describe, it, expect } from 'vitest';
import { chunkText } from './chunker.js';
import type { Chunk } from './chunker.js';

/**
 * Unit tests for `chunkText(content, namespace, options)`.
 *
 * Each case feeds a small literal string through the chunker and checks the
 * returned chunks' `text`, `namespace`, `startLine` and `endLine` fields.
 * `chunkSize` / `chunkOverlap` are passed explicitly wherever the case depends
 * on them, so the expectations do not rely on the chunker's defaults.
 */
describe('chunkText', () => {
  it('returns empty array for empty content', () => {
    // Both a truly empty string and whitespace-only input must yield no chunks.
    expect(chunkText('', 'test')).toEqual([]);
    expect(chunkText('  \n\n  ', 'test')).toEqual([]);
  });

  it('returns single chunk for small content', () => {
    const content = 'Hello world\nSecond line';
    const chunks = chunkText(content, 'notes', { chunkSize: 1000, chunkOverlap: 0 });

    expect(chunks).toHaveLength(1);
    expect(chunks[0].text).toBe('Hello world\nSecond line');
    expect(chunks[0].namespace).toBe('notes');
    // Line numbers are 1-based and inclusive: two lines of input -> 1..2.
    expect(chunks[0].startLine).toBe(1);
    expect(chunks[0].endLine).toBe(2);
  });

  it('splits on paragraph boundaries (double newline)', () => {
    const content =
      'Paragraph one line one\nParagraph one line two\n\nParagraph two line one\nParagraph two line two';
    const chunks = chunkText(content, 'test', { chunkSize: 30, chunkOverlap: 0 });

    // Should split into two chunks at the paragraph boundary
    expect(chunks.length).toBeGreaterThanOrEqual(2);
    expect(chunks[0].text).toContain('Paragraph one');
    expect(chunks[1].text).toContain('Paragraph two');
  });

  it('merges small paragraphs to reach target chunk size', () => {
    const content = 'A\n\nB\n\nC\n\nD';
    const chunks = chunkText(content, 'test', { chunkSize: 100, chunkOverlap: 0 });

    // All paragraphs are tiny, so they should all fit in one chunk
    expect(chunks).toHaveLength(1);
    expect(chunks[0].text).toContain('A');
    expect(chunks[0].text).toContain('D');
  });

  it('tracks line numbers accurately', () => {
    const content = 'Line one\n\nLine three\n\nLine five';
    const chunks = chunkText(content, 'test', { chunkSize: 10, chunkOverlap: 0 });

    // First chunk should start at line 1
    expect(chunks[0].startLine).toBe(1);
    expect(chunks[0].endLine).toBe(1);

    // Line three is on actual line 3
    const lineThreeChunk = chunks.find((c) => c.text.includes('Line three'));
    expect(lineThreeChunk).toBeDefined();
    expect(lineThreeChunk!.startLine).toBe(3);

    // Line five is on actual line 5
    const lineFiveChunk = chunks.find((c) => c.text.includes('Line five'));
    expect(lineFiveChunk).toBeDefined();
    expect(lineFiveChunk!.startLine).toBe(5);
  });

  it('includes overlap between consecutive chunks', () => {
    // Create content with clear paragraphs that force splitting
    const para1 = 'First paragraph with enough text to matter';
    const para2 = 'Second paragraph with some more text';
    const para3 = 'Third paragraph and final content here';
    const content = `${para1}\n\n${para2}\n\n${para3}`;

    // Use a chunk size that forces splitting, with overlap
    const chunks = chunkText(content, 'test', { chunkSize: 50, chunkOverlap: 40 });

    // Assert the split unconditionally. Guarding the expectation below with
    // `if (chunks.length >= 2)` would let this test pass without checking
    // anything whenever the chunker returned fewer than two chunks.
    expect(chunks.length).toBeGreaterThanOrEqual(2);

    const lastChunk = chunks[chunks.length - 1];
    const prevChunk = chunks[chunks.length - 2];

    // Either chunks share content or at least have proper sequencing.
    // NOTE(review): the input has only 5 lines, so for in-range 1-based line
    // numbers `prevChunk.endLine + 5` is always >= 6 and this bound cannot
    // fail. Tighten it once the chunker's overlap semantics are confirmed
    // (e.g. whether overlap is applied per paragraph or per character).
    expect(lastChunk.startLine).toBeLessThanOrEqual(prevChunk.endLine + 5);
  });

  it('preserves namespace in all chunks', () => {
    const content = 'Para one\n\nPara two\n\nPara three';
    const chunks = chunkText(content, 'sessions/abc123', { chunkSize: 10, chunkOverlap: 0 });

    for (const chunk of chunks) {
      expect(chunk.namespace).toBe('sessions/abc123');
    }
  });

  it('handles content with multiple consecutive blank lines', () => {
    const content = 'First\n\n\n\nSecond';
    const chunks = chunkText(content, 'test', { chunkSize: 1000, chunkOverlap: 0 });

    // Only require that both paragraphs survive; how extra blank lines are
    // grouped into chunks is left to the implementation.
    expect(chunks.length).toBeGreaterThanOrEqual(1);
    expect(chunks.some((c) => c.text.includes('First'))).toBe(true);
    expect(chunks.some((c) => c.text.includes('Second'))).toBe(true);
  });

  it('handles single-line content', () => {
    const chunks = chunkText('single line', 'test', { chunkSize: 100, chunkOverlap: 0 });

    expect(chunks).toHaveLength(1);
    expect(chunks[0].text).toBe('single line');
    expect(chunks[0].startLine).toBe(1);
    expect(chunks[0].endLine).toBe(1);
  });
});