// flynn/src/memory/chunker.ts

/**
 * Text chunker that splits markdown content into overlapping chunks
 * for embedding generation.
 */

/**
 * A single chunk of text extracted from a memory namespace.
 *
 * A chunk is made of one or more whole paragraphs; its line range spans
 * from the first paragraph's first line to the last paragraph's last line.
 */
export interface Chunk {
  /** The chunk text content: its paragraphs joined by a blank line. */
  text: string;
  /** The memory namespace this chunk came from (passed through unchanged). */
  namespace: string;
  /** 1-based start line number in the original content (first paragraph's first line). */
  startLine: number;
  /** 1-based end line number in the original content (last paragraph's last line). */
  endLine: number;
}

/**
 * Tuning knobs for `chunkText`. Both values are measured in characters.
 */
export interface ChunkOptions {
  /**
   * Target chunk size in characters. Paragraphs are never split, so a
   * paragraph longer than this still ends up in a chunk as a whole.
   */
  chunkSize: number;
  /**
   * Number of overlapping characters between consecutive chunks.
   * Overlap is carried over in whole paragraphs, so the actual overlap can
   * differ from this value. A value of 0 or less disables overlap.
   */
  chunkOverlap: number;
}

/** Defaults used by `chunkText` for any option the caller does not supply. */
const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
  // Target size of one chunk, in characters.
  chunkSize: 512,
  // Requested overlap between consecutive chunks, in characters.
  chunkOverlap: 50,
};

/**
 * Split content into overlapping chunks suitable for embedding.
 *
 * Strategy:
 * 1. Split on paragraph boundaries (one or more blank lines).
 * 2. Merge consecutive paragraphs until adding the next one would push the
 *    joined chunk text past `chunkSize`.
 * 3. Track 1-based line numbers of every paragraph through the splits.
 * 4. Seed each new chunk with trailing paragraphs of the previous chunk
 *    for context continuity.
 *
 * Paragraphs are never split. A single paragraph longer than `chunkSize`
 * is kept whole, and because overlap is carried in whole paragraphs a chunk
 * that starts with overlap may also exceed `chunkSize`.
 *
 * Both `\n` and `\r\n` line endings are accepted; lines inside a paragraph
 * are re-joined with `\n`.
 *
 * @param content - Raw text to chunk.
 * @param namespace - Memory namespace, copied verbatim onto every chunk.
 * @param options - Optional overrides. Options that are missing or
 *   explicitly `undefined` fall back to the defaults.
 * @returns Chunks in document order; an empty array when `content`
 *   contains no non-blank line.
 */
export function chunkText(
  content: string,
  namespace: string,
  options?: Partial<ChunkOptions>,
): Chunk[] {
  // Resolve per field with `??` instead of spreading `options`: a spread
  // would let `{ chunkSize: undefined }` clobber the default and turn the
  // size comparison below into `x > undefined`, which is always false.
  const chunkSize = options?.chunkSize ?? DEFAULT_CHUNK_OPTIONS.chunkSize;
  const chunkOverlap =
    options?.chunkOverlap ?? DEFAULT_CHUNK_OPTIONS.chunkOverlap;

  if (content.trim().length === 0) {
    return [];
  }

  type Paragraph = { text: string; startLine: number; endLine: number };

  // Accept CRLF as well as LF so chunk text carries no stray '\r'.
  const lines = content.split(/\r?\n/);

  // Build paragraph groups: each paragraph is a contiguous run of
  // non-blank lines, delimited by blank (whitespace-only) lines.
  const paragraphs: Paragraph[] = [];
  let currentLines: string[] = [];
  let currentStart = 1; // 1-based line of the paragraph being collected

  lines.forEach((line, index) => {
    const lineNum = index + 1; // 1-based

    if (line.trim() !== '') {
      if (currentLines.length === 0) {
        currentStart = lineNum;
      }
      currentLines.push(line);
    } else if (currentLines.length > 0) {
      // Blank line closes the paragraph, which ended on the previous line.
      paragraphs.push({
        text: currentLines.join('\n'),
        startLine: currentStart,
        endLine: lineNum - 1,
      });
      currentLines = [];
    }
  });

  // Flush a paragraph that runs to the end of the content.
  if (currentLines.length > 0) {
    paragraphs.push({
      text: currentLines.join('\n'),
      startLine: currentStart,
      endLine: lines.length,
    });
  }

  // buildChunk joins paragraphs with '\n\n', so each join costs 2 characters.
  const SEPARATOR_LENGTH = 2;

  // Length of the text buildChunk would produce for these paragraphs.
  const joinedLength = (paras: Paragraph[]): number =>
    paras.reduce((sum, p) => sum + p.text.length, 0) +
    Math.max(0, paras.length - 1) * SEPARATOR_LENGTH;

  // Merge paragraphs into chunks, respecting the target size.
  const chunks: Chunk[] = [];
  let pending: Paragraph[] = [];
  let pendingLength = 0; // always equals joinedLength(pending)

  for (const para of paragraphs) {
    const paraLength = para.text.length;

    // If appending this paragraph would exceed the target, emit what we have.
    if (
      pending.length > 0 &&
      pendingLength + SEPARATOR_LENGTH + paraLength > chunkSize
    ) {
      chunks.push(buildChunk(pending, namespace));
      // Start the next chunk with trailing paragraphs of the one just emitted.
      pending = getOverlapParagraphs(pending, chunkOverlap);
      pendingLength = joinedLength(pending);
    }

    pendingLength += paraLength + (pending.length > 0 ? SEPARATOR_LENGTH : 0);
    pending.push(para);
  }

  // Flush remaining paragraphs as the final chunk.
  if (pending.length > 0) {
    chunks.push(buildChunk(pending, namespace));
  }

  return chunks;
}

/**
 * Assemble a Chunk from an ordered, non-empty list of paragraphs.
 *
 * The paragraph texts are joined with a blank line between them, and the
 * chunk's line range runs from the first paragraph's start line to the
 * last paragraph's end line.
 */
function buildChunk(
  paragraphs: { text: string; startLine: number; endLine: number }[],
  namespace: string,
): Chunk {
  const pieces: string[] = [];
  for (const entry of paragraphs) {
    pieces.push(entry.text);
  }

  const head = paragraphs[0];
  const tail = paragraphs[paragraphs.length - 1];

  return {
    text: pieces.join('\n\n'),
    namespace,
    startLine: head.startLine,
    endLine: tail.endLine,
  };
}

/**
 * Pick the trailing paragraphs of a chunk to carry over as overlap.
 *
 * Scans backwards from the last paragraph, extending the selection by one
 * whole paragraph at a time, and stops as soon as the selected paragraph
 * text totals at least `overlapChars` characters. If that total is never
 * reached, every paragraph is selected. A non-positive `overlapChars`
 * selects nothing.
 *
 * @returns A new array holding the selected paragraphs in original order.
 */
function getOverlapParagraphs(
  paragraphs: { text: string; startLine: number; endLine: number }[],
  overlapChars: number,
): { text: string; startLine: number; endLine: number }[] {
  if (overlapChars <= 0) {
    return [];
  }

  // Index of the earliest paragraph included in the overlap.
  let firstKept = paragraphs.length;
  let accumulated = 0;

  while (firstKept > 0) {
    firstKept -= 1;
    accumulated += paragraphs[firstKept].text.length;
    if (accumulated >= overlapChars) {
      break;
    }
  }

  return paragraphs.slice(firstKept);
}