// flynn/src/utils/html.ts

/**
 * HTML sanitization utilities for converting HTML content to clean plain text.
 * Used primarily for Gmail API output where snippets and bodies contain
 * HTML entities and tags that shouldn't leak into tool output.
 */

/**
 * Named HTML entities to their character equivalents.
 *
 * Keys are the complete entity text in lowercase, because `decodeEntity`
 * lowercases its input before the lookup. `&#39;` is technically a numeric
 * reference; it is listed here so the common apostrophe case resolves through
 * the table. `&nbsp;` maps to a regular space (U+0020), not U+00A0.
 */
const NAMED_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
};

/**
 * Replacement code points for numeric references in the C1 range (0x80–0x9F).
 *
 * Content produced by legacy Windows tools uses e.g. `&#146;` for a right
 * single quote. The HTML Standard maps these references to the characters
 * Windows-1252 assigns to those bytes rather than to invisible C1 controls.
 * Values with no entry (0x81, 0x8D, 0x8F, 0x90, 0x9D) are left as-is.
 */
const C1_CODE_POINT_REMAP: ReadonlyMap<number, number> = new Map([
  [0x80, 0x20ac], // €
  [0x82, 0x201a], // ‚
  [0x83, 0x0192], // ƒ
  [0x84, 0x201e], // „
  [0x85, 0x2026], // …
  [0x86, 0x2020], // †
  [0x87, 0x2021], // ‡
  [0x88, 0x02c6], // ˆ
  [0x89, 0x2030], // ‰
  [0x8a, 0x0160], // Š
  [0x8b, 0x2039], // ‹
  [0x8c, 0x0152], // Œ
  [0x8e, 0x017d], // Ž
  [0x91, 0x2018], // ‘
  [0x92, 0x2019], // ’
  [0x93, 0x201c], // “
  [0x94, 0x201d], // ”
  [0x95, 0x2022], // •
  [0x96, 0x2013], // –
  [0x97, 0x2014], // —
  [0x98, 0x02dc], // ˜
  [0x99, 0x2122], // ™
  [0x9a, 0x0161], // š
  [0x9b, 0x203a], // ›
  [0x9c, 0x0153], // œ
  [0x9e, 0x017e], // ž
  [0x9f, 0x0178], // Ÿ
]);

/**
 * Turn the numeric value of a character reference into text.
 *
 * @param code Parsed numeric value of the reference (may be NaN or Infinity)
 * @param fallback Text returned when `code` is not a usable code point
 * @returns The decoded character, or `fallback` for 0, values above U+10FFFF,
 *   non-finite values, and UTF-16 surrogates (U+D800–U+DFFF)
 */
function codePointToString(code: number, fallback: string): string {
  const inUnicodeRange = code > 0 && code <= 0x10ffff;
  // A surrogate passes the range check, but String.fromCodePoint would emit a
  // lone surrogate, which is ill-formed Unicode. Treat it like any other
  // invalid reference and keep the original text.
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  if (!inUnicodeRange || isSurrogate) {return fallback;}

  const remapped = C1_CODE_POINT_REMAP.get(code);
  return String.fromCodePoint(remapped !== undefined ? remapped : code);
}

/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * Named entities are matched case-insensitively against `NAMED_ENTITIES`.
 * Numeric references are decoded by code point; references in 0x80–0x9F are
 * mapped to their Windows-1252 characters.
 *
 * @param entity Complete entity text including the leading `&` and trailing `;`
 * @returns The decoded character, or `entity` unchanged when it is not a
 *   recognised entity or names an invalid code point
 */
function decodeEntity(entity: string): string {
  // Named entity. The typeof guard keeps inherited Object.prototype members
  // (e.g. a lookup of "constructor") from being returned as if they were text.
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (typeof named === 'string') {return named;}

  // Decimal numeric entity: &#NNN;
  const decMatch = entity.match(/^&#(\d+);$/);
  if (decMatch) {
    return codePointToString(parseInt(decMatch[1], 10), entity);
  }

  // Hex numeric entity: &#xHH;
  const hexMatch = entity.match(/^&#x([0-9a-fA-F]+);$/);
  if (hexMatch) {
    return codePointToString(parseInt(hexMatch[1], 16), entity);
  }

  return entity;
}

/**
 * Sanitize HTML content to plain text.
 *
 * Processing order:
 * 1. Removes HTML comments, including conditional comments, with their content
 * 2. Removes `<script>` and `<style>` elements together with their content
 * 3. Converts `<br>` tags (with or without attributes) to newlines
 * 4. Strips all remaining HTML tags
 * 5. Decodes HTML entities (named, decimal, hex). This runs after tag
 *    stripping, so encoded text such as `&lt;b&gt;` comes out as a literal `<b>`
 * 6. Collapses each run of whitespace other than newlines to a single space
 * 7. Trims every line, reduces 3+ consecutive newlines to 2, trims the result
 *
 * Block-level tags other than `<br>` (`<p>`, `<div>`, …) are removed without
 * inserting a line break.
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string when `text` is empty
 */
export function sanitizeHtml(text: string): string {
  if (!text) {return '';}

  let result = text;

  // Remove comments as a unit. A generic tag pattern would stop at the first
  // `>` inside the comment and leak the remainder (and the hidden content of
  // conditional comments) into the output.
  result = result.replace(/<!--[\s\S]*?-->/g, '');

  // Remove <script>/<style> elements including their bodies; their content is
  // code or CSS, never readable text.
  result = result.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');

  // Convert <br> variants to newlines (before stripping tags). `\b[^>]*` also
  // covers forms with attributes such as <br clear="all">.
  result = result.replace(/<br\b[^>]*>/gi, '\n');

  // Strip all remaining HTML tags. A `<` only opens a tag when followed by a
  // letter, `/`, `!` or `?`, so plain text like "1 < 2 and 3 > 2" is kept.
  result = result.replace(/<[a-zA-Z\/!?][^>]*>/g, '');

  // Decode HTML entities
  result = result.replace(/&(?:#x[0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity);

  // Collapse runs of whitespace other than newlines into a single space
  result = result.replace(/[^\S\n]+/g, ' ');

  // Trim each line and remove excessive blank lines (3+ consecutive → 2)
  result = result
    .split('\n')
    .map(line => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n');

  return result.trim();
}