/**
 * HTML sanitization utilities for converting HTML content to clean plain text.
 * Used primarily for Gmail API output where snippets and bodies contain
 * HTML entities and tags that shouldn't leak into tool output.
 */

/** Named HTML entities (lowercase, including `&` and `;`) mapped to their characters. */
const NAMED_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
};

/** Highest valid Unicode code point. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Convert a numeric character reference value to its character.
 *
 * @param code Parsed code point value
 * @param fallback Text returned when `code` is zero or outside the Unicode range
 */
function codePointOrFallback(code: number, fallback: string): string {
  return code > 0 && code <= MAX_CODE_POINT ? String.fromCodePoint(code) : fallback;
}

/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * @param entity Complete entity text including the leading `&` and trailing `;`
 * @returns The decoded character, or `entity` unchanged when it is not recognized
 *          or its numeric value is not a usable code point
 */
function decodeEntity(entity: string): string {
  // Named entity (lookup is case-insensitive, so `&AMP;` also decodes)
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (named !== undefined) {
    return named;
  }

  // Decimal numeric entity: &#NNN;
  const decDigits = /^&#(\d+);$/.exec(entity)?.[1];
  if (decDigits !== undefined) {
    return codePointOrFallback(parseInt(decDigits, 10), entity);
  }

  // Hex numeric entity: &#xHH; (the `x` may be upper- or lowercase)
  const hexDigits = /^&#[xX]([0-9a-fA-F]+);$/.exec(entity)?.[1];
  if (hexDigits !== undefined) {
    return codePointOrFallback(parseInt(hexDigits, 16), entity);
  }

  return entity;
}

/**
 * Sanitize HTML content to plain text.
 *
 * - Converts `<br>` variants to newlines
 * - Strips all remaining HTML tags
 * - Decodes HTML entities (named, decimal, hex)
 * - Collapses runs of spaces/tabs on each line (preserves newlines)
 * - Trims each line and limits consecutive newlines to two
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string when `text` is empty
 */
export function sanitizeHtml(text: string): string {
  if (!text) {
    return '';
  }

  let result = text;

  // Convert <br> variants (<br>, <br/>, <br />, any case) to newlines before stripping tags
  result = result.replace(/<br\s*\/?>/gi, '\n');

  // Strip all remaining HTML tags
  result = result.replace(/<[^>]+>/g, '');

  // Decode HTML entities. This runs after tag stripping so that escaped markup
  // such as `&lt;b&gt;` survives as the literal text `<b>`. Each entity is decoded
  // exactly once (`&amp;lt;` becomes `&lt;`, not `<`).
  result = result.replace(/&(?:#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity) =>
    decodeEntity(entity),
  );

  // Collapse runs of horizontal whitespace (anything matching \s except \n) to one space
  result = result.replace(/[^\S\n]+/g, ' ');

  // Trim each line and remove excessive blank lines (3+ consecutive newlines → 2)
  result = result
    .split('\n')
    .map((line) => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n');

  return result.trim();
}