Files
flynn/src/utils/html.ts
T
William Valentin 6090508bad style: auto-fix ESLint issues (curly braces and formatting)
- Add curly braces to all if/else/for/while statements
- Fix indentation and trailing spaces
- Auto-fixed 372 linting errors using eslint --fix
- Remaining issues are warnings only (non-null assertions, explicit any types)
2026-02-11 10:30:24 -08:00

80 lines
2.2 KiB
TypeScript

/**
* HTML sanitization utilities for converting HTML content to clean plain text.
* Used primarily for Gmail API output where snippets and bodies contain
* HTML entities and tags that shouldn't leak into tool output.
*/
/**
 * Named HTML character references mapped to the text they decode to.
 *
 * Keys are lowercase and include the leading `&` and the trailing `;`.
 * `decodeEntity` lowercases a reference before looking it up here, so only
 * add names whose meaning does not depend on letter case (for example, do
 * not add `&eacute;`: `&Eacute;` would then decode to the wrong letter).
 *
 * `&nbsp;` deliberately maps to a regular space rather than U+00A0.
 * `&#39;` is a numeric reference kept here as a fast path for the apostrophe.
 */
const NAMED_ENTITIES: Record<string, string> = {
  // Markup-significant characters
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
  // Dashes, ellipsis and list punctuation
  '&ndash;': '\u2013',
  '&mdash;': '\u2014',
  '&hellip;': '\u2026',
  '&bull;': '\u2022',
  '&middot;': '\u00b7',
  // Typographic quotes
  '&lsquo;': '\u2018',
  '&rsquo;': '\u2019',
  '&ldquo;': '\u201c',
  '&rdquo;': '\u201d',
  '&laquo;': '\u00ab',
  '&raquo;': '\u00bb',
  // Legal marks and currency
  '&copy;': '\u00a9',
  '&reg;': '\u00ae',
  '&trade;': '\u2122',
  '&euro;': '\u20ac',
  '&pound;': '\u00a3',
};
/**
 * Decode a single HTML character reference to the text it represents.
 *
 * Supported forms:
 * - named, e.g. `&amp;` (looked up case-insensitively in `NAMED_ENTITIES`)
 * - decimal, e.g. `&#8212;`
 * - hexadecimal, e.g. `&#x2014;` or `&#X2014;`
 *
 * @param entity The complete reference, including the leading `&` and trailing `;`
 * @returns The decoded text, or `entity` unchanged when the reference is not
 *   recognised or names a code point that cannot be turned into text
 *   (zero, a surrogate, or anything above U+10FFFF)
 */
function decodeEntity(entity: string): string {
  // Converts a parsed code point to text; invalid code points keep the original reference.
  const fromCode = (code: number): string => {
    const inRange = code > 0 && code <= 0x10ffff;
    // A surrogate on its own is not a character: decoding it would yield an
    // ill-formed string, so it is treated like any other invalid code point.
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    return inRange && !isSurrogate ? String.fromCodePoint(code) : entity;
  };

  // Named entity
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (named !== undefined) {
    return named;
  }

  // Decimal numeric entity: &#NNN;
  const decMatch = entity.match(/^&#(\d+);$/);
  if (decMatch) {
    return fromCode(parseInt(decMatch[1], 10));
  }

  // Hex numeric entity: &#xHH; — HTML accepts both `x` and `X` as the marker
  const hexMatch = entity.match(/^&#[xX]([0-9a-fA-F]+);$/);
  if (hexMatch) {
    return fromCode(parseInt(hexMatch[1], 16));
  }

  return entity;
}
/**
 * Sanitize HTML content to plain text.
 *
 * - Removes HTML comments and `<script>` / `<style>` elements together with
 *   their content, so neither markup notes nor CSS/JS source end up in the text
 * - Converts `<br>` variants (including self-closing and attribute-carrying
 *   forms such as `<br clear="all">`) to newlines
 * - Strips all remaining HTML tags
 * - Passes character references (named, decimal, hex) to `decodeEntity`
 * - Collapses runs of spaces/tabs on each line (preserves newlines)
 * - Trims every line and limits consecutive blank lines to one
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string for empty input
 */
export function sanitizeHtml(text: string): string {
  if (!text) {
    return '';
  }

  const flattened = text
    // Comments and script/style elements are removed whole. Doing it in one
    // pass lets whichever construct opens first win, so a comment inside a
    // stylesheet (or a stylesheet inside a comment) is handled consistently.
    // Must run before generic tag stripping, which would otherwise keep the
    // element content and cut comments at their first `>`.
    .replace(/<!--[\s\S]*?-->|<(script|style)(?=[\s\/>])[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
    // Convert <br> variants to newlines (before stripping tags). The lookahead
    // keeps longer tag names that merely start with "br" from matching.
    .replace(/<br(?=[\s\/>])[^>]*>/gi, '\n')
    // Strip all remaining HTML tags
    .replace(/<[^>]+>/g, '')
    // Decode HTML entities. This runs after tag stripping so that escaped
    // markup such as `&lt;b&gt;` survives as literal text.
    .replace(/&(?:#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, entity => decodeEntity(entity))
    // Collapse runs of horizontal whitespace (anything but \n) to one space
    .replace(/[^\S\n]+/g, ' ');

  // Trim each line and remove excessive blank lines (3+ consecutive newlines → 2)
  return flattened
    .split('\n')
    .map(line => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}