6090508bad
- Add curly braces to all if/else/for/while statements - Fix indentation and trailing spaces - Auto-fixed 372 linting errors using eslint --fix - Remaining issues are warnings only (non-null assertions, explicit any types)
80 lines
2.2 KiB
TypeScript
80 lines
2.2 KiB
TypeScript
/**
|
|
* HTML sanitization utilities for converting HTML content to clean plain text.
|
|
* Used primarily for Gmail API output where snippets and bodies contain
|
|
* HTML entities and tags that shouldn't leak into tool output.
|
|
*/
|
|
|
|
/**
 * Named HTML entities mapped to their character equivalents.
 *
 * Keys are complete entity references (leading `&`, trailing `;`) written in
 * lower case, because the lookup lower-cases the reference before indexing.
 * `&#39;` is listed explicitly alongside `&apos;` since both are common
 * spellings of the apostrophe. `&nbsp;` maps to a plain space rather than
 * U+00A0 so that it collapses like ordinary whitespace.
 */
const NAMED_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
};
|
|
|
|
/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * Named references are looked up case-insensitively in `NAMED_ENTITIES`.
 * Numeric references accept both `&#NNN;` and `&#xHH;` / `&#XHH;`.
 *
 * @param entity Complete entity reference, including the leading `&` and
 *   trailing `;` (e.g. `&amp;`, `&#65;`, `&#x41;`)
 * @returns The decoded character, or `entity` unchanged when it is not a
 *   known name or does not denote a decodable code point
 */
function decodeEntity(entity: string): string {
  // Named entity. Compare against undefined explicitly rather than relying on
  // truthiness of the mapped value.
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (named !== undefined) {
    return named;
  }

  // Numeric entity: group 1 holds hex digits (&#xHH; / &#XHH;), group 2 holds
  // decimal digits (&#NNN;). Exactly one of them is set on a match.
  const numeric = /^&#(?:[xX]([0-9a-fA-F]+)|(\d+));$/.exec(entity);
  if (numeric === null) {
    return entity;
  }

  const hexDigits = numeric[1];
  const code =
    hexDigits !== undefined ? parseInt(hexDigits, 16) : parseInt(numeric[2] ?? '', 10);

  // Leave the reference untouched when it cannot become a valid character:
  // NUL, values beyond the Unicode range, and surrogate code points (which
  // String.fromCodePoint would turn into a lone, ill-formed surrogate).
  const inRange = code > 0 && code <= 0x10ffff;
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  return inRange && !isSurrogate ? String.fromCodePoint(code) : entity;
}
|
|
|
|
/**
 * Sanitize HTML content to plain text.
 *
 * - Converts `<br>` variants (including ones carrying attributes) to newlines
 * - Strips all remaining HTML tags, comments, doctypes and processing
 *   instructions; a bare `<` that does not start markup is kept as text
 * - Decodes HTML entities (named, decimal, hex)
 * - Collapses runs of non-newline whitespace on each line into one space
 * - Trims every line, reduces 3+ consecutive newlines to 2, trims the result
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; the empty string when `text` is empty
 */
export function sanitizeHtml(text: string): string {
  if (!text) {
    return '';
  }

  const decoded = text
    // Convert <br> variants to newlines (before stripping tags).
    .replace(/<br\b[^>]*>/gi, '\n')
    // Strip remaining markup. Only `<` followed by a letter, `/letter`, `!`
    // or `?` opens markup, so text such as "1 < 2 and 3 > 2" is preserved.
    .replace(/<(?:\/?[a-zA-Z]|[!?])[^>]*>/g, '')
    // Decode entities after tag stripping so that encoded markup such as
    // `&lt;b&gt;` ends up as literal text instead of being removed.
    .replace(/&(?:#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity) => decodeEntity(entity));

  // Collapse every run of whitespace that is not a newline into one space.
  const collapsed = decoded.replace(/[^\S\n]+/g, ' ');

  // Trim each line individually, then limit consecutive newlines to two
  // (i.e. at most one blank line between paragraphs).
  const tidied = collapsed
    .split('\n')
    .map((line) => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n');

  return tidied.trim();
}
|