fix(gmail): sanitize HTML entities and tags in tool output
Gmail API returns snippets with HTML entities and tags (&amp;, &#39;, <br>, etc.) that leaked into LLM responses as raw HTML. Added a shared sanitizeHtml() utility in src/utils/html.ts and applied it to gmail tool snippets, the HTML body fallback, and gmail watcher snippets.
This commit is contained in:
@@ -0,0 +1,79 @@
|
||||
/**
|
||||
* HTML sanitization utilities for converting HTML content to clean plain text.
|
||||
* Used primarily for Gmail API output where snippets and bodies contain
|
||||
* HTML entities and tags that shouldn't leak into tool output.
|
||||
*/
|
||||
|
||||
/**
 * Named HTML entities mapped to their character equivalents.
 *
 * Each key is the complete, lowercase entity text — leading `&` and trailing
 * `;` included — because `decodeEntity` looks up the whole matched entity
 * after lowercasing it. `&#39;` is listed alongside the true named entities
 * so the most common apostrophe encoding resolves with a plain table lookup.
 */
const NAMED_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
};

/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * @param entity Complete entity text including `&` and `;`, e.g. `&amp;`,
 *   `&#39;` or `&#x27;`. Named entities are matched case-insensitively.
 * @returns The decoded character. Numeric references that name a UTF-16
 *   surrogate (U+D800–U+DFFF) decode to U+FFFD. Anything that cannot be
 *   decoded — an unknown name, code point 0, or a value above U+10FFFF — is
 *   returned unchanged.
 */
function decodeEntity(entity: string): string {
  // Named entity (table keys are lowercase, so normalise before the lookup).
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (named !== undefined) return named;

  // Numeric entity: group 1 holds hex digits (&#xHH;), group 2 decimal digits (&#NNN;).
  const numeric = /^&#(?:x([0-9a-fA-F]+)|(\d+));$/.exec(entity);
  if (!numeric) return entity;

  const [, hexDigits, decDigits] = numeric;
  const code = hexDigits !== undefined ? parseInt(hexDigits, 16) : parseInt(decDigits, 10);

  // Outside the Unicode range (or NUL): leave the text exactly as it was written.
  if (!(code > 0 && code <= 0x10ffff)) return entity;

  // String.fromCodePoint would happily emit a lone surrogate here, which is
  // not a valid character on its own; substitute the replacement character.
  if (code >= 0xd800 && code <= 0xdfff) return '\uFFFD';

  return String.fromCodePoint(code);
}
|
||||
|
||||
/**
 * Sanitize HTML content to plain text.
 *
 * Processing order:
 * - Removes HTML comments and `<script>` / `<style>` elements together with
 *   their contents, so code and CSS never end up in the text output
 * - Converts `<br>` variants to newlines
 * - Strips all remaining HTML tags (the text between them is kept)
 * - Decodes HTML entities (named, decimal, hex)
 * - Collapses runs of spaces/tabs on each line (preserves newlines)
 * - Trims every line and reduces 3+ consecutive newlines to 2
 *
 * Tags are stripped before entities are decoded, so escaped markup such as
 * `&lt;b&gt;` survives as the literal text `<b>` rather than being removed.
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string when `text` is empty
 */
export function sanitizeHtml(text: string): string {
  if (!text) return '';

  const visibleMarkup = text
    // Comments first: they may wrap markup that must not be read as content.
    .replace(/<!--[\s\S]*?-->/g, '')
    // Drop script/style elements wholesale; stripping only their tags would
    // leave JavaScript or CSS source behind as if it were message text.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');

  const plainText = visibleMarkup
    // Convert <br> variants to newlines (before stripping tags)
    .replace(/<br\s*\/?>/gi, '\n')
    // Strip all remaining HTML tags
    .replace(/<[^>]+>/g, '')
    // Decode HTML entities
    .replace(/&(?:#x[0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity) => decodeEntity(entity))
    // Collapse runs of horizontal whitespace (anything \s except newline)
    .replace(/[^\S\n]+/g, ' ');

  // Trim each line and remove excessive blank lines (3+ consecutive → 2)
  const tidied = plainText
    .split('\n')
    .map((line) => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n');

  return tidied.trim();
}
|
||||
Reference in New Issue
Block a user