fix(gmail): sanitize HTML entities and tags in tool output

Gmail API returns snippets containing HTML entities (&amp;, &#39;, etc.) and
tags (<br>, etc.) that leaked into LLM responses as raw HTML. Added a shared
sanitizeHtml() utility in src/utils/html.ts and applied it to gmail tool
snippets, the HTML body fallback, and gmail watcher snippets.
This commit is contained in:
William Valentin
2026-02-10 16:30:14 -08:00
parent 4317492e4b
commit 4ce8e81c01
6 changed files with 281 additions and 4 deletions
+119
View File
@@ -0,0 +1,119 @@
import { describe, it, expect } from 'vitest';
import { sanitizeHtml } from './html.js';
// Behavioural spec for sanitizeHtml(): <br> conversion, tag stripping,
// entity decoding, whitespace normalisation, and Gmail-shaped inputs.
describe('sanitizeHtml', () => {
  it('returns empty string for empty/falsy input', () => {
    expect(sanitizeHtml('')).toBe('');
    // Deliberately bypasses the type to check the runtime guard against
    // callers that hand over a missing snippet.
    expect(sanitizeHtml(undefined as unknown as string)).toBe('');
  });

  it('passes through clean text unchanged', () => {
    expect(sanitizeHtml('Hello world')).toBe('Hello world');
    expect(sanitizeHtml('No entities here')).toBe('No entities here');
  });

  // ── <br> conversion ────────────────────────────────────────────────────
  it('converts <br> to newline', () => {
    expect(sanitizeHtml('Hello<br>World')).toBe('Hello\nWorld');
  });

  it('converts <br/> to newline', () => {
    expect(sanitizeHtml('Hello<br/>World')).toBe('Hello\nWorld');
  });

  it('converts <br /> to newline', () => {
    expect(sanitizeHtml('Hello<br />World')).toBe('Hello\nWorld');
  });

  it('converts <BR> (uppercase) to newline', () => {
    expect(sanitizeHtml('Hello<BR>World')).toBe('Hello\nWorld');
  });

  // ── Tag stripping ─────────────────────────────────────────────────────
  it('strips HTML tags', () => {
    expect(sanitizeHtml('<p>Hello</p>')).toBe('Hello');
    expect(sanitizeHtml('<b>bold</b> and <i>italic</i>')).toBe('bold and italic');
  });

  it('strips complex nested tags', () => {
    expect(sanitizeHtml('<div class="foo"><span>text</span></div>')).toBe('text');
  });

  // ── Named entity decoding ────────────────────────────────────────────
  it('decodes &amp; to &', () => {
    expect(sanitizeHtml('Tom &amp; Jerry')).toBe('Tom & Jerry');
  });

  it('decodes &lt; and &gt;', () => {
    expect(sanitizeHtml('a &lt; b &gt; c')).toBe('a < b > c');
  });

  it('decodes &quot; and &#39;', () => {
    expect(sanitizeHtml('He said &quot;hello&quot; and it&#39;s fine')).toBe(
      'He said "hello" and it\'s fine',
    );
  });

  it('decodes &nbsp; to space', () => {
    expect(sanitizeHtml('hello&nbsp;world')).toBe('hello world');
  });

  it('decodes &apos;', () => {
    expect(sanitizeHtml('it&apos;s')).toBe("it's");
  });

  // ── Numeric entity decoding ──────────────────────────────────────────
  it('decodes decimal numeric entities (&#NNN;)', () => {
    expect(sanitizeHtml('&#169;')).toBe('©'); // ©
    expect(sanitizeHtml('&#8364;')).toBe('€'); // €
  });

  it('decodes hex numeric entities (&#xHH;)', () => {
    expect(sanitizeHtml('&#x2F;')).toBe('/');
    expect(sanitizeHtml('&#xA9;')).toBe('©');
  });

  // ── Whitespace handling ──────────────────────────────────────────────
  it('collapses multiple spaces to single space', () => {
    // The input must contain real runs of whitespace; a single space would
    // pass even if collapsing were broken.
    expect(sanitizeHtml('hello    world')).toBe('hello world');
    expect(sanitizeHtml('hello \t\t world')).toBe('hello world');
  });

  it('preserves intentional newlines from <br> conversion', () => {
    expect(sanitizeHtml('line1<br><br>line3')).toBe('line1\n\nline3');
  });

  it('collapses 3+ consecutive newlines to 2', () => {
    expect(sanitizeHtml('a<br><br><br><br>b')).toBe('a\n\nb');
  });

  // ── Realistic Gmail scenarios ────────────────────────────────────────
  it('handles a realistic Gmail snippet with HTML entities', () => {
    const snippet = 'William, an exceptional credit rating is this many points away&#8230; Don&#39;t miss out &amp; check now';
    expect(sanitizeHtml(snippet)).toBe(
      "William, an exceptional credit rating is this many points away\u2026 Don't miss out & check now",
    );
  });

  it('handles Gmail HTML body fallback with tags and entities', () => {
    const html = '<html><body><p>Hello William,</p><br><p>Your balance is &lt;$500&gt;.</p><br><p>Thanks &amp; regards,<br>Bank</p></body></html>';
    const result = sanitizeHtml(html);
    expect(result).toContain('Hello William,');
    // Entities are decoded after tag stripping, so the decoded "<$500>" is
    // not mistaken for a tag.
    expect(result).toContain('Your balance is <$500>.');
    expect(result).toContain('Thanks & regards,');
    expect(result).not.toContain('&amp;');
    expect(result).not.toContain('&lt;');
    expect(result).not.toContain('<p>');
    expect(result).not.toContain('<html>');
  });

  it('handles double-encoded entities (does not double-decode)', () => {
    // &amp;amp; should become &amp; (one level), not &
    expect(sanitizeHtml('&amp;amp;')).toBe('&amp;');
  });
});
+79
View File
@@ -0,0 +1,79 @@
/**
* HTML sanitization utilities for converting HTML content to clean plain text.
* Used primarily for Gmail API output where snippets and bodies contain
* HTML entities and tags that shouldn't leak into tool output.
*/
/**
 * Named HTML entities mapped to their character equivalents.
 * Keys are lowercase and include the leading `&` and trailing `;`.
 * Numeric references such as `&#39;` are handled separately by decodeEntity().
 */
const NAMED_ENTITIES: Readonly<Record<string, string>> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'",
  '&nbsp;': ' ',
};

/**
 * Matches one complete numeric character reference.
 * Group 1 captures hex digits (`&#x…;` / `&#X…;`), group 2 decimal digits (`&#…;`).
 */
const NUMERIC_ENTITY = /^&#(?:[xX]([0-9a-fA-F]+)|(\d+));$/;

/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * @param entity The full entity text, including `&` and `;`
 * @returns The decoded character, or `entity` unchanged when it is an unknown
 *   name or a numeric reference that does not denote a valid Unicode scalar
 *   value (zero, a surrogate, or above U+10FFFF)
 */
function decodeEntity(entity: string): string {
  // Named entity (lookup is case-insensitive, so `&AMP;` also decodes).
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (named !== undefined) return named;

  const numeric = NUMERIC_ENTITY.exec(entity);
  if (!numeric) return entity;

  const [, hexDigits, decDigits] = numeric;
  const code =
    hexDigits !== undefined ? parseInt(hexDigits, 16) : parseInt(decDigits ?? '', 10);

  // Surrogates are excluded: decoding one would put a lone surrogate in the
  // output and make the string ill-formed.
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  const isValid = code > 0 && code <= 0x10ffff && !isSurrogate;
  return isValid ? String.fromCodePoint(code) : entity;
}

/**
 * Sanitize HTML content to plain text.
 *
 * - Removes HTML comments and `<script>` / `<style>` elements including their
 *   contents, so CSS and JavaScript source never reaches the output
 * - Converts `<br>` variants to newlines
 * - Strips all remaining HTML tags
 * - Decodes HTML entities (named, decimal, hex) exactly once
 * - Collapses runs of spaces/tabs on each line (preserves newlines)
 * - Trims every line and limits consecutive newlines to two
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string for empty/falsy input
 */
export function sanitizeHtml(text: string): string {
  if (!text) return '';

  let result = text;

  // Drop comments first: they may contain `>` (e.g. Outlook conditional
  // comments), which the generic tag pattern below would only partly remove.
  result = result.replace(/<!--[\s\S]*?-->/g, '');

  // Drop non-content elements together with their bodies. The backreference
  // pairs each opener with the matching closer.
  result = result.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');

  // Convert <br> variants to newlines (before stripping tags)
  result = result.replace(/<br\s*\/?>/gi, '\n');

  // Strip all remaining HTML tags
  result = result.replace(/<[^>]+>/g, '');

  // Decode HTML entities. This runs after tag stripping so that decoded
  // `<` / `>` characters are kept as text, and in a single pass so that
  // double-encoded input (`&amp;amp;`) is decoded one level only.
  result = result.replace(/&(?:#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity);

  // Collapse runs of horizontal whitespace (spaces/tabs) on each line, but preserve newlines
  result = result.replace(/[^\S\n]+/g, ' ');

  // Trim each line and remove excessive blank lines (3+ consecutive → 2)
  result = result
    .split('\n')
    .map(line => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n');

  return result.trim();
}