From 4ce8e81c01c661d2ae36304cc908c700187022bd Mon Sep 17 00:00:00 2001 From: William Valentin Date: Tue, 10 Feb 2026 16:30:14 -0800 Subject: [PATCH] fix(gmail): sanitize HTML entities and tags in tool output Gmail API returns snippets with HTML entities (&amp;, &#39;, <br>, etc.) that leaked into LLM responses as raw HTML. Added shared sanitizeHtml() utility in src/utils/html.ts and applied it to gmail tool snippets, HTML body fallback, and gmail watcher snippets. --- docs/plans/state.json | 15 ++++ src/automation/gmail.ts | 3 +- src/tools/builtin/gmail.test.ts | 62 +++++++++++++++++ src/tools/builtin/gmail.ts | 7 +- src/utils/html.test.ts | 119 ++++++++++++++++++++++++++++++++ src/utils/html.ts | 79 +++++++++++++++++++++ 6 files changed, 281 insertions(+), 4 deletions(-) create mode 100644 src/utils/html.test.ts create mode 100644 src/utils/html.ts diff --git a/docs/plans/state.json b/docs/plans/state.json index 132e343..953c453 100644 --- a/docs/plans/state.json +++ b/docs/plans/state.json @@ -947,6 +947,21 @@ "config/default.yaml" ] }, + "gmail-html-sanitization": { + "status": "completed", + "date": "2026-02-10", + "summary": "Sanitize HTML entities and tags in Gmail tool output. Gmail API returns snippets with HTML entities (&amp;, &#39;, <br>, etc.) that leaked into LLM responses. Added shared sanitizeHtml() utility and applied to both gmail tool and gmail watcher.", + "files_created": [ + "src/utils/html.ts", + "src/utils/html.test.ts" + ], + "files_modified": [ + "src/tools/builtin/gmail.ts", + "src/tools/builtin/gmail.test.ts", + "src/automation/gmail.ts" + ], + "test_status": "21/21 passing (html.test) + 18/18 passing (gmail.test) + 16/16 passing (automation/gmail.test)" + }, "tui-fullscreen-improvements": { "status": "completed", "date": "2026-02-10", diff --git a/src/automation/gmail.ts b/src/automation/gmail.ts index ee1377d..907ba4d 100644 --- a/src/automation/gmail.ts +++ b/src/automation/gmail.ts @@ -5,6 +5,7 @@ import { homedir } from 'os'; import type { GmailConfig } from '../config/schema.js'; import type { ChannelAdapter, ChannelStatus, InboundMessage, OutboundMessage } from '../channels/types.js'; import { parseInterval } from './heartbeat.js'; +import { sanitizeHtml } from '../utils/html.js'; /** Minimal interface for the parts of ChannelRegistry we need. */ interface ChannelLookup { @@ -368,7 +369,7 @@ export class GmailWatcher implements ChannelAdapter { from: getHeader('From'), to: getHeader('To'), subject: getHeader('Subject'), - snippet: msg.data.snippet ?? '', + snippet: sanitizeHtml(msg.data.snippet ?? ''), date: getHeader('Date'), labels: msg.data.labelIds ?? 
[], }; diff --git a/src/tools/builtin/gmail.test.ts b/src/tools/builtin/gmail.test.ts index 92ece82..51046dc 100644 --- a/src/tools/builtin/gmail.test.ts +++ b/src/tools/builtin/gmail.test.ts @@ -191,6 +191,36 @@ describe('gmail.list', () => { expect(result.success).toBe(true); expect(result.output).toBe('No messages found.'); }); + + it('sanitizes HTML entities in snippets', async () => { + setupValidAuth(); + mockMessagesList.mockResolvedValue({ + data: { + messages: [{ id: 'msg1' }], + }, + }); + mockMessagesGet.mockResolvedValueOnce( + mockMessageDetails( + 'msg1', + 'experian@test.com', + 'Credit Alert', + 'Mon, 10 Feb 2026', + 'William, your score is rising's & it… Don't miss out
Check now', + ), + ); + + const [listTool] = createGmailTools(testConfig); + const result = await listTool.execute({}); + + expect(result.success).toBe(true); + expect(result.output).not.toContain('''); + expect(result.output).not.toContain('&'); + expect(result.output).not.toContain('…'); + expect(result.output).not.toContain('
'); + expect(result.output).toContain("rising's"); + expect(result.output).toContain('& it'); + expect(result.output).toContain("Don't miss out"); + }); }); describe('gmail.search', () => { @@ -360,6 +390,38 @@ describe('gmail.read', () => { expect(result.output).not.toContain(''); }); + it('decodes HTML entities in HTML-only body fallback', async () => { + setupValidAuth(); + const htmlBody = '

Hello & welcome


Price: <$100>


It's great

'; + mockMessagesGet.mockResolvedValue({ + data: { + payload: { + mimeType: 'multipart/alternative', + headers: [ + { name: 'From', value: 'sender@example.com' }, + { name: 'To', value: 'will@example.com' }, + { name: 'Subject', value: 'HTML Entities' }, + { name: 'Date', value: 'Mon, 10 Feb 2026 12:00:00 -0000' }, + ], + parts: [ + { mimeType: 'text/html', body: { data: toBase64Url(htmlBody) } }, + ], + }, + }, + }); + + const [, , readTool] = createGmailTools(testConfig); + const result = await readTool.execute({ id: 'msg-entities' }); + + expect(result.success).toBe(true); + expect(result.output).toContain('Hello & welcome'); + expect(result.output).toContain('Price: <$100>'); + expect(result.output).toContain("It's great"); + expect(result.output).not.toContain('&'); + expect(result.output).not.toContain('<'); + expect(result.output).not.toContain('''); + }); + it('returns error when credentials missing', async () => { mockExistsSync.mockReturnValue(false); const [, , readTool] = createGmailTools(testConfig); diff --git a/src/tools/builtin/gmail.ts b/src/tools/builtin/gmail.ts index 44f4632..1ad9b69 100644 --- a/src/tools/builtin/gmail.ts +++ b/src/tools/builtin/gmail.ts @@ -4,6 +4,7 @@ import { resolve } from 'path'; import { homedir } from 'os'; import type { GmailConfig } from '../../config/schema.js'; import type { Tool, ToolResult } from '../types.js'; +import { sanitizeHtml } from '../../utils/html.js'; /** Expand ~ to home directory. */ function expandPath(p: string): string { @@ -79,7 +80,7 @@ async function fetchMessageDetails( from: getHeader('From'), subject: getHeader('Subject'), date: getHeader('Date'), - snippet: msg.data.snippet ?? '', + snippet: sanitizeHtml(msg.data.snippet ?? 
''), }; } catch { return null; @@ -123,8 +124,8 @@ function extractTextBody(payload: { } } if (htmlFallback) { - // Strip HTML tags for a rough plain-text rendering - return htmlFallback.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim(); + // Convert HTML to clean plain text + return sanitizeHtml(htmlFallback); } } diff --git a/src/utils/html.test.ts b/src/utils/html.test.ts new file mode 100644 index 0000000..c5a6753 --- /dev/null +++ b/src/utils/html.test.ts @@ -0,0 +1,119 @@ +import { describe, it, expect } from 'vitest'; +import { sanitizeHtml } from './html.js'; + +describe('sanitizeHtml', () => { + it('returns empty string for empty/falsy input', () => { + expect(sanitizeHtml('')).toBe(''); + expect(sanitizeHtml(undefined as unknown as string)).toBe(''); + }); + + it('passes through clean text unchanged', () => { + expect(sanitizeHtml('Hello world')).toBe('Hello world'); + expect(sanitizeHtml('No entities here')).toBe('No entities here'); + }); + + // ──
conversion ──────────────────────────────────────────────────── + + it('converts
to newline', () => { + expect(sanitizeHtml('Hello
World')).toBe('Hello\nWorld'); + }); + + it('converts
to newline', () => { + expect(sanitizeHtml('Hello
World')).toBe('Hello\nWorld'); + }); + + it('converts
to newline', () => { + expect(sanitizeHtml('Hello
World')).toBe('Hello\nWorld'); + }); + + it('converts
(uppercase) to newline', () => { + expect(sanitizeHtml('Hello
World')).toBe('Hello\nWorld'); + }); + + // ── Tag stripping ───────────────────────────────────────────────────── + + it('strips HTML tags', () => { + expect(sanitizeHtml('

Hello

')).toBe('Hello'); + expect(sanitizeHtml('bold and italic')).toBe('bold and italic'); + }); + + it('strips complex nested tags', () => { + expect(sanitizeHtml('
text
')).toBe('text'); + }); + + // ── Named entity decoding ──────────────────────────────────────────── + + it('decodes & to &', () => { + expect(sanitizeHtml('Tom & Jerry')).toBe('Tom & Jerry'); + }); + + it('decodes < and >', () => { + expect(sanitizeHtml('a < b > c')).toBe('a < b > c'); + }); + + it('decodes " and '', () => { + expect(sanitizeHtml('He said "hello" and it's fine')).toBe( + 'He said "hello" and it\'s fine', + ); + }); + + it('decodes   to space', () => { + expect(sanitizeHtml('hello world')).toBe('hello world'); + }); + + it('decodes '', () => { + expect(sanitizeHtml('it's')).toBe("it's"); + }); + + // ── Numeric entity decoding ────────────────────────────────────────── + + it('decodes decimal numeric entities (&#NNN;)', () => { + expect(sanitizeHtml('©')).toBe('©'); // © + expect(sanitizeHtml('€')).toBe('€'); // € + }); + + it('decodes hex numeric entities (&#xHH;)', () => { + expect(sanitizeHtml('/')).toBe('/'); + expect(sanitizeHtml('©')).toBe('©'); + }); + + // ── Whitespace handling ────────────────────────────────────────────── + + it('collapses multiple spaces to single space', () => { + expect(sanitizeHtml('hello world')).toBe('hello world'); + }); + + it('preserves intentional newlines from
conversion', () => { + expect(sanitizeHtml('line1

line3')).toBe('line1\n\nline3'); + }); + + it('collapses 3+ consecutive newlines to 2', () => { + expect(sanitizeHtml('a



b')).toBe('a\n\nb'); + }); + + // ── Realistic Gmail scenarios ──────────────────────────────────────── + + it('handles a realistic Gmail snippet with HTML entities', () => { + const snippet = 'William, an exceptional credit rating is this many points away… Don't miss out & check now'; + expect(sanitizeHtml(snippet)).toBe( + "William, an exceptional credit rating is this many points away\u2026 Don't miss out & check now", + ); + }); + + it('handles Gmail HTML body fallback with tags and entities', () => { + const html = '

Hello William,


Your balance is <$500>.


Thanks & regards,
Bank

'; + const result = sanitizeHtml(html); + expect(result).toContain('Hello William,'); + expect(result).toContain('Your balance is <$500>.'); + expect(result).toContain('Thanks & regards,'); + expect(result).not.toContain('&'); + expect(result).not.toContain('<'); + expect(result).not.toContain('

'); + expect(result).not.toContain(''); + }); + + it('handles double-encoded entities (does not double-decode)', () => { + // &amp;amp; should become &amp; (one level), not & + expect(sanitizeHtml('&amp;amp;')).toBe('&amp;'); + }); +}); diff --git a/src/utils/html.ts b/src/utils/html.ts new file mode 100644 index 0000000..f6687ae --- /dev/null +++ b/src/utils/html.ts @@ -0,0 +1,79 @@ +/** + * HTML sanitization utilities for converting HTML content to clean plain text. + * Used primarily for Gmail API output where snippets and bodies contain + * HTML entities and tags that shouldn't leak into tool output. + */ + +/** Named HTML entities to their character equivalents. */ +const NAMED_ENTITIES: Record<string, string> = { + '&amp;': '&', + '&lt;': '<', + '&gt;': '>', + '&quot;': '"', + '&#39;': "'", + '&apos;': "'", + '&nbsp;': ' ', +}; + +/** + * Decode a single HTML entity (named, decimal, or hex) to its character. + */ +function decodeEntity(entity: string): string { + // Named entity + const named = NAMED_ENTITIES[entity.toLowerCase()]; + if (named) return named; + + // Decimal numeric entity: &#NNN; + const decMatch = entity.match(/^&#(\d+);$/); + if (decMatch) { + const code = parseInt(decMatch[1], 10); + return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity; + } + + // Hex numeric entity: &#xHH; + const hexMatch = entity.match(/^&#x([0-9a-fA-F]+);$/); + if (hexMatch) { + const code = parseInt(hexMatch[1], 16); + return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity; + } + + return entity; +} + +/** + * Sanitize HTML content to plain text. + * + * - Converts `<br>` variants to newlines + * - Strips all remaining HTML tags + * - Decodes HTML entities (named, decimal, hex) + * - Collapses runs of spaces/tabs on each line (preserves newlines) + * + * @param text Raw HTML or HTML-entity-encoded text + * @returns Clean plain text + */ +export function sanitizeHtml(text: string): string { + if (!text) return ''; + + let result = text; + + // Convert <br> variants to newlines (before stripping tags) + result = result.replace(/<br\s*\/?>/gi, '\n'); + + // Strip all remaining HTML tags + result = result.replace(/<[^>]+>/g, ''); + + // Decode HTML entities + result = result.replace(/&(?:#x[0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity); + + // Collapse runs of horizontal whitespace (spaces/tabs) on each line, but preserve newlines + result = result.replace(/[^\S\n]+/g, ' '); + + // Trim each line and remove excessive blank lines (3+ consecutive → 2) + result = result + .split('\n') + .map(line => line.trim()) + .join('\n') + .replace(/\n{3,}/g, '\n\n'); + + return result.trim(); +}