diff --git a/docs/plans/state.json b/docs/plans/state.json
index 132e343..953c453 100644
--- a/docs/plans/state.json
+++ b/docs/plans/state.json
@@ -947,6 +947,21 @@
"config/default.yaml"
]
},
+ "gmail-html-sanitization": {
+ "status": "completed",
+ "date": "2026-02-10",
+ "summary": "Sanitize HTML entities and tags in Gmail tool output. Gmail API returns snippets with HTML entities (&amp;, &#39;, <br>, etc.) that leaked into LLM responses. Added shared sanitizeHtml() utility and applied to both gmail tool and gmail watcher.",
+ "files_created": [
+ "src/utils/html.ts",
+ "src/utils/html.test.ts"
+ ],
+ "files_modified": [
+ "src/tools/builtin/gmail.ts",
+ "src/tools/builtin/gmail.test.ts",
+ "src/automation/gmail.ts"
+ ],
+ "test_status": "21/21 passing (html.test) + 18/18 passing (gmail.test) + 16/16 passing (automation/gmail.test)"
+ },
"tui-fullscreen-improvements": {
"status": "completed",
"date": "2026-02-10",
diff --git a/src/automation/gmail.ts b/src/automation/gmail.ts
index ee1377d..907ba4d 100644
--- a/src/automation/gmail.ts
+++ b/src/automation/gmail.ts
@@ -5,6 +5,7 @@ import { homedir } from 'os';
import type { GmailConfig } from '../config/schema.js';
import type { ChannelAdapter, ChannelStatus, InboundMessage, OutboundMessage } from '../channels/types.js';
import { parseInterval } from './heartbeat.js';
+import { sanitizeHtml } from '../utils/html.js';
/** Minimal interface for the parts of ChannelRegistry we need. */
interface ChannelLookup {
@@ -368,7 +369,7 @@ export class GmailWatcher implements ChannelAdapter {
from: getHeader('From'),
to: getHeader('To'),
subject: getHeader('Subject'),
- snippet: msg.data.snippet ?? '',
+ snippet: sanitizeHtml(msg.data.snippet ?? ''),
date: getHeader('Date'),
labels: msg.data.labelIds ?? [],
};
diff --git a/src/tools/builtin/gmail.test.ts b/src/tools/builtin/gmail.test.ts
index 92ece82..51046dc 100644
--- a/src/tools/builtin/gmail.test.ts
+++ b/src/tools/builtin/gmail.test.ts
@@ -191,6 +191,36 @@ describe('gmail.list', () => {
expect(result.success).toBe(true);
expect(result.output).toBe('No messages found.');
});
+
+  it('sanitizes HTML entities in snippets', async () => {
+    setupValidAuth();
+    mockMessagesList.mockResolvedValue({
+      data: {
+        messages: [{ id: 'msg1' }],
+      },
+    });
+    mockMessagesGet.mockResolvedValueOnce(
+      mockMessageDetails(
+        'msg1',
+        'experian@test.com',
+        'Credit Alert',
+        'Mon, 10 Feb 2026',
+        'William, your score is rising&#39;s &amp; it&hellip; Don&#39;t miss out<br>Check now', // raw, entity-encoded snippet
+      ),
+    );
+
+    const [listTool] = createGmailTools(testConfig);
+    const result = await listTool.execute({});
+
+    expect(result.success).toBe(true);
+    expect(result.output).not.toContain('&#39;'); // no encoded form may survive in the output
+    expect(result.output).not.toContain('&amp;');
+    expect(result.output).not.toContain('&hellip;');
+    expect(result.output).not.toContain('<br>');
+    expect(result.output).toContain("rising's"); // decoded text must be present
+    expect(result.output).toContain('& it');
+    expect(result.output).toContain("Don't miss out");
+  });
});
describe('gmail.search', () => {
@@ -360,6 +390,38 @@ describe('gmail.read', () => {
expect(result.output).not.toContain('');
});
+ it('decodes HTML entities in HTML-only body fallback', async () => {
+ setupValidAuth();
+ const htmlBody = '
Hello & welcome
Price: <$100>
It's great
'; + mockMessagesGet.mockResolvedValue({ + data: { + payload: { + mimeType: 'multipart/alternative', + headers: [ + { name: 'From', value: 'sender@example.com' }, + { name: 'To', value: 'will@example.com' }, + { name: 'Subject', value: 'HTML Entities' }, + { name: 'Date', value: 'Mon, 10 Feb 2026 12:00:00 -0000' }, + ], + parts: [ + { mimeType: 'text/html', body: { data: toBase64Url(htmlBody) } }, + ], + }, + }, + }); + + const [, , readTool] = createGmailTools(testConfig); + const result = await readTool.execute({ id: 'msg-entities' }); + + expect(result.success).toBe(true); + expect(result.output).toContain('Hello & welcome'); + expect(result.output).toContain('Price: <$100>'); + expect(result.output).toContain("It's great"); + expect(result.output).not.toContain('&'); + expect(result.output).not.toContain('<'); + expect(result.output).not.toContain('''); + }); + it('returns error when credentials missing', async () => { mockExistsSync.mockReturnValue(false); const [, , readTool] = createGmailTools(testConfig); diff --git a/src/tools/builtin/gmail.ts b/src/tools/builtin/gmail.ts index 44f4632..1ad9b69 100644 --- a/src/tools/builtin/gmail.ts +++ b/src/tools/builtin/gmail.ts @@ -4,6 +4,7 @@ import { resolve } from 'path'; import { homedir } from 'os'; import type { GmailConfig } from '../../config/schema.js'; import type { Tool, ToolResult } from '../types.js'; +import { sanitizeHtml } from '../../utils/html.js'; /** Expand ~ to home directory. */ function expandPath(p: string): string { @@ -79,7 +80,7 @@ async function fetchMessageDetails( from: getHeader('From'), subject: getHeader('Subject'), date: getHeader('Date'), - snippet: msg.data.snippet ?? '', + snippet: sanitizeHtml(msg.data.snippet ?? 
''), }; } catch { return null; @@ -123,8 +124,8 @@ function extractTextBody(payload: { } } if (htmlFallback) { - // Strip HTML tags for a rough plain-text rendering - return htmlFallback.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim(); + // Convert HTML to clean plain text + return sanitizeHtml(htmlFallback); } } diff --git a/src/utils/html.test.ts b/src/utils/html.test.ts new file mode 100644 index 0000000..c5a6753 --- /dev/null +++ b/src/utils/html.test.ts @@ -0,0 +1,119 @@ +import { describe, it, expect } from 'vitest'; +import { sanitizeHtml } from './html.js'; + +describe('sanitizeHtml', () => { + it('returns empty string for empty/falsy input', () => { + expect(sanitizeHtml('')).toBe(''); + expect(sanitizeHtml(undefined as unknown as string)).toBe(''); + }); + + it('passes through clean text unchanged', () => { + expect(sanitizeHtml('Hello world')).toBe('Hello world'); + expect(sanitizeHtml('No entities here')).toBe('No entities here'); + }); + + // ──Hello
')).toBe('Hello'); + expect(sanitizeHtml('bold and italic')).toBe('bold and italic'); + }); + + it('strips complex nested tags', () => { + expect(sanitizeHtml('Hello William,
Your balance is <$500>.
Thanks & regards,
Bank
');
+ expect(result).not.toContain('');
+ });
+
+  it('handles double-encoded entities (does not double-decode)', () => {
+    // &amp;amp; should become &amp; (one level of decoding), not a bare &
+    expect(sanitizeHtml('&amp;amp;')).toBe('&amp;');
+  });
+});
diff --git a/src/utils/html.ts b/src/utils/html.ts
new file mode 100644
index 0000000..f6687ae
--- /dev/null
+++ b/src/utils/html.ts
@@ -0,0 +1,79 @@
+/**
+ * HTML sanitization utilities for converting HTML content to clean plain text.
+ * Used primarily for Gmail API output where snippets and bodies contain
+ * HTML entities and tags that shouldn't leak into tool output.
+ */
+
+/** Named HTML entities to their character equivalents. */
+const NAMED_ENTITIES: Record
` variants to newlines
+ * - Strips all remaining HTML tags
+ * - Decodes HTML entities (named, decimal, hex)
+ * - Collapses runs of spaces/tabs on each line (preserves newlines)
+ *
+ * @param text Raw HTML or HTML-entity-encoded text
+ * @returns Clean plain text
+ */
+export function sanitizeHtml(text: string): string {
+  if (!text) return ''; // falsy guard: '' and any null/undefined slipping past the type
+
+  let result = text;
+
+  // Convert <br> variants (<br>, <br/>, <br />) to newlines (before stripping tags)
+  result = result.replace(/<br\s*\/?>/gi, '\n');
+
+  // Strip all remaining HTML tags
+  result = result.replace(/<[^>]+>/g, '');
+
+  // Decode HTML entities in a single pass; runs after tag stripping so a decoded '<' or '>' stays literal text
+  result = result.replace(/&(?:#x[0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity);
+
+  // Collapse runs of horizontal whitespace (spaces/tabs) on each line, but preserve newlines
+  result = result.replace(/[^\S\n]+/g, ' ');
+
+  // Trim each line, then collapse 3+ consecutive newlines down to 2 (at most one blank line)
+  result = result
+    .split('\n')
+    .map(line => line.trim())
+    .join('\n')
+    .replace(/\n{3,}/g, '\n\n');
+
+  return result.trim();
+}