fix(gmail): sanitize HTML entities and tags in tool output
The Gmail API returns snippets containing HTML entities and tags (&amp;, &#39;, <br>, etc.), which leaked into LLM responses as raw HTML. Added a shared sanitizeHtml() utility in src/utils/html.ts and applied it to the gmail tool snippets, the HTML body fallback, and the gmail watcher snippets.
This commit is contained in:
@@ -947,6 +947,21 @@
|
|||||||
"config/default.yaml"
|
"config/default.yaml"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
"gmail-html-sanitization": {
|
||||||
|
"status": "completed",
|
||||||
|
"date": "2026-02-10",
|
||||||
|
"summary": "Sanitize HTML entities and tags in Gmail tool output. The Gmail API returns snippets containing HTML entities and tags (&amp;, &#39;, <br>, etc.) that leaked into LLM responses. Added a shared sanitizeHtml() utility and applied it to both the gmail tool and the gmail watcher.",
|
||||||
|
"files_created": [
|
||||||
|
"src/utils/html.ts",
|
||||||
|
"src/utils/html.test.ts"
|
||||||
|
],
|
||||||
|
"files_modified": [
|
||||||
|
"src/tools/builtin/gmail.ts",
|
||||||
|
"src/tools/builtin/gmail.test.ts",
|
||||||
|
"src/automation/gmail.ts"
|
||||||
|
],
|
||||||
|
"test_status": "21/21 passing (html.test) + 18/18 passing (gmail.test) + 16/16 passing (automation/gmail.test)"
|
||||||
|
},
|
||||||
"tui-fullscreen-improvements": {
|
"tui-fullscreen-improvements": {
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
"date": "2026-02-10",
|
"date": "2026-02-10",
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import { homedir } from 'os';
|
|||||||
import type { GmailConfig } from '../config/schema.js';
|
import type { GmailConfig } from '../config/schema.js';
|
||||||
import type { ChannelAdapter, ChannelStatus, InboundMessage, OutboundMessage } from '../channels/types.js';
|
import type { ChannelAdapter, ChannelStatus, InboundMessage, OutboundMessage } from '../channels/types.js';
|
||||||
import { parseInterval } from './heartbeat.js';
|
import { parseInterval } from './heartbeat.js';
|
||||||
|
import { sanitizeHtml } from '../utils/html.js';
|
||||||
|
|
||||||
/** Minimal interface for the parts of ChannelRegistry we need. */
|
/** Minimal interface for the parts of ChannelRegistry we need. */
|
||||||
interface ChannelLookup {
|
interface ChannelLookup {
|
||||||
@@ -368,7 +369,7 @@ export class GmailWatcher implements ChannelAdapter {
|
|||||||
from: getHeader('From'),
|
from: getHeader('From'),
|
||||||
to: getHeader('To'),
|
to: getHeader('To'),
|
||||||
subject: getHeader('Subject'),
|
subject: getHeader('Subject'),
|
||||||
snippet: msg.data.snippet ?? '',
|
snippet: sanitizeHtml(msg.data.snippet ?? ''),
|
||||||
date: getHeader('Date'),
|
date: getHeader('Date'),
|
||||||
labels: msg.data.labelIds ?? [],
|
labels: msg.data.labelIds ?? [],
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -191,6 +191,36 @@ describe('gmail.list', () => {
|
|||||||
expect(result.success).toBe(true);
|
expect(result.success).toBe(true);
|
||||||
expect(result.output).toBe('No messages found.');
|
expect(result.output).toBe('No messages found.');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Snippets returned by the mocked Gmail API carry encoded entities and a <br>;
// the list tool's output must contain only the decoded plain text.
it('sanitizes HTML entities in snippets', async () => {
  setupValidAuth();
  mockMessagesList.mockResolvedValue({
    data: {
      messages: [{ id: 'msg1' }],
    },
  });
  mockMessagesGet.mockResolvedValueOnce(
    mockMessageDetails(
      'msg1',
      'experian@test.com',
      'Credit Alert',
      'Mon, 10 Feb 2026',
      // Numeric entities (&#39;, &#8230;) plus a named one (&amp;) and a tag.
      'William, your score is rising&#39;s &amp; it&#8230; Don&#39;t miss out<br>Check now',
    ),
  );

  const [listTool] = createGmailTools(testConfig);
  const result = await listTool.execute({});

  expect(result.success).toBe(true);
  // No encoded form may survive...
  expect(result.output).not.toContain('&#39;');
  expect(result.output).not.toContain('&amp;');
  expect(result.output).not.toContain('&#8230;');
  expect(result.output).not.toContain('<br>');
  // ...and the decoded text must be present.
  expect(result.output).toContain("rising's");
  expect(result.output).toContain('& it');
  expect(result.output).toContain("Don't miss out");
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('gmail.search', () => {
|
describe('gmail.search', () => {
|
||||||
@@ -360,6 +390,38 @@ describe('gmail.read', () => {
|
|||||||
expect(result.output).not.toContain('<html>');
|
expect(result.output).not.toContain('<html>');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// When a message has only a text/html part, the read tool falls back to the
// HTML body; tags must be stripped and entities decoded in that fallback.
it('decodes HTML entities in HTML-only body fallback', async () => {
  setupValidAuth();
  const htmlBody =
    '<html><body><p>Hello &amp; welcome</p><br><p>Price: &lt;$100&gt;</p><br><p>It&#39;s great</p></body></html>';
  mockMessagesGet.mockResolvedValue({
    data: {
      payload: {
        mimeType: 'multipart/alternative',
        headers: [
          { name: 'From', value: 'sender@example.com' },
          { name: 'To', value: 'will@example.com' },
          { name: 'Subject', value: 'HTML Entities' },
          { name: 'Date', value: 'Mon, 10 Feb 2026 12:00:00 -0000' },
        ],
        parts: [
          { mimeType: 'text/html', body: { data: toBase64Url(htmlBody) } },
        ],
      },
    },
  });

  const [, , readTool] = createGmailTools(testConfig);
  const result = await readTool.execute({ id: 'msg-entities' });

  expect(result.success).toBe(true);
  // Decoded text is present (the encoded <$100> survives tag stripping
  // because entities are decoded after tags are removed).
  expect(result.output).toContain('Hello & welcome');
  expect(result.output).toContain('Price: <$100>');
  expect(result.output).toContain("It's great");
  // Encoded forms are gone.
  expect(result.output).not.toContain('&amp;');
  expect(result.output).not.toContain('&lt;');
  expect(result.output).not.toContain('&#39;');
});
|
||||||
|
|
||||||
it('returns error when credentials missing', async () => {
|
it('returns error when credentials missing', async () => {
|
||||||
mockExistsSync.mockReturnValue(false);
|
mockExistsSync.mockReturnValue(false);
|
||||||
const [, , readTool] = createGmailTools(testConfig);
|
const [, , readTool] = createGmailTools(testConfig);
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import { resolve } from 'path';
|
|||||||
import { homedir } from 'os';
|
import { homedir } from 'os';
|
||||||
import type { GmailConfig } from '../../config/schema.js';
|
import type { GmailConfig } from '../../config/schema.js';
|
||||||
import type { Tool, ToolResult } from '../types.js';
|
import type { Tool, ToolResult } from '../types.js';
|
||||||
|
import { sanitizeHtml } from '../../utils/html.js';
|
||||||
|
|
||||||
/** Expand ~ to home directory. */
|
/** Expand ~ to home directory. */
|
||||||
function expandPath(p: string): string {
|
function expandPath(p: string): string {
|
||||||
@@ -79,7 +80,7 @@ async function fetchMessageDetails(
|
|||||||
from: getHeader('From'),
|
from: getHeader('From'),
|
||||||
subject: getHeader('Subject'),
|
subject: getHeader('Subject'),
|
||||||
date: getHeader('Date'),
|
date: getHeader('Date'),
|
||||||
snippet: msg.data.snippet ?? '',
|
snippet: sanitizeHtml(msg.data.snippet ?? ''),
|
||||||
};
|
};
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return null;
|
||||||
@@ -123,8 +124,8 @@ function extractTextBody(payload: {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (htmlFallback) {
|
if (htmlFallback) {
|
||||||
// Strip HTML tags for a rough plain-text rendering
|
// Convert HTML to clean plain text
|
||||||
return htmlFallback.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
|
return sanitizeHtml(htmlFallback);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { sanitizeHtml } from './html.js';
|
||||||
|
|
||||||
|
// Unit tests for sanitizeHtml(): <br> handling, tag stripping, named and
// numeric entity decoding, whitespace normalisation, and realistic Gmail input.
describe('sanitizeHtml', () => {
  it('returns empty string for empty/falsy input', () => {
    expect(sanitizeHtml('')).toBe('');
    expect(sanitizeHtml(undefined as unknown as string)).toBe('');
  });

  it('passes through clean text unchanged', () => {
    expect(sanitizeHtml('Hello world')).toBe('Hello world');
    expect(sanitizeHtml('No entities here')).toBe('No entities here');
  });

  // ── <br> conversion ────────────────────────────────────────────────────

  it('converts <br> to newline', () => {
    expect(sanitizeHtml('Hello<br>World')).toBe('Hello\nWorld');
  });

  it('converts <br/> to newline', () => {
    expect(sanitizeHtml('Hello<br/>World')).toBe('Hello\nWorld');
  });

  it('converts <br /> to newline', () => {
    expect(sanitizeHtml('Hello<br />World')).toBe('Hello\nWorld');
  });

  it('converts <BR> (uppercase) to newline', () => {
    expect(sanitizeHtml('Hello<BR>World')).toBe('Hello\nWorld');
  });

  // ── Tag stripping ─────────────────────────────────────────────────────

  it('strips HTML tags', () => {
    expect(sanitizeHtml('<p>Hello</p>')).toBe('Hello');
    expect(sanitizeHtml('<b>bold</b> and <i>italic</i>')).toBe('bold and italic');
  });

  it('strips complex nested tags', () => {
    expect(sanitizeHtml('<div class="foo"><span>text</span></div>')).toBe('text');
  });

  // ── Named entity decoding ────────────────────────────────────────────

  it('decodes &amp; to &', () => {
    expect(sanitizeHtml('Tom &amp; Jerry')).toBe('Tom & Jerry');
  });

  it('decodes &lt; and &gt;', () => {
    expect(sanitizeHtml('a &lt; b &gt; c')).toBe('a < b > c');
  });

  it('decodes &quot; and &#39;', () => {
    expect(sanitizeHtml('He said &quot;hello&quot; and it&#39;s fine')).toBe(
      'He said "hello" and it\'s fine',
    );
  });

  it('decodes &nbsp; to space', () => {
    expect(sanitizeHtml('hello&nbsp;world')).toBe('hello world');
  });

  it('decodes &apos;', () => {
    expect(sanitizeHtml('it&apos;s')).toBe("it's");
  });

  // ── Numeric entity decoding ──────────────────────────────────────────

  it('decodes decimal numeric entities (&#NNN;)', () => {
    expect(sanitizeHtml('&#169;')).toBe('©'); // ©
    expect(sanitizeHtml('&#8364;')).toBe('€'); // €
  });

  it('decodes hex numeric entities (&#xHH;)', () => {
    expect(sanitizeHtml('&#x2F;')).toBe('/');
    expect(sanitizeHtml('&#xA9;')).toBe('©');
  });

  // ── Whitespace handling ──────────────────────────────────────────────

  it('collapses multiple spaces to single space', () => {
    expect(sanitizeHtml('hello    world')).toBe('hello world');
  });

  it('preserves intentional newlines from <br> conversion', () => {
    expect(sanitizeHtml('line1<br><br>line3')).toBe('line1\n\nline3');
  });

  it('collapses 3+ consecutive newlines to 2', () => {
    expect(sanitizeHtml('a<br><br><br><br>b')).toBe('a\n\nb');
  });

  // ── Realistic Gmail scenarios ────────────────────────────────────────

  it('handles a realistic Gmail snippet with HTML entities', () => {
    const snippet =
      'William, an exceptional credit rating is this many points away&#8230; Don&#39;t miss out &amp; check now';
    expect(sanitizeHtml(snippet)).toBe(
      "William, an exceptional credit rating is this many points away\u2026 Don't miss out & check now",
    );
  });

  it('handles Gmail HTML body fallback with tags and entities', () => {
    const html =
      '<html><body><p>Hello William,</p><br><p>Your balance is &lt;$500&gt;.</p><br><p>Thanks &amp; regards,<br>Bank</p></body></html>';
    const result = sanitizeHtml(html);
    expect(result).toContain('Hello William,');
    expect(result).toContain('Your balance is <$500>.');
    expect(result).toContain('Thanks & regards,');
    expect(result).not.toContain('&amp;');
    expect(result).not.toContain('&lt;');
    expect(result).not.toContain('<p>');
    expect(result).not.toContain('<html>');
  });

  it('handles double-encoded entities (does not double-decode)', () => {
    // &amp;amp; should become &amp; (one level), not &
    expect(sanitizeHtml('&amp;amp;')).toBe('&amp;');
  });
});
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
/**
 * HTML sanitization utilities for converting HTML content to clean plain text.
 * Used primarily for Gmail API output where snippets and bodies contain
 * HTML entities and tags that shouldn't leak into tool output.
 */

/**
 * Named HTML entities (including the leading `&` and trailing `;`) mapped to
 * their character equivalents. Keys are lowercase because lookups lowercase
 * the entity first. Only entities whose meaning does not depend on letter
 * case are listed here.
 */
const NAMED_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
  // Typographic entities that are common in HTML email bodies.
  '&hellip;': '\u2026',
  '&mdash;': '\u2014',
  '&ndash;': '\u2013',
  '&lsquo;': '\u2018',
  '&rsquo;': '\u2019',
  '&ldquo;': '\u201c',
  '&rdquo;': '\u201d',
  '&bull;': '\u2022',
  '&copy;': '\u00a9',
  '&reg;': '\u00ae',
  '&trade;': '\u2122',
};

/** Largest valid Unicode code point. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Convert a numeric character reference value to a string.
 *
 * Returns `fallback` when the value is not a usable code point: NaN, zero,
 * beyond U+10FFFF, or a UTF-16 surrogate (which would yield an ill-formed
 * string).
 */
function codePointToString(code: number, fallback: string): string {
  const inRange = code > 0 && code <= MAX_CODE_POINT;
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  return inRange && !isSurrogate ? String.fromCodePoint(code) : fallback;
}

/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * @param entity The complete entity text, e.g. `&amp;`, `&#169;`, `&#xA9;`
 * @returns The decoded character, or `entity` unchanged when it is not
 *          recognised or does not denote a valid code point
 */
function decodeEntity(entity: string): string {
  // Named entity (case-insensitive lookup).
  const named = NAMED_ENTITIES[entity.toLowerCase()];
  if (named !== undefined) return named;

  // Numeric entity: &#NNN; (decimal) or &#xHH; / &#XHH; (hex).
  const numeric = /^&#(?:x([0-9a-f]+)|(\d+));$/i.exec(entity);
  if (numeric === null) return entity;

  const [, hexDigits, decDigits] = numeric;
  const code =
    hexDigits !== undefined ? parseInt(hexDigits, 16) : parseInt(decDigits ?? '', 10);
  return codePointToString(code, entity);
}

/**
 * Sanitize HTML content to plain text.
 *
 * - Removes HTML comments and `<script>` / `<style>` elements with their contents
 * - Converts `<br>` variants to newlines
 * - Strips all remaining HTML tags
 * - Decodes HTML entities (named, decimal, hex) exactly once
 * - Collapses runs of spaces/tabs on each line (preserves newlines)
 * - Trims each line and limits consecutive newlines to two
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string for empty/falsy input
 */
export function sanitizeHtml(text: string): string {
  if (!text) return '';

  const flattened = text
    // Drop non-rendered content first so CSS/JS source never reaches the output.
    .replace(/<!--[\s\S]*?-->/g, '')
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
    // Convert <br> variants to newlines (before stripping tags).
    .replace(/<br\s*\/?>/gi, '\n')
    // Strip all remaining HTML tags.
    .replace(/<[^>]+>/g, '')
    // Decode entities after tag stripping, so an encoded "<...>" in the text
    // survives as literal characters instead of being removed as a tag.
    .replace(/&(?:#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity)
    // Collapse horizontal whitespace (anything whitespace except \n).
    .replace(/[^\S\n]+/g, ' ');

  // Trim each line and remove excessive blank lines (3+ consecutive → 2).
  return flattened
    .split('\n')
    .map((line) => line.trim())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
||||||
Reference in New Issue
Block a user