fix(gmail): sanitize HTML entities and tags in tool output

The Gmail API returns snippets containing HTML entities (`&amp;`, `&#39;`, etc.) and tags such as `<br>`, which leaked into LLM responses as raw HTML. Added a shared sanitizeHtml() utility in src/utils/html.ts and applied it to the gmail tool snippets, the HTML body fallback, and the gmail watcher snippets.
2026-02-10 16:30:14 -08:00
parent 4317492e4b
commit 4ce8e81c01
6 changed files with 281 additions and 4 deletions
@@ -5,6 +5,7 @@ import { homedir } from 'os';
 import type { GmailConfig } from '../config/schema.js';
 import type { ChannelAdapter, ChannelStatus, InboundMessage, OutboundMessage } from '../channels/types.js';
 import { parseInterval } from './heartbeat.js';
+import { sanitizeHtml } from '../utils/html.js';

 /** Minimal interface for the parts of ChannelRegistry we need. */
 interface ChannelLookup {
@@ -368,7 +369,7 @@ export class GmailWatcher implements ChannelAdapter {
        from: getHeader('From'),
        to: getHeader('To'),
        subject: getHeader('Subject'),
-        snippet: msg.data.snippet ?? '',
+        snippet: sanitizeHtml(msg.data.snippet ?? ''),
        date: getHeader('Date'),
        labels: msg.data.labelIds ?? [],
      };
@@ -191,6 +191,36 @@ describe('gmail.list', () => {
    expect(result.success).toBe(true);
    expect(result.output).toBe('No messages found.');
  });
+
+  it('sanitizes HTML entities in snippets', async () => {
+    // Snippet mixing decimal entities, named entities and a <br> tag.
+    const rawSnippet =
+      'William, your score is rising&#39;s &amp; it&#8230; Don&apos;t miss out<br>Check now';
+    // Raw markup that must be gone from the tool output...
+    const rawFragments = ['&#39;', '&amp;', '&#8230;', '<br>'];
+    // ...and the decoded text that must be present instead.
+    const decodedFragments = ["rising's", '& it', "Don't miss out"];
+
+    setupValidAuth();
+    mockMessagesList.mockResolvedValue({ data: { messages: [{ id: 'msg1' }] } });
+    mockMessagesGet.mockResolvedValueOnce(
+      mockMessageDetails('msg1', 'experian@test.com', 'Credit Alert', 'Mon, 10 Feb 2026', rawSnippet),
+    );
+
+    const [listTool] = createGmailTools(testConfig);
+    const listing = await listTool.execute({});
+
+    expect(listing.success).toBe(true);
+    for (const fragment of rawFragments) {
+      expect(listing.output).not.toContain(fragment);
+    }
+    for (const fragment of decodedFragments) {
+      expect(listing.output).toContain(fragment);
+    }
+  });
 });

 describe('gmail.search', () => {
@@ -360,6 +390,38 @@ describe('gmail.read', () => {
    expect(result.output).not.toContain('<html>');
  });

+  it('decodes HTML entities in HTML-only body fallback', async () => {
+    // A multipart message whose only part is text/html, so the reader has no
+    // text/plain part to prefer and must fall back to the HTML body.
+    const htmlBody = '<html><body><p>Hello &amp; welcome</p><br><p>Price: &lt;$100&gt;</p><br><p>It&#39;s great</p></body></html>';
+    const headers = [
+      { name: 'From', value: 'sender@example.com' },
+      { name: 'To', value: 'will@example.com' },
+      { name: 'Subject', value: 'HTML Entities' },
+      { name: 'Date', value: 'Mon, 10 Feb 2026 12:00:00 -0000' },
+    ];
+    const parts = [{ mimeType: 'text/html', body: { data: toBase64Url(htmlBody) } }];
+
+    setupValidAuth();
+    mockMessagesGet.mockResolvedValue({
+      data: { payload: { mimeType: 'multipart/alternative', headers, parts } },
+    });
+
+    const [, , readTool] = createGmailTools(testConfig);
+    const reading = await readTool.execute({ id: 'msg-entities' });
+
+    expect(reading.success).toBe(true);
+    // Decoded text is present...
+    for (const decoded of ['Hello & welcome', 'Price: <$100>', "It's great"]) {
+      expect(reading.output).toContain(decoded);
+    }
+    // ...and none of the encoded forms remain.
+    for (const encoded of ['&amp;', '&lt;', '&#39;']) {
+      expect(reading.output).not.toContain(encoded);
+    }
+  });
+
  it('returns error when credentials missing', async () => {
    mockExistsSync.mockReturnValue(false);
    const [, , readTool] = createGmailTools(testConfig);
@@ -4,6 +4,7 @@ import { resolve } from 'path';
 import { homedir } from 'os';
 import type { GmailConfig } from '../../config/schema.js';
 import type { Tool, ToolResult } from '../types.js';
+import { sanitizeHtml } from '../../utils/html.js';

 /** Expand ~ to home directory. */
 function expandPath(p: string): string {
@@ -79,7 +80,7 @@ async function fetchMessageDetails(
      from: getHeader('From'),
      subject: getHeader('Subject'),
      date: getHeader('Date'),
-      snippet: msg.data.snippet ?? '',
+      snippet: sanitizeHtml(msg.data.snippet ?? ''),
    };
  } catch {
    return null;
@@ -123,8 +124,8 @@ function extractTextBody(payload: {
      }
    }
    if (htmlFallback) {
-      // Strip HTML tags for a rough plain-text rendering
-      return htmlFallback.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
+      // Convert HTML to clean plain text
+      return sanitizeHtml(htmlFallback);
    }
  }

@@ -0,0 +1,119 @@
+import { describe, it, expect } from 'vitest';
+import { sanitizeHtml } from './html.js';
+
+/**
+ * Unit tests for sanitizeHtml, grouped by concern: `<br>` conversion, tag
+ * stripping, named and numeric entity decoding, whitespace normalisation, and
+ * two Gmail-shaped inputs that combine all of the above.
+ */
+describe('sanitizeHtml', () => {
+  it('returns empty string for empty/falsy input', () => {
+    expect(sanitizeHtml('')).toBe('');
+    // The double cast simulates an untyped caller passing undefined at runtime.
+    expect(sanitizeHtml(undefined as unknown as string)).toBe('');
+  });
+
+  it('passes through clean text unchanged', () => {
+    expect(sanitizeHtml('Hello world')).toBe('Hello world');
+    expect(sanitizeHtml('No entities here')).toBe('No entities here');
+  });
+
+  // ── <br> conversion ────────────────────────────────────────────────────
+  // Every spelling of the tag (bare, self-closing, spaced, upper-case) must
+  // become exactly one newline.
+
+  it('converts <br> to newline', () => {
+    expect(sanitizeHtml('Hello<br>World')).toBe('Hello\nWorld');
+  });
+
+  it('converts <br/> to newline', () => {
+    expect(sanitizeHtml('Hello<br/>World')).toBe('Hello\nWorld');
+  });
+
+  it('converts <br /> to newline', () => {
+    expect(sanitizeHtml('Hello<br />World')).toBe('Hello\nWorld');
+  });
+
+  it('converts <BR> (uppercase) to newline', () => {
+    expect(sanitizeHtml('Hello<BR>World')).toBe('Hello\nWorld');
+  });
+
+  // ── Tag stripping ─────────────────────────────────────────────────────
+  // Tags other than <br> are removed without inserting any replacement text.
+
+  it('strips HTML tags', () => {
+    expect(sanitizeHtml('<p>Hello</p>')).toBe('Hello');
+    expect(sanitizeHtml('<b>bold</b> and <i>italic</i>')).toBe('bold and italic');
+  });
+
+  it('strips complex nested tags', () => {
+    expect(sanitizeHtml('<div class="foo"><span>text</span></div>')).toBe('text');
+  });
+
+  // ── Named entity decoding ────────────────────────────────────────────
+
+  it('decodes &amp; to &', () => {
+    expect(sanitizeHtml('Tom &amp; Jerry')).toBe('Tom & Jerry');
+  });
+
+  it('decodes &lt; and &gt;', () => {
+    expect(sanitizeHtml('a &lt; b &gt; c')).toBe('a < b > c');
+  });
+
+  it('decodes &quot; and &#39;', () => {
+    expect(sanitizeHtml('He said &quot;hello&quot; and it&#39;s fine')).toBe(
+      'He said "hello" and it\'s fine',
+    );
+  });
+
+  it('decodes &nbsp; to space', () => {
+    // The expected value is an ordinary space, not U+00A0.
+    expect(sanitizeHtml('hello&nbsp;world')).toBe('hello world');
+  });
+
+  it('decodes &apos;', () => {
+    expect(sanitizeHtml('it&apos;s')).toBe("it's");
+  });
+
+  // ── Numeric entity decoding ──────────────────────────────────────────
+
+  it('decodes decimal numeric entities (&#NNN;)', () => {
+    expect(sanitizeHtml('&#169;')).toBe('©'); // 169 = U+00A9 copyright sign
+    expect(sanitizeHtml('&#8364;')).toBe('€'); // 8364 = U+20AC euro sign
+  });
+
+  it('decodes hex numeric entities (&#xHH;)', () => {
+    expect(sanitizeHtml('&#x2F;')).toBe('/');
+    expect(sanitizeHtml('&#xA9;')).toBe('©');
+  });
+
+  // ── Whitespace handling ──────────────────────────────────────────────
+  // Runs of spaces collapse to one; newlines produced by <br> survive, but a
+  // run of three or more is capped at two (i.e. a single blank line).
+
+  it('collapses multiple spaces to single space', () => {
+    expect(sanitizeHtml('hello    world')).toBe('hello world');
+  });
+
+  it('preserves intentional newlines from <br> conversion', () => {
+    expect(sanitizeHtml('line1<br><br>line3')).toBe('line1\n\nline3');
+  });
+
+  it('collapses 3+ consecutive newlines to 2', () => {
+    expect(sanitizeHtml('a<br><br><br><br>b')).toBe('a\n\nb');
+  });
+
+  // ── Realistic Gmail scenarios ────────────────────────────────────────
+
+  it('handles a realistic Gmail snippet with HTML entities', () => {
+    // Entity-encoded plain text with no tags; \u2026 below is the decoded &#8230;.
+    const snippet = 'William, an exceptional credit rating is this many points away&#8230; Don&#39;t miss out &amp; check now';
+    expect(sanitizeHtml(snippet)).toBe(
+      "William, an exceptional credit rating is this many points away\u2026 Don't miss out & check now",
+    );
+  });
+
+  it('handles Gmail HTML body fallback with tags and entities', () => {
+    // Full HTML document: asserts on fragments rather than the exact output so
+    // the test does not pin the precise newline layout.
+    const html = '<html><body><p>Hello William,</p><br><p>Your balance is &lt;$500&gt;.</p><br><p>Thanks &amp; regards,<br>Bank</p></body></html>';
+    const result = sanitizeHtml(html);
+    expect(result).toContain('Hello William,');
+    expect(result).toContain('Your balance is <$500>.');
+    expect(result).toContain('Thanks & regards,');
+    expect(result).not.toContain('&amp;');
+    expect(result).not.toContain('&lt;');
+    expect(result).not.toContain('<p>');
+    expect(result).not.toContain('<html>');
+  });
+
+  it('handles double-encoded entities (does not double-decode)', () => {
+    // &amp;amp; should become &amp; (one level), not &
+    expect(sanitizeHtml('&amp;amp;')).toBe('&amp;');
+  });
+});
@@ -0,0 +1,79 @@
+/**
+ * HTML sanitization utilities for converting HTML content to clean plain text.
+ * Used primarily for Gmail API output where snippets and bodies contain
+ * HTML entities and tags that shouldn't leak into tool output.
+ */
+
+/**
+ * Named HTML character references understood by `decodeEntity`, keyed by the
+ * complete lower-cased reference text (leading `&` and trailing `;` included).
+ *
+ * Lookups are case-insensitive, so only names whose meaning does not depend on
+ * letter case are listed. Numeric references such as `&#39;` are not listed;
+ * they are decoded arithmetically. A `Map` is used instead of an object
+ * literal so that arbitrary input can never resolve to an inherited
+ * `Object.prototype` member.
+ */
+const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
+  ['&amp;', '&'],
+  ['&lt;', '<'],
+  ['&gt;', '>'],
+  ['&quot;', '"'],
+  ['&apos;', "'"],
+  ['&nbsp;', ' '], // deliberately a plain space rather than U+00A0
+  // Punctuation and symbols that commonly appear in e-mail bodies.
+  ['&hellip;', '\u2026'],
+  ['&mdash;', '\u2014'],
+  ['&ndash;', '\u2013'],
+  ['&lsquo;', '\u2018'],
+  ['&rsquo;', '\u2019'],
+  ['&ldquo;', '\u201c'],
+  ['&rdquo;', '\u201d'],
+  ['&laquo;', '\u00ab'],
+  ['&raquo;', '\u00bb'],
+  ['&bull;', '\u2022'],
+  ['&middot;', '\u00b7'],
+  ['&copy;', '\u00a9'],
+  ['&reg;', '\u00ae'],
+  ['&trade;', '\u2122'],
+  ['&deg;', '\u00b0'],
+  ['&euro;', '\u20ac'],
+  ['&pound;', '\u00a3'],
+  ['&cent;', '\u00a2'],
+  ['&yen;', '\u00a5'],
+]);
+
+/** Matches a whole numeric reference: group 1 = hex digits, group 2 = decimal digits. */
+const NUMERIC_ENTITY = /^&#(?:x([0-9a-fA-F]+)|(\d+));$/;
+
+/** Largest valid Unicode code point. */
+const MAX_CODE_POINT = 0x10ffff;
+
+/** U+FFFD, substituted for references that name a UTF-16 surrogate. */
+const REPLACEMENT_CHARACTER = '\uFFFD';
+
+/**
+ * Decode a single HTML entity (named, decimal, or hex) to its character.
+ *
+ * @param entity A complete character reference, including the leading `&` and
+ *   the trailing `;` (for example `&amp;`, `&#8230;` or `&#xA9;`).
+ * @returns The decoded character; U+FFFD when a numeric reference names a
+ *   surrogate code point (U+D800–U+DFFF); or `entity` unchanged when the name
+ *   is unknown, the text is not a character reference, or the number is zero
+ *   or above U+10FFFF.
+ */
+function decodeEntity(entity: string): string {
+  // Named entity (matched case-insensitively).
+  const named = NAMED_ENTITIES.get(entity.toLowerCase());
+  if (named !== undefined) return named;
+
+  // Numeric entity: &#NNN; or &#xHH;
+  const numeric = NUMERIC_ENTITY.exec(entity);
+  if (numeric === null) return entity;
+
+  const [, hexDigits, decimalDigits] = numeric;
+  const code =
+    hexDigits !== undefined ? parseInt(hexDigits, 16) : parseInt(decimalDigits ?? '', 10);
+
+  // Written as a negated range test so NaN and Infinity also fall through.
+  if (!(code > 0 && code <= MAX_CODE_POINT)) return entity;
+
+  // String.fromCodePoint would happily emit a lone surrogate here, which is
+  // not valid text and cannot be encoded as UTF-8.
+  if (code >= 0xd800 && code <= 0xdfff) return REPLACEMENT_CHARACTER;
+
+  return String.fromCodePoint(code);
+}
+
+/**
+ * Sanitize HTML content to plain text.
+ *
+ * Steps, in order:
+ * - Removes HTML comments and `<script>` / `<style>` elements together with
+ *   their contents, so CSS and JavaScript never reach the output
+ * - Converts `<br>` variants to newlines
+ * - Strips all remaining HTML tags (their text content is kept)
+ * - Decodes HTML entities (named, decimal, hex) in a single pass
+ * - Collapses runs of non-newline whitespace on each line to one space
+ * - Trims every line and caps consecutive newlines at two
+ *
+ * Tags are stripped before entities are decoded, so escaped markup such as
+ * `&lt;b&gt;` survives as the literal text `<b>`. Because decoding is a single
+ * pass, double-encoded input loses exactly one level (`&amp;amp;` → `&amp;`).
+ *
+ * @param text Raw HTML or HTML-entity-encoded text
+ * @returns Clean plain text; the empty string for empty or falsy input
+ */
+export function sanitizeHtml(text: string): string {
+  if (!text) return '';
+
+  const flattened = text
+    // Comments first: conditional comments can wrap whole blocks of markup
+    // that a per-tag strip would leave behind as text.
+    .replace(/<!--[\s\S]*?-->/g, '')
+    // Drop script/style elements including their bodies; stripping only the
+    // tags would leak the CSS/JS source into the output. The back-reference
+    // pairs each opener with a closer of the same name.
+    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
+    // Convert <br> variants to newlines (before stripping tags)
+    .replace(/<br\s*\/?>/gi, '\n')
+    // Strip all remaining HTML tags
+    .replace(/<[^>]+>/g, '')
+    // Decode HTML entities
+    .replace(/&(?:#x[0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, decodeEntity)
+    // Collapse runs of whitespace other than newlines into a single space
+    .replace(/[^\S\n]+/g, ' ');
+
+  // Trim each line and remove excessive blank lines (3+ consecutive → 2)
+  return flattened
+    .split('\n')
+    .map(line => line.trim())
+    .join('\n')
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+}