fix(gmail): sanitize HTML entities and tags in tool output

Gmail API returns snippets containing HTML entities and tags (&amp;, &#39;, <br>, etc.) that leaked into LLM responses as raw HTML. Added a shared sanitizeHtml() utility in src/utils/html.ts and applied it to gmail tool snippets, the HTML body fallback, and gmail watcher snippets.
2026-02-10 16:30:14 -08:00
parent 4317492e4b
commit 4ce8e81c01
6 changed files with 281 additions and 4 deletions
@@ -0,0 +1,79 @@
+/**
+ * HTML sanitization utilities for converting HTML content to clean plain text.
+ * Used primarily for Gmail API output where snippets and bodies contain
+ * HTML entities and tags that shouldn't leak into tool output.
+ */
+
/**
 * Named HTML entities mapped to their plain-text equivalents.
 *
 * Keys are the complete entity text (leading `&`, trailing `;`) in lower case,
 * because the lookup in `decodeEntity` lower-cases the entity first. For the
 * same reason only entities whose meaning does not depend on letter case are
 * listed here. Values must be non-empty strings.
 *
 * Space-like entities are mapped to a regular space so that they are folded
 * by the whitespace collapsing step of `sanitizeHtml`.
 */
const NAMED_ENTITIES: Record<string, string> = {
  // Core markup-significant characters
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",

  // Space-like entities
  '&nbsp;': ' ',
  '&ensp;': ' ',
  '&emsp;': ' ',
  '&thinsp;': ' ',

  // Typographic punctuation that is common in e-mail bodies
  '&ndash;': '\u2013',
  '&mdash;': '\u2014',
  '&hellip;': '\u2026',
  '&lsquo;': '\u2018',
  '&rsquo;': '\u2019',
  '&ldquo;': '\u201C',
  '&rdquo;': '\u201D',
  '&laquo;': '\u00AB',
  '&raquo;': '\u00BB',
  '&bull;': '\u2022',
  '&middot;': '\u00B7',

  // Symbols
  '&copy;': '\u00A9',
  '&reg;': '\u00AE',
  '&trade;': '\u2122',
  '&euro;': '\u20AC',
  '&pound;': '\u00A3',
};
+
/**
 * Decode a single HTML entity (named, decimal, or hex) to its character.
 *
 * Numeric references are resolved first, then the named-entity table is
 * consulted with a case-insensitive lookup.
 *
 * - Code points in the surrogate range (U+D800–U+DFFF) decode to U+FFFD, as
 *   in the HTML parsing rules, so the result never contains a lone surrogate.
 * - A numeric value of zero or above U+10FFFF is not decoded.
 *
 * @param entity Complete entity text, including the leading `&` and trailing `;`
 * @returns The decoded text, or `entity` unchanged when it cannot be decoded
 */
function decodeEntity(entity: string): string {
  // Numeric reference: &#xHH; / &#XHH; (hex) or &#NNN; (decimal)
  const numeric = /^&#(?:[xX]([0-9a-fA-F]+)|(\d+));$/.exec(entity);
  if (numeric) {
    const hexDigits = numeric[1];
    const code = hexDigits !== undefined ? parseInt(hexDigits, 16) : parseInt(numeric[2], 10);

    // Zero and out-of-range values are left as written.
    if (!(code > 0 && code <= 0x10ffff)) return entity;

    // A surrogate code point is not a character on its own.
    if (code >= 0xd800 && code <= 0xdfff) return '\uFFFD';

    return String.fromCodePoint(code);
  }

  // Named entity; unknown names fall through unchanged.
  return NAMED_ENTITIES[entity.toLowerCase()] ?? entity;
}
+
/**
 * Sanitize HTML content to plain text.
 *
 * Steps, in order:
 * - Removes HTML comments and `<script>` / `<style>` elements together with
 *   their contents, so code and CSS never show up as text
 * - Converts `<br>` (with or without attributes) and `<hr>` to newlines
 * - Ends the line after block-level closing tags (`</p>`, `</div>`, `</li>`, …)
 *   so adjacent blocks are not glued into one word
 * - Strips all remaining HTML tags
 * - Decodes HTML entities (named, decimal, hex) in a single pass, after tag
 *   stripping, so an encoded `&lt;b&gt;` stays visible as literal text
 * - Collapses runs of spaces/tabs on each line (preserves newlines)
 * - Trims every line and limits consecutive blank lines to one
 *
 * @param text Raw HTML or HTML-entity-encoded text
 * @returns Clean plain text; an empty string for empty input
 */
export function sanitizeHtml(text: string): string {
  if (!text) return '';

  const plain = text
    // Comments first: their body may contain `>` and conditional markup.
    .replace(/<!--[\s\S]*?-->/g, '')
    // Non-content elements are dropped together with everything inside them.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
    // Line breaks, including attribute-carrying forms such as <br clear="all">.
    .replace(/<(?:br|hr)\b[^>]*>/gi, '\n')
    // The end of a block-level element ends the current line.
    .replace(/<\/(?:p|div|li|tr|h[1-6]|blockquote|pre|table|ul|ol)\s*>/gi, '\n')
    // Strip all remaining HTML tags.
    .replace(/<[^>]+>/g, '')
    // Decode entities; the wrapper keeps replace()'s extra arguments away from the decoder.
    .replace(/&(?:#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity) => decodeEntity(entity));

  return (
    plain
      // Collapse runs of horizontal whitespace (anything but \n) to one space.
      .replace(/[^\S\n]+/g, ' ')
      .split('\n')
      .map((line) => line.trim())
      .join('\n')
      // 3+ consecutive newlines → 2 (at most one blank line in a row).
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  );
}